1 /*
2  * Copyright © 2018, VideoLAN and dav1d authors
3  * Copyright © 2018, Two Orioles, LLC
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  *    list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright notice,
13  *    this list of conditions and the following disclaimer in the documentation
14  *    and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include "config.h"
29 
30 #include <stdlib.h>
31 #include <string.h>
32 
33 #include "common/attributes.h"
34 #include "common/intops.h"
35 
36 #include "src/mc.h"
37 #include "src/tables.h"
38 
/*
 * Compile-time precision settings, selected by BITDEPTH:
 *  - get_intermediate_bits(bitdepth_max): number of bits by which values are
 *    scaled up in the 16-bit intermediate representation.
 *  - PREP_BIAS: constant subtracted when storing int16_t intermediates and
 *    added back (scaled by the sum of weights) when they are combined.
 */
#if BITDEPTH == 8
#define get_intermediate_bits(bitdepth_max) 4
// Output in interval [-5132, 9212], fits in int16_t as is
#define PREP_BIAS 0
#else
// 4 for 10 bits/component, 2 for 12 bits/component
#define get_intermediate_bits(bitdepth_max) (14 - bitdepth_from_max(bitdepth_max))
// Output in interval [-20588, 36956] (10-bit), [-20602, 36983] (12-bit)
// Subtract a bias to ensure the output fits in int16_t
#define PREP_BIAS 8192
#endif
50 
51 static NOINLINE void
put_c(pixel * dst,const ptrdiff_t dst_stride,const pixel * src,const ptrdiff_t src_stride,const int w,int h)52 put_c(pixel *dst, const ptrdiff_t dst_stride,
53       const pixel *src, const ptrdiff_t src_stride, const int w, int h)
54 {
55     do {
56         pixel_copy(dst, src, w);
57 
58         dst += dst_stride;
59         src += src_stride;
60     } while (--h);
61 }
62 
63 static NOINLINE void
prep_c(int16_t * tmp,const pixel * src,const ptrdiff_t src_stride,const int w,int h HIGHBD_DECL_SUFFIX)64 prep_c(int16_t *tmp, const pixel *src, const ptrdiff_t src_stride,
65        const int w, int h HIGHBD_DECL_SUFFIX)
66 {
67     const int intermediate_bits = get_intermediate_bits(bitdepth_max);
68     do {
69         for (int x = 0; x < w; x++)
70             tmp[x] = (src[x] << intermediate_bits) - PREP_BIAS;
71 
72         tmp += w;
73         src += src_stride;
74     } while (--h);
75 }
76 
/*
 * 8-tap FIR sum centred between taps 3 and 4: reads src[x - 3*stride] ..
 * src[x + 4*stride] and weights them with F[0..7]. No rounding or shift.
 * All arguments are parenthesized so that expression arguments expand safely.
 */
#define FILTER_8TAP(src, x, F, stride) \
    ((F)[0] * (src)[(x) + -3 * (stride)] + \
     (F)[1] * (src)[(x) + -2 * (stride)] + \
     (F)[2] * (src)[(x) + -1 * (stride)] + \
     (F)[3] * (src)[(x) + +0 * (stride)] + \
     (F)[4] * (src)[(x) + +1 * (stride)] + \
     (F)[5] * (src)[(x) + +2 * (stride)] + \
     (F)[6] * (src)[(x) + +3 * (stride)] + \
     (F)[7] * (src)[(x) + +4 * (stride)])

/* FIR sum, rounded to nearest and shifted right by sh. */
#define DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh) \
    ((FILTER_8TAP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh))

/* FIR sum with a caller-supplied rounding constant, shifted right by sh. */
#define DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh) \
    ((FILTER_8TAP(src, x, F, stride) + (rnd)) >> (sh))

/* Same as the _RND variants, with the result passed through iclip_pixel(). */
#define DAV1D_FILTER_8TAP_CLIP(src, x, F, stride, sh) \
    iclip_pixel(DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh))

#define DAV1D_FILTER_8TAP_CLIP2(src, x, F, stride, rnd, sh) \
    iclip_pixel(DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh))

/*
 * Declare `fh`, the horizontal filter for phase mx, in the current scope.
 * Expects `w` and `filter_type` to be visible at the expansion site.
 * Phase 0 yields NULL (no filtering); otherwise row (mx - 1) is taken from
 * table (filter_type & 3) when w > 4, or from table 3 + (filter_type & 1)
 * when w <= 4.
 */
#define GET_H_FILTER(mx) \
    const int8_t *const fh = !(mx) ? NULL : w > 4 ? \
        dav1d_mc_subpel_filters[filter_type & 3][(mx) - 1] : \
        dav1d_mc_subpel_filters[3 + (filter_type & 1)][(mx) - 1]

/*
 * Declare `fv`, the vertical filter for phase my, in the current scope.
 * Expects `h` and `filter_type` to be visible at the expansion site.
 * Same selection rule as GET_H_FILTER, using bits 2 and up of filter_type
 * and the block height.
 */
#define GET_V_FILTER(my) \
    const int8_t *const fv = !(my) ? NULL : h > 4 ? \
        dav1d_mc_subpel_filters[filter_type >> 2][(my) - 1] : \
        dav1d_mc_subpel_filters[3 + ((filter_type >> 2) & 1)][(my) - 1]

/* Declare both `fh` and `fv` from the `mx` / `my` variables in scope. */
#define GET_FILTERS() \
    GET_H_FILTER(mx); \
    GET_V_FILTER(my)
112 
113 static NOINLINE void
put_8tap_c(pixel * dst,ptrdiff_t dst_stride,const pixel * src,ptrdiff_t src_stride,const int w,int h,const int mx,const int my,const int filter_type HIGHBD_DECL_SUFFIX)114 put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
115            const pixel *src, ptrdiff_t src_stride,
116            const int w, int h, const int mx, const int my,
117            const int filter_type HIGHBD_DECL_SUFFIX)
118 {
119     const int intermediate_bits = get_intermediate_bits(bitdepth_max);
120     const int intermediate_rnd = 32 + ((1 << (6 - intermediate_bits)) >> 1);
121 
122     GET_FILTERS();
123     dst_stride = PXSTRIDE(dst_stride);
124     src_stride = PXSTRIDE(src_stride);
125 
126     if (fh) {
127         if (fv) {
128             int tmp_h = h + 7;
129             int16_t mid[128 * 135], *mid_ptr = mid;
130 
131             src -= src_stride * 3;
132             do {
133                 for (int x = 0; x < w; x++)
134                     mid_ptr[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
135                                                        6 - intermediate_bits);
136 
137                 mid_ptr += 128;
138                 src += src_stride;
139             } while (--tmp_h);
140 
141             mid_ptr = mid + 128 * 3;
142             do {
143                 for (int x = 0; x < w; x++)
144                     dst[x] = DAV1D_FILTER_8TAP_CLIP(mid_ptr, x, fv, 128,
145                                                     6 + intermediate_bits);
146 
147                 mid_ptr += 128;
148                 dst += dst_stride;
149             } while (--h);
150         } else {
151             do {
152                 for (int x = 0; x < w; x++) {
153                     dst[x] = DAV1D_FILTER_8TAP_CLIP2(src, x, fh, 1,
154                                                      intermediate_rnd, 6);
155                 }
156 
157                 dst += dst_stride;
158                 src += src_stride;
159             } while (--h);
160         }
161     } else if (fv) {
162         do {
163             for (int x = 0; x < w; x++)
164                 dst[x] = DAV1D_FILTER_8TAP_CLIP(src, x, fv, src_stride, 6);
165 
166             dst += dst_stride;
167             src += src_stride;
168         } while (--h);
169     } else
170         put_c(dst, dst_stride, src, src_stride, w, h);
171 }
172 
173 static NOINLINE void
put_8tap_scaled_c(pixel * dst,const ptrdiff_t dst_stride,const pixel * src,ptrdiff_t src_stride,const int w,int h,const int mx,int my,const int dx,const int dy,const int filter_type HIGHBD_DECL_SUFFIX)174 put_8tap_scaled_c(pixel *dst, const ptrdiff_t dst_stride,
175                   const pixel *src, ptrdiff_t src_stride,
176                   const int w, int h, const int mx, int my,
177                   const int dx, const int dy, const int filter_type
178                   HIGHBD_DECL_SUFFIX)
179 {
180     const int intermediate_bits = get_intermediate_bits(bitdepth_max);
181     const int intermediate_rnd = (1 << intermediate_bits) >> 1;
182     int tmp_h = (((h - 1) * dy + my) >> 10) + 8;
183     int16_t mid[128 * (256 + 7)], *mid_ptr = mid;
184     src_stride = PXSTRIDE(src_stride);
185 
186     src -= src_stride * 3;
187     do {
188         int x;
189         int imx = mx, ioff = 0;
190 
191         for (x = 0; x < w; x++) {
192             GET_H_FILTER(imx >> 6);
193             mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1,
194                                                     6 - intermediate_bits) :
195                               src[ioff] << intermediate_bits;
196             imx += dx;
197             ioff += imx >> 10;
198             imx &= 0x3ff;
199         }
200 
201         mid_ptr += 128;
202         src += src_stride;
203     } while (--tmp_h);
204 
205     mid_ptr = mid + 128 * 3;
206     for (int y = 0; y < h; y++) {
207         int x;
208         GET_V_FILTER(my >> 6);
209 
210         for (x = 0; x < w; x++)
211             dst[x] = fv ? DAV1D_FILTER_8TAP_CLIP(mid_ptr, x, fv, 128,
212                                                  6 + intermediate_bits) :
213                           iclip_pixel((mid_ptr[x] + intermediate_rnd) >>
214                                               intermediate_bits);
215 
216         my += dy;
217         mid_ptr += (my >> 10) * 128;
218         my &= 0x3ff;
219         dst += PXSTRIDE(dst_stride);
220     }
221 }
222 
223 static NOINLINE void
prep_8tap_c(int16_t * tmp,const pixel * src,ptrdiff_t src_stride,const int w,int h,const int mx,const int my,const int filter_type HIGHBD_DECL_SUFFIX)224 prep_8tap_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
225             const int w, int h, const int mx, const int my,
226             const int filter_type HIGHBD_DECL_SUFFIX)
227 {
228     const int intermediate_bits = get_intermediate_bits(bitdepth_max);
229     GET_FILTERS();
230     src_stride = PXSTRIDE(src_stride);
231 
232     if (fh) {
233         if (fv) {
234             int tmp_h = h + 7;
235             int16_t mid[128 * 135], *mid_ptr = mid;
236 
237             src -= src_stride * 3;
238             do {
239                 for (int x = 0; x < w; x++)
240                     mid_ptr[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
241                                                        6 - intermediate_bits);
242 
243                 mid_ptr += 128;
244                 src += src_stride;
245             } while (--tmp_h);
246 
247             mid_ptr = mid + 128 * 3;
248             do {
249                 for (int x = 0; x < w; x++) {
250                     int t = DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6) -
251                                   PREP_BIAS;
252                     assert(t >= INT16_MIN && t <= INT16_MAX);
253                     tmp[x] = t;
254                 }
255 
256                 mid_ptr += 128;
257                 tmp += w;
258             } while (--h);
259         } else {
260             do {
261                 for (int x = 0; x < w; x++)
262                     tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
263                                                    6 - intermediate_bits) -
264                              PREP_BIAS;
265 
266                 tmp += w;
267                 src += src_stride;
268             } while (--h);
269         }
270     } else if (fv) {
271         do {
272             for (int x = 0; x < w; x++)
273                 tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fv, src_stride,
274                                                6 - intermediate_bits) -
275                          PREP_BIAS;
276 
277             tmp += w;
278             src += src_stride;
279         } while (--h);
280     } else
281         prep_c(tmp, src, src_stride, w, h HIGHBD_TAIL_SUFFIX);
282 }
283 
284 static NOINLINE void
prep_8tap_scaled_c(int16_t * tmp,const pixel * src,ptrdiff_t src_stride,const int w,int h,const int mx,int my,const int dx,const int dy,const int filter_type HIGHBD_DECL_SUFFIX)285 prep_8tap_scaled_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
286                    const int w, int h, const int mx, int my,
287                    const int dx, const int dy, const int filter_type
288                    HIGHBD_DECL_SUFFIX)
289 {
290     const int intermediate_bits = get_intermediate_bits(bitdepth_max);
291     int tmp_h = (((h - 1) * dy + my) >> 10) + 8;
292     int16_t mid[128 * (256 + 7)], *mid_ptr = mid;
293     src_stride = PXSTRIDE(src_stride);
294 
295     src -= src_stride * 3;
296     do {
297         int x;
298         int imx = mx, ioff = 0;
299 
300         for (x = 0; x < w; x++) {
301             GET_H_FILTER(imx >> 6);
302             mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1,
303                                                     6 - intermediate_bits) :
304                               src[ioff] << intermediate_bits;
305             imx += dx;
306             ioff += imx >> 10;
307             imx &= 0x3ff;
308         }
309 
310         mid_ptr += 128;
311         src += src_stride;
312     } while (--tmp_h);
313 
314     mid_ptr = mid + 128 * 3;
315     for (int y = 0; y < h; y++) {
316         int x;
317         GET_V_FILTER(my >> 6);
318 
319         for (x = 0; x < w; x++)
320             tmp[x] = (fv ? DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6)
321                          : mid_ptr[x]) - PREP_BIAS;
322 
323         my += dy;
324         mid_ptr += (my >> 10) * 128;
325         my &= 0x3ff;
326         tmp += w;
327     }
328 }
329 
330 #define filter_fns(type, type_h, type_v) \
331 static void put_8tap_##type##_c(pixel *const dst, \
332                                 const ptrdiff_t dst_stride, \
333                                 const pixel *const src, \
334                                 const ptrdiff_t src_stride, \
335                                 const int w, const int h, \
336                                 const int mx, const int my \
337                                 HIGHBD_DECL_SUFFIX) \
338 { \
339     put_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, \
340                type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
341 } \
342 static void put_8tap_##type##_scaled_c(pixel *const dst, \
343                                        const ptrdiff_t dst_stride, \
344                                        const pixel *const src, \
345                                        const ptrdiff_t src_stride, \
346                                        const int w, const int h, \
347                                        const int mx, const int my, \
348                                        const int dx, const int dy \
349                                        HIGHBD_DECL_SUFFIX) \
350 { \
351     put_8tap_scaled_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \
352                       type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
353 } \
354 static void prep_8tap_##type##_c(int16_t *const tmp, \
355                                  const pixel *const src, \
356                                  const ptrdiff_t src_stride, \
357                                  const int w, const int h, \
358                                  const int mx, const int my \
359                                  HIGHBD_DECL_SUFFIX) \
360 { \
361     prep_8tap_c(tmp, src, src_stride, w, h, mx, my, \
362                 type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
363 } \
364 static void prep_8tap_##type##_scaled_c(int16_t *const tmp, \
365                                         const pixel *const src, \
366                                         const ptrdiff_t src_stride, \
367                                         const int w, const int h, \
368                                         const int mx, const int my, \
369                                         const int dx, const int dy \
370                                         HIGHBD_DECL_SUFFIX) \
371 { \
372     prep_8tap_scaled_c(tmp, src, src_stride, w, h, mx, my, dx, dy, \
373                        type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
374 }
375 
filter_fns(regular,DAV1D_FILTER_8TAP_REGULAR,DAV1D_FILTER_8TAP_REGULAR)376 filter_fns(regular,        DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_REGULAR)
377 filter_fns(regular_sharp,  DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SHARP)
378 filter_fns(regular_smooth, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SMOOTH)
379 filter_fns(smooth,         DAV1D_FILTER_8TAP_SMOOTH,  DAV1D_FILTER_8TAP_SMOOTH)
380 filter_fns(smooth_regular, DAV1D_FILTER_8TAP_SMOOTH,  DAV1D_FILTER_8TAP_REGULAR)
381 filter_fns(smooth_sharp,   DAV1D_FILTER_8TAP_SMOOTH,  DAV1D_FILTER_8TAP_SHARP)
382 filter_fns(sharp,          DAV1D_FILTER_8TAP_SHARP,   DAV1D_FILTER_8TAP_SHARP)
383 filter_fns(sharp_regular,  DAV1D_FILTER_8TAP_SHARP,   DAV1D_FILTER_8TAP_REGULAR)
384 filter_fns(sharp_smooth,   DAV1D_FILTER_8TAP_SHARP,   DAV1D_FILTER_8TAP_SMOOTH)
385 
/*
 * Bilinear blend of src[x] and src[x + stride] with weight mxy out of 16:
 * 16 * a + mxy * (b - a), i.e. (16 - mxy) * a + mxy * b. No rounding/shift.
 * All arguments are parenthesized so that expression arguments expand safely.
 */
#define FILTER_BILIN(src, x, mxy, stride) \
    (16 * (src)[(x)] + ((mxy) * ((src)[(x) + (stride)] - (src)[(x)])))

/* Bilinear blend, rounded to nearest and shifted right by sh. */
#define FILTER_BILIN_RND(src, x, mxy, stride, sh) \
    ((FILTER_BILIN(src, x, mxy, stride) + ((1 << (sh)) >> 1)) >> (sh))

/* Same as FILTER_BILIN_RND, with the result passed through iclip_pixel(). */
#define FILTER_BILIN_CLIP(src, x, mxy, stride, sh) \
    iclip_pixel(FILTER_BILIN_RND(src, x, mxy, stride, sh))
394 
395 static void put_bilin_c(pixel *dst, ptrdiff_t dst_stride,
396                         const pixel *src, ptrdiff_t src_stride,
397                         const int w, int h, const int mx, const int my
398                         HIGHBD_DECL_SUFFIX)
399 {
400     const int intermediate_bits = get_intermediate_bits(bitdepth_max);
401     const int intermediate_rnd = (1 << intermediate_bits) >> 1;
402     dst_stride = PXSTRIDE(dst_stride);
403     src_stride = PXSTRIDE(src_stride);
404 
405     if (mx) {
406         if (my) {
407             int16_t mid[128 * 129], *mid_ptr = mid;
408             int tmp_h = h + 1;
409 
410             do {
411                 for (int x = 0; x < w; x++)
412                     mid_ptr[x] = FILTER_BILIN_RND(src, x, mx, 1,
413                                                   4 - intermediate_bits);
414 
415                 mid_ptr += 128;
416                 src += src_stride;
417             } while (--tmp_h);
418 
419             mid_ptr = mid;
420             do {
421                 for (int x = 0; x < w; x++)
422                     dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my, 128,
423                                                4 + intermediate_bits);
424 
425                 mid_ptr += 128;
426                 dst += dst_stride;
427             } while (--h);
428         } else {
429             do {
430                 for (int x = 0; x < w; x++) {
431                     const int px = FILTER_BILIN_RND(src, x, mx, 1,
432                                                     4 - intermediate_bits);
433                     dst[x] = iclip_pixel((px + intermediate_rnd) >> intermediate_bits);
434                 }
435 
436                 dst += dst_stride;
437                 src += src_stride;
438             } while (--h);
439         }
440     } else if (my) {
441         do {
442             for (int x = 0; x < w; x++)
443                 dst[x] = FILTER_BILIN_CLIP(src, x, my, src_stride, 4);
444 
445             dst += dst_stride;
446             src += src_stride;
447         } while (--h);
448     } else
449         put_c(dst, dst_stride, src, src_stride, w, h);
450 }
451 
/*
 * Scaled bilinear interpolation, writing clipped pixels to dst.
 *
 * mx / my are starting positions and dx / dy per-pixel / per-row steps, all
 * with 10 fractional bits (integer part: >> 10, fraction: & 0x3ff). The
 * bilinear weight is the fraction >> 6.
 *
 * Pass 1 blends horizontally into `mid` (row pitch 128); pass 2 blends
 * vertically from mid into dst. h must be >= 1 (do/while loops).
 */
static void put_bilin_scaled_c(pixel *dst, ptrdiff_t dst_stride,
                               const pixel *src, ptrdiff_t src_stride,
                               const int w, int h, const int mx, int my,
                               const int dx, const int dy
                               HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    // Source rows to filter: integer row offset of the last output row,
    // plus the 2 rows read by the vertical blend.
    int tmp_h = (((h - 1) * dy + my) >> 10) + 2;
    int16_t mid[128 * (256 + 1)], *mid_ptr = mid;

    do {
        int x;
        int imx = mx, ioff = 0;

        for (x = 0; x < w; x++) {
            mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1,
                                          4 - intermediate_bits);
            imx += dx;
            ioff += imx >> 10;
            imx &= 0x3ff;
        }

        mid_ptr += 128;
        src += PXSTRIDE(src_stride);
    } while (--tmp_h);

    mid_ptr = mid;
    do {
        int x;

        for (x = 0; x < w; x++)
            dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my >> 6, 128,
                                       4 + intermediate_bits);

        // Advance the vertical position; whole rows move mid_ptr.
        my += dy;
        mid_ptr += (my >> 10) * 128;
        my &= 0x3ff;
        dst += PXSTRIDE(dst_stride);
    } while (--h);
}
492 
/*
 * Unscaled bilinear interpolation producing int16_t intermediates.
 *
 * mx / my are the horizontal / vertical blend weights (out of 16); a weight
 * of 0 skips that direction, giving four paths: horizontal + vertical,
 * horizontal only, vertical only, and no filter via prep_c().
 * Results keep intermediate_bits of extra precision and have PREP_BIAS
 * subtracted. tmp is written as tightly packed rows (row pitch == w).
 *
 * src_stride is converted with PXSTRIDE() before use.
 * h must be >= 1 (do/while loops).
 */
static void prep_bilin_c(int16_t *tmp,
                         const pixel *src, ptrdiff_t src_stride,
                         const int w, int h, const int mx, const int my
                         HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    src_stride = PXSTRIDE(src_stride);

    if (mx) {
        if (my) {
            // h + 1 rows: the vertical blend reads row y and row y + 1.
            // mid uses a fixed row pitch of 128 entries.
            int16_t mid[128 * 129], *mid_ptr = mid;
            int tmp_h = h + 1;

            do {
                for (int x = 0; x < w; x++)
                    mid_ptr[x] = FILTER_BILIN_RND(src, x, mx, 1,
                                                  4 - intermediate_bits);

                mid_ptr += 128;
                src += src_stride;
            } while (--tmp_h);

            mid_ptr = mid;
            do {
                for (int x = 0; x < w; x++)
                    tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my, 128, 4) -
                             PREP_BIAS;

                mid_ptr += 128;
                tmp += w;
            } while (--h);
        } else {
            // Horizontal only.
            do {
                for (int x = 0; x < w; x++)
                    tmp[x] = FILTER_BILIN_RND(src, x, mx, 1,
                                              4 - intermediate_bits) -
                             PREP_BIAS;

                tmp += w;
                src += src_stride;
            } while (--h);
        }
    } else if (my) {
        // Vertical only: blend each sample with the one a row below.
        do {
            for (int x = 0; x < w; x++)
                tmp[x] = FILTER_BILIN_RND(src, x, my, src_stride,
                                          4 - intermediate_bits) - PREP_BIAS;

            tmp += w;
            src += src_stride;
        } while (--h);
    } else
        prep_c(tmp, src, src_stride, w, h HIGHBD_TAIL_SUFFIX);
}
547 
/*
 * Scaled bilinear interpolation producing int16_t intermediates.
 *
 * mx / my are starting positions and dx / dy per-pixel / per-row steps, all
 * with 10 fractional bits (integer part: >> 10, fraction: & 0x3ff). The
 * bilinear weight is the fraction >> 6.
 *
 * Pass 1 blends horizontally into `mid` (row pitch 128); pass 2 blends
 * vertically from mid into tmp and subtracts PREP_BIAS. tmp is written as
 * tightly packed rows (row pitch == w). h must be >= 1 (do/while loops).
 */
static void prep_bilin_scaled_c(int16_t *tmp,
                                const pixel *src, ptrdiff_t src_stride,
                                const int w, int h, const int mx, int my,
                                const int dx, const int dy HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    // Source rows to filter: integer row offset of the last output row,
    // plus the 2 rows read by the vertical blend.
    int tmp_h = (((h - 1) * dy + my) >> 10) + 2;
    int16_t mid[128 * (256 + 1)], *mid_ptr = mid;

    do {
        int x;
        int imx = mx, ioff = 0;

        for (x = 0; x < w; x++) {
            mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1,
                                          4 - intermediate_bits);
            imx += dx;
            ioff += imx >> 10;
            imx &= 0x3ff;
        }

        mid_ptr += 128;
        src += PXSTRIDE(src_stride);
    } while (--tmp_h);

    mid_ptr = mid;
    do {
        int x;

        for (x = 0; x < w; x++)
            tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my >> 6, 128, 4) - PREP_BIAS;

        // Advance the vertical position; whole rows move mid_ptr.
        my += dy;
        mid_ptr += (my >> 10) * 128;
        my &= 0x3ff;
        tmp += w;
    } while (--h);
}
586 
/*
 * Average two int16_t intermediate blocks into clipped pixels.
 *
 * tmp1 / tmp2 are read as tightly packed rows (row pitch == w).
 * The rounding constant also adds back PREP_BIAS once per input
 * (PREP_BIAS * 2) before the shift by intermediate_bits + 1.
 * h must be >= 1 (do/while loop).
 */
static void avg_c(pixel *dst, const ptrdiff_t dst_stride,
                  const int16_t *tmp1, const int16_t *tmp2, const int w, int h
                  HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    const int sh = intermediate_bits + 1;
    const int rnd = (1 << intermediate_bits) + PREP_BIAS * 2;
    do {
        for (int x = 0; x < w; x++)
            dst[x] = iclip_pixel((tmp1[x] + tmp2[x] + rnd) >> sh);

        tmp1 += w;
        tmp2 += w;
        dst += PXSTRIDE(dst_stride);
    } while (--h);
}
603 
/*
 * Weighted average of two int16_t intermediate blocks into clipped pixels.
 *
 * tmp1 is scaled by `weight` and tmp2 by (16 - weight), so the weights sum
 * to 16. The rounding constant adds back PREP_BIAS * 16 accordingly before
 * the shift by intermediate_bits + 4.
 * tmp1 / tmp2 are read as tightly packed rows (row pitch == w).
 * h must be >= 1 (do/while loop).
 */
static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
                    const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
                    const int weight HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    const int sh = intermediate_bits + 4;
    const int rnd = (8 << intermediate_bits) + PREP_BIAS * 16;
    do {
        for (int x = 0; x < w; x++)
            dst[x] = iclip_pixel((tmp1[x] * weight +
                                  tmp2[x] * (16 - weight) + rnd) >> sh);

        tmp1 += w;
        tmp2 += w;
        dst += PXSTRIDE(dst_stride);
    } while (--h);
}
621 
/*
 * Per-pixel masked blend of two int16_t intermediate blocks into clipped
 * pixels.
 *
 * mask[x] weights tmp1 and (64 - mask[x]) weights tmp2, so the weights sum
 * to 64. The rounding constant adds back PREP_BIAS * 64 accordingly before
 * the shift by intermediate_bits + 6.
 * tmp1 / tmp2 / mask are read as tightly packed rows (row pitch == w).
 * h must be >= 1 (do/while loop).
 */
static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
                   const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
                   const uint8_t *mask HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    const int sh = intermediate_bits + 6;
    const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64;
    do {
        for (int x = 0; x < w; x++)
            dst[x] = iclip_pixel((tmp1[x] * mask[x] +
                                  tmp2[x] * (64 - mask[x]) + rnd) >> sh);

        tmp1 += w;
        tmp2 += w;
        mask += w;
        dst += PXSTRIDE(dst_stride);
    } while (--h);
}
640 
/*
 * Blend a and b with weight m out of 64, rounded to nearest:
 * (a * (64 - m) + b * m + 32) >> 6. m == 0 yields a, m == 64 yields b.
 * Arguments are parenthesized so that expression arguments expand safely.
 */
#define blend_px(a, b, m) ((((a) * (64 - (m)) + (b) * (m)) + 32) >> 6)
/*
 * Blend tmp into dst in place using a per-pixel mask:
 * dst[x] = blend_px(dst[x], tmp[x], mask[x]).
 *
 * tmp and mask are read as tightly packed rows (row pitch == w); dst
 * advances by PXSTRIDE(dst_stride) per row.
 * h must be >= 1 (do/while loop).
 */
static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
                    const int w, int h, const uint8_t *mask)
{
    do {
        for (int x = 0; x < w; x++) {
            dst[x] = blend_px(dst[x], tmp[x], mask[x]);
        }
        dst += PXSTRIDE(dst_stride);
        tmp += w;
        mask += w;
    } while (--h);
}
654 
/*
 * Blend tmp into dst in place with a mask that varies per column and is the
 * same for every row. The mask is read from dav1d_obmc_masks starting at
 * index w.
 *
 * Only the first (w * 3) >> 2 columns of each row are modified; the rest of
 * dst is left untouched. tmp is read with a row pitch of w.
 * h must be >= 1 (do/while loop).
 */
static void blend_v_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
                      const int w, int h)
{
    const uint8_t *const mask = &dav1d_obmc_masks[w];
    do {
        for (int x = 0; x < (w * 3) >> 2; x++) {
            dst[x] = blend_px(dst[x], tmp[x], mask[x]);
        }
        dst += PXSTRIDE(dst_stride);
        tmp += w;
    } while (--h);
}
667 
/*
 * Blend tmp into dst in place with a mask that varies per row and is the
 * same for every column. The mask is read from dav1d_obmc_masks starting at
 * index h (the original height), one entry per row.
 *
 * Only the first (h * 3) >> 2 rows are processed. The loop is a do/while,
 * so that reduced row count must be >= 1.
 */
static void blend_h_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
                      const int w, int h)
{
    const uint8_t *mask = &dav1d_obmc_masks[h];
    h = (h * 3) >> 2;
    do {
        const int m = *mask++;
        for (int x = 0; x < w; x++) {
            dst[x] = blend_px(dst[x], tmp[x], m);
        }
        dst += PXSTRIDE(dst_stride);
        tmp += w;
    } while (--h);
}
682 
/*
 * Blend two int16_t intermediate blocks with a weight derived from their
 * absolute difference, and write the derived weights to `mask`.
 *
 * Per pixel, m = min(38 + ((|tmp1 - tmp2| + mask_rnd) >> mask_sh), 64);
 * tmp1 is weighted by m and tmp2 by (64 - m).
 *
 * Mask output layout depends on ss_hor / ss_ver:
 *  - ss_hor == 0: one mask byte per pixel (m).
 *  - ss_hor == 1, ss_ver == 0: one byte per horizontal pixel pair,
 *    (m + n + 1 - sign) >> 1.
 *  - ss_hor == 1, ss_ver == 1: one byte per 2x2 group. When h (which counts
 *    down) is even, the 2x1 sum m + n is stored; when h is odd, that stored
 *    sum is combined with the current pair as
 *    (m + n + stored + 2 - sign) >> 2, and only then does `mask` advance.
 *
 * tmp1 / tmp2 are read as tightly packed rows (row pitch == w).
 * h must be >= 1 (do/while loop).
 */
static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
                     const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
                     uint8_t *mask, const int sign,
                     const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX)
{
    // store mask at 2x2 resolution, i.e. store 2x1 sum for even rows,
    // and then load this intermediate to calculate final value for odd rows
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    const int bitdepth = bitdepth_from_max(bitdepth_max);
    const int sh = intermediate_bits + 6;
    // Rounding for the >> sh, plus PREP_BIAS restored for weights summing to 64.
    const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64;
    const int mask_sh = bitdepth + intermediate_bits - 4;
    const int mask_rnd = 1 << (mask_sh - 5);
    do {
        for (int x = 0; x < w; x++) {
            const int m = imin(38 + ((abs(tmp1[x] - tmp2[x]) + mask_rnd) >> mask_sh), 64);
            dst[x] = iclip_pixel((tmp1[x] * m +
                                  tmp2[x] * (64 - m) + rnd) >> sh);

            if (ss_hor) {
                // Horizontal subsampling: handle the second pixel of the
                // pair in the same iteration (x is advanced here as well as
                // by the for loop).
                x++;

                const int n = imin(38 + ((abs(tmp1[x] - tmp2[x]) + mask_rnd) >> mask_sh), 64);
                dst[x] = iclip_pixel((tmp1[x] * n +
                                      tmp2[x] * (64 - n) + rnd) >> sh);

                if (h & ss_ver) {
                    // Second row of a 2x2 group: finish the average.
                    mask[x >> 1] = (m + n + mask[x >> 1] + 2 - sign) >> 2;
                } else if (ss_ver) {
                    // First row of a 2x2 group: park the 2x1 sum.
                    mask[x >> 1] = m + n;
                } else {
                    mask[x >> 1] = (m + n + 1 - sign) >> 1;
                }
            } else {
                mask[x] = m;
            }
        }

        tmp1 += w;
        tmp2 += w;
        dst += PXSTRIDE(dst_stride);
        // With vertical subsampling the mask row is shared by two rows.
        if (!ss_ver || (h & 1)) mask += w >> ss_hor;
    } while (--h);
}
727 
728 #define w_mask_fns(ssn, ss_hor, ss_ver) \
729 static void w_mask_##ssn##_c(pixel *const dst, const ptrdiff_t dst_stride, \
730                              const int16_t *const tmp1, const int16_t *const tmp2, \
731                              const int w, const int h, uint8_t *mask, \
732                              const int sign HIGHBD_DECL_SUFFIX) \
733 { \
734     w_mask_c(dst, dst_stride, tmp1, tmp2, w, h, mask, sign, ss_hor, ss_ver \
735              HIGHBD_TAIL_SUFFIX); \
736 }
737 
738 w_mask_fns(444, 0, 0);
739 w_mask_fns(422, 1, 0);
740 w_mask_fns(420, 1, 1);
741 
742 #undef w_mask_fns
743 
/*
 * 8-tap FIR used by the warp functions: reads src[x - 3*stride] ..
 * src[x + 4*stride], weights them with F[0..7], rounds to nearest and
 * shifts right by sh.
 * All arguments are parenthesized so that expression arguments expand safely.
 */
#define FILTER_WARP_RND(src, x, F, stride, sh) \
    (((F)[0] * (src)[(x) - 3 * (stride)] + \
      (F)[1] * (src)[(x) - 2 * (stride)] + \
      (F)[2] * (src)[(x) - 1 * (stride)] + \
      (F)[3] * (src)[(x) + 0 * (stride)] + \
      (F)[4] * (src)[(x) + 1 * (stride)] + \
      (F)[5] * (src)[(x) + 2 * (stride)] + \
      (F)[6] * (src)[(x) + 3 * (stride)] + \
      (F)[7] * (src)[(x) + 4 * (stride)] + \
      ((1 << (sh)) >> 1)) >> (sh))

/* Same as FILTER_WARP_RND, with the result passed through iclip_pixel(). */
#define FILTER_WARP_CLIP(src, x, F, stride, sh) \
    iclip_pixel(FILTER_WARP_RND(src, x, F, stride, sh))
757 
/*
 * Warp one 8x8 block, writing clipped pixels to dst.
 *
 * Pass 1 filters 15 source rows (3 above and 4 below the 8 output rows)
 * horizontally into `mid` (row pitch 8). Pass 2 filters mid vertically
 * into dst.
 *
 * The filter is re-selected for every sample from dav1d_mc_warp_filter,
 * indexed by 64 + ((pos + 512) >> 10). The horizontal position starts at mx
 * and steps by abcd[0] per column and abcd[1] per row; the vertical position
 * starts at my and steps by abcd[2] per column and abcd[3] per row.
 */
static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
                              const pixel *src, const ptrdiff_t src_stride,
                              const int16_t *const abcd, int mx, int my
                              HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    int16_t mid[15 * 8], *mid_ptr = mid;

    // Start 3 rows above the block for the vertical filter lead-in.
    src -= 3 * PXSTRIDE(src_stride);
    for (int y = 0; y < 15; y++, mx += abcd[1]) {
        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
            const int8_t *const filter =
                dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];

            mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1,
                                         7 - intermediate_bits);
        }
        src += PXSTRIDE(src_stride);
        mid_ptr += 8;
    }

    // Skip the 3 lead-in rows so tap 3 lines up with output row 0.
    mid_ptr = &mid[3 * 8];
    for (int y = 0; y < 8; y++, my += abcd[3]) {
        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
            const int8_t *const filter =
                dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];

            dst[x] = FILTER_WARP_CLIP(mid_ptr, x, filter, 8,
                                      7 + intermediate_bits);
        }
        mid_ptr += 8;
        dst += PXSTRIDE(dst_stride);
    }
}
792 
/*
 * Warps a single 8x8 block like the pixel-output variant, but stores the
 * result as 16-bit intermediates in tmp (rows tmp_stride elements apart)
 * instead of clipped pixels: the vertical pass uses a fixed shift of 7 and
 * subtracts PREP_BIAS.
 *
 * The horizontal filter position starts at mx and advances by abcd[0] per
 * column and abcd[1] per row; the vertical position starts at my and advances
 * by abcd[2] per column and abcd[3] per row.
 */
static void warp_affine_8x8t_c(int16_t *tmp, const ptrdiff_t tmp_stride,
                               const pixel *src, const ptrdiff_t src_stride,
                               const int16_t *const abcd, int mx, int my
                               HIGHBD_DECL_SUFFIX)
{
    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    const int h_shift = 7 - intermediate_bits;
    int16_t mid[15 * 8];

    // Horizontal pass. The vertical filter reads 3 rows above and 4 rows
    // below each output row, hence 8 + 7 = 15 rows starting 3 rows up.
    const pixel *src_row = src - 3 * PXSTRIDE(src_stride);
    int16_t *mid_out = mid;
    for (int row = 0; row < 15; row++) {
        int pos = mx;
        for (int col = 0; col < 8; col++) {
            const int8_t *const taps =
                dav1d_mc_warp_filter[64 + ((pos + 512) >> 10)];

            mid_out[col] = FILTER_WARP_RND(src_row, col, taps, 1, h_shift);
            pos += abcd[0];
        }
        mx += abcd[1];
        src_row += PXSTRIDE(src_stride);
        mid_out += 8;
    }

    // Vertical pass, starting at the 4th intermediate row so that the taps
    // at -3..+4 rows (row stride 8) stay inside mid[].
    const int16_t *mid_in = &mid[3 * 8];
    for (int row = 0; row < 8; row++) {
        int pos = my;
        for (int col = 0; col < 8; col++) {
            const int8_t *const taps =
                dav1d_mc_warp_filter[64 + ((pos + 512) >> 10)];

            tmp[col] = FILTER_WARP_RND(mid_in, col, taps, 8, 7) - PREP_BIAS;
            pos += abcd[2];
        }
        my += abcd[3];
        mid_in += 8;
        tmp += tmp_stride;
    }
}
826 
/*
 * Builds a bw x bh block in dst for a block whose top-left corner lies at
 * (x, y) relative to a reference of iw x ih samples. The part of the block
 * that overlaps the reference is copied; everything outside is filled by
 * repeating the nearest copied sample (left/right) or row (top/bottom).
 */
static void emu_edge_c(const intptr_t bw, const intptr_t bh,
                       const intptr_t iw, const intptr_t ih,
                       const intptr_t x, const intptr_t y,
                       pixel *dst, const ptrdiff_t dst_stride,
                       const pixel *ref, const ptrdiff_t ref_stride)
{
    const ptrdiff_t dst_px_stride = PXSTRIDE(dst_stride);
    const ptrdiff_t ref_px_stride = PXSTRIDE(ref_stride);

    // first reference sample to copy: (x, y) clamped into the reference
    const pixel *ref_row = ref + (iclip((int) y, 0, (int) ih - 1) * ref_px_stride +
                                  iclip((int) x, 0, (int) iw - 1));

    // how many columns/rows of the block fall outside on each side
    const int left_ext   = iclip((int) -x, 0, (int) bw - 1);
    const int right_ext  = iclip((int) (x + bw - iw), 0, (int) bw - 1);
    assert(left_ext + right_ext < bw);
    const int top_ext    = iclip((int) -y, 0, (int) bh - 1);
    const int bottom_ext = iclip((int) (y + bh - ih), 0, (int) bh - 1);
    assert(top_ext + bottom_ext < bh);

    const int center_w = (int) (bw - left_ext - right_ext);
    const int center_h = (int) (bh - top_ext - bottom_ext);

    // rows that overlap the reference: copy the middle, then pad both sides
    pixel *const first_center_row = dst + top_ext * dst_px_stride;
    pixel *row = first_center_row;
    for (int i = 0; i < center_h; i++) {
        pixel *const visible = row + left_ext;

        pixel_copy(visible, ref_row, center_w);
        if (left_ext) // repeat the first copied sample to the left
            pixel_set(row, visible[0], left_ext);
        if (right_ext) // repeat the last copied sample to the right
            pixel_set(visible + center_w, visible[center_w - 1], right_ext);

        ref_row += ref_px_stride;
        row += dst_px_stride;
    }

    // rows above: each one is a copy of the first (already padded) center row
    pixel *out = dst;
    for (int i = 0; i < top_ext; i++) {
        pixel_copy(out, first_center_row, bw);
        out += dst_px_stride;
    }

    // rows below: skip the center rows, then keep repeating the previous row
    out += center_h * dst_px_stride;
    for (int i = 0; i < bottom_ext; i++) {
        pixel_copy(out, out - dst_px_stride, bw);
        out += dst_px_stride;
    }
}
876 
/*
 * Horizontally resamples h rows of src_w samples into rows of dst_w pixels.
 *
 * For every output pixel an 8-tap filter is chosen from dav1d_resize_filter[]
 * by mx >> 8 and applied to the samples at src_x - 3 .. src_x + 4, with the
 * sample positions clamped to [0, src_w - 1]. The weighted sum is negated,
 * rounded (+64), shifted right by 7 and clipped with iclip_pixel().
 * After each pixel mx grows by dx; mx >> 14 is added to src_x and only the
 * low 14 bits of mx are kept. Each row restarts at mx0 with src_x = -1.
 */
static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
                     const pixel *src, const ptrdiff_t src_stride,
                     const int dst_w, int h, const int src_w,
                     const int dx, const int mx0 HIGHBD_DECL_SUFFIX)
{
    const int last_x = src_w - 1;

    do {
        int frac = mx0;
        int pos = -1;

        for (int out_x = 0; out_x < dst_w; out_x++) {
            const int8_t *const taps = dav1d_resize_filter[frac >> 8];
            int acc = 0;

            for (int k = 0; k < 8; k++)
                acc += taps[k] * src[iclip(pos + k - 3, 0, last_x)];
            dst[out_x] = iclip_pixel((64 - acc) >> 7);

            // step the source position: integer part moves pos, the
            // remaining 14 bits stay in frac
            frac += dx;
            pos += frac >> 14;
            frac &= 0x3fff;
        }

        dst += PXSTRIDE(dst_stride);
        src += PXSTRIDE(src_stride);
    } while (--h);
}
904 
bitfn(dav1d_mc_dsp_init)905 COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
906 #define init_mc_fns(type, name) do { \
907     c->mc        [type] = put_##name##_c; \
908     c->mc_scaled [type] = put_##name##_scaled_c; \
909     c->mct       [type] = prep_##name##_c; \
910     c->mct_scaled[type] = prep_##name##_scaled_c; \
911 } while (0)
912 
913     init_mc_fns(FILTER_2D_8TAP_REGULAR,        8tap_regular);
914     init_mc_fns(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth);
915     init_mc_fns(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp);
916     init_mc_fns(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular);
917     init_mc_fns(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth);
918     init_mc_fns(FILTER_2D_8TAP_SHARP,          8tap_sharp);
919     init_mc_fns(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular);
920     init_mc_fns(FILTER_2D_8TAP_SMOOTH,         8tap_smooth);
921     init_mc_fns(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp);
922     init_mc_fns(FILTER_2D_BILINEAR,            bilin);
923 
924     c->avg      = avg_c;
925     c->w_avg    = w_avg_c;
926     c->mask     = mask_c;
927     c->blend    = blend_c;
928     c->blend_v  = blend_v_c;
929     c->blend_h  = blend_h_c;
930     c->w_mask[0] = w_mask_444_c;
931     c->w_mask[1] = w_mask_422_c;
932     c->w_mask[2] = w_mask_420_c;
933     c->warp8x8  = warp_affine_8x8_c;
934     c->warp8x8t = warp_affine_8x8t_c;
935     c->emu_edge = emu_edge_c;
936     c->resize   = resize_c;
937 
938 #if HAVE_ASM
939 #if ARCH_AARCH64 || ARCH_ARM
940     bitfn(dav1d_mc_dsp_init_arm)(c);
941 #elif ARCH_X86
942     bitfn(dav1d_mc_dsp_init_x86)(c);
943 #endif
944 #endif
945 }
946