1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <memory.h>
15 #include <math.h>
16 #include <assert.h>
17 
18 #include "./av1_rtcd.h"
19 #include "av1/common/warped_motion.h"
20 #include "av1/common/scale.h"
21 
22 #define WARP_ERROR_BLOCK 32
23 
24 /* clang-format off */
// Lookup table mapping a signed pixel difference to an error weight of
// roughly 16384 * (|diff| / 255)^0.7. The table has 512 entries with the
// zero-error point at index 255, so it is indexed as lut[255 + diff] for
// diff in [-255, 256]; values are symmetric about that center.
// NOTE(review): indexing convention inferred from the table layout (zero at
// index 255) — confirm against the caller that accumulates warp error.
static const int error_measure_lut[512] = {
  // pow 0.7
  16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
  16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
  15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
  15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
  14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
  14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
  14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
  13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
  13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
  12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
  12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
  12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
  11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
  11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
  10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
  10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
  10058, 10002,  9947,  9891,  9835,  9779,  9723,  9666,
  9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
  9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
  8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
  8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
  7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
  7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
  6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
  6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
  5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
  5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
  4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
  3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
  3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
  2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
  1323, 1187, 1045,  894,  731,  550,  339,    0,
  339,  550,  731,  894, 1045, 1187, 1323, 1452,
  1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
  2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
  3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
  3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
  4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
  5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
  5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
  6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
  6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
  7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
  7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
  8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
  8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
  9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
  9723,  9779,  9835,  9891,  9947, 10002, 10058, 10113,
  10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
  10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
  11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
  11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
  11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
  12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
  12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
  13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
  13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
  13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
  14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
  14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
  15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
  15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
  15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
  16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
};
92 /* clang-format on */
93 
get_project_points_type(TransformationType type)94 static ProjectPointsFunc get_project_points_type(TransformationType type) {
95   switch (type) {
96     case VERTRAPEZOID: return project_points_vertrapezoid;
97     case HORTRAPEZOID: return project_points_hortrapezoid;
98     case HOMOGRAPHY: return project_points_homography;
99     case AFFINE: return project_points_affine;
100     case ROTZOOM: return project_points_rotzoom;
101     case TRANSLATION: return project_points_translation;
102     default: assert(0); return NULL;
103   }
104 }
105 
project_points_translation(const int32_t * mat,int * points,int * proj,const int n,const int stride_points,const int stride_proj,const int subsampling_x,const int subsampling_y)106 void project_points_translation(const int32_t *mat, int *points, int *proj,
107                                 const int n, const int stride_points,
108                                 const int stride_proj, const int subsampling_x,
109                                 const int subsampling_y) {
110   int i;
111   for (i = 0; i < n; ++i) {
112     const int x = *(points++), y = *(points++);
113     if (subsampling_x)
114       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
115           ((x * (1 << (WARPEDMODEL_PREC_BITS + 1))) + mat[0]),
116           WARPEDDIFF_PREC_BITS + 1);
117     else
118       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
119           ((x * (1 << WARPEDMODEL_PREC_BITS)) + mat[0]), WARPEDDIFF_PREC_BITS);
120     if (subsampling_y)
121       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
122           ((y * (1 << (WARPEDMODEL_PREC_BITS + 1))) + mat[1]),
123           WARPEDDIFF_PREC_BITS + 1);
124     else
125       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
126           ((y * (1 << WARPEDMODEL_PREC_BITS))) + mat[1], WARPEDDIFF_PREC_BITS);
127     points += stride_points - 2;
128     proj += stride_proj - 2;
129   }
130 }
131 
project_points_rotzoom(const int32_t * mat,int * points,int * proj,const int n,const int stride_points,const int stride_proj,const int subsampling_x,const int subsampling_y)132 void project_points_rotzoom(const int32_t *mat, int *points, int *proj,
133                             const int n, const int stride_points,
134                             const int stride_proj, const int subsampling_x,
135                             const int subsampling_y) {
136   int i;
137   for (i = 0; i < n; ++i) {
138     const int x = *(points++), y = *(points++);
139     if (subsampling_x)
140       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
141           mat[2] * 2 * x + mat[3] * 2 * y + mat[0] +
142               (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
143           WARPEDDIFF_PREC_BITS + 1);
144     else
145       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[0],
146                                             WARPEDDIFF_PREC_BITS);
147     if (subsampling_y)
148       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
149           -mat[3] * 2 * x + mat[2] * 2 * y + mat[1] +
150               (-mat[3] + mat[2] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
151           WARPEDDIFF_PREC_BITS + 1);
152     else
153       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(-mat[3] * x + mat[2] * y + mat[1],
154                                             WARPEDDIFF_PREC_BITS);
155     points += stride_points - 2;
156     proj += stride_proj - 2;
157   }
158 }
159 
project_points_affine(const int32_t * mat,int * points,int * proj,const int n,const int stride_points,const int stride_proj,const int subsampling_x,const int subsampling_y)160 void project_points_affine(const int32_t *mat, int *points, int *proj,
161                            const int n, const int stride_points,
162                            const int stride_proj, const int subsampling_x,
163                            const int subsampling_y) {
164   int i;
165   for (i = 0; i < n; ++i) {
166     const int x = *(points++), y = *(points++);
167     if (subsampling_x)
168       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
169           mat[2] * 2 * x + mat[3] * 2 * y + mat[0] +
170               (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
171           WARPEDDIFF_PREC_BITS + 1);
172     else
173       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[0],
174                                             WARPEDDIFF_PREC_BITS);
175     if (subsampling_y)
176       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
177           mat[4] * 2 * x + mat[5] * 2 * y + mat[1] +
178               (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
179           WARPEDDIFF_PREC_BITS + 1);
180     else
181       *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[4] * x + mat[5] * y + mat[1],
182                                             WARPEDDIFF_PREC_BITS);
183     points += stride_points - 2;
184     proj += stride_proj - 2;
185   }
186 }
187 
project_points_hortrapezoid(const int32_t * mat,int * points,int * proj,const int n,const int stride_points,const int stride_proj,const int subsampling_x,const int subsampling_y)188 void project_points_hortrapezoid(const int32_t *mat, int *points, int *proj,
189                                  const int n, const int stride_points,
190                                  const int stride_proj, const int subsampling_x,
191                                  const int subsampling_y) {
192   int i;
193   int64_t x, y, Z;
194   int64_t xp, yp;
195   for (i = 0; i < n; ++i) {
196     x = *(points++), y = *(points++);
197     x = (subsampling_x ? 4 * x + 1 : 2 * x);
198     y = (subsampling_y ? 4 * y + 1 : 2 * y);
199 
200     Z = (mat[7] * y + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
201     xp = (mat[2] * x + mat[3] * y + 2 * mat[0]) *
202          (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
203                 WARPEDMODEL_PREC_BITS));
204     yp = (mat[5] * y + 2 * mat[1]) *
205          (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
206                 WARPEDMODEL_PREC_BITS));
207 
208     xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
209     yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;
210 
211     if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
212     if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
213     *(proj++) = (int)xp;
214     *(proj++) = (int)yp;
215 
216     points += stride_points - 2;
217     proj += stride_proj - 2;
218   }
219 }
220 
project_points_vertrapezoid(const int32_t * mat,int * points,int * proj,const int n,const int stride_points,const int stride_proj,const int subsampling_x,const int subsampling_y)221 void project_points_vertrapezoid(const int32_t *mat, int *points, int *proj,
222                                  const int n, const int stride_points,
223                                  const int stride_proj, const int subsampling_x,
224                                  const int subsampling_y) {
225   int i;
226   int64_t x, y, Z;
227   int64_t xp, yp;
228   for (i = 0; i < n; ++i) {
229     x = *(points++), y = *(points++);
230     x = (subsampling_x ? 4 * x + 1 : 2 * x);
231     y = (subsampling_y ? 4 * y + 1 : 2 * y);
232 
233     Z = (mat[6] * x + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
234     xp = (mat[2] * x + 2 * mat[0]) *
235          (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
236                 WARPEDMODEL_PREC_BITS));
237     yp = (mat[4] * x + mat[5] * y + 2 * mat[1]) *
238          (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
239                 WARPEDMODEL_PREC_BITS));
240 
241     xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
242     yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;
243 
244     if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
245     if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
246     *(proj++) = (int)xp;
247     *(proj++) = (int)yp;
248 
249     points += stride_points - 2;
250     proj += stride_proj - 2;
251   }
252 }
253 
project_points_homography(const int32_t * mat,int * points,int * proj,const int n,const int stride_points,const int stride_proj,const int subsampling_x,const int subsampling_y)254 void project_points_homography(const int32_t *mat, int *points, int *proj,
255                                const int n, const int stride_points,
256                                const int stride_proj, const int subsampling_x,
257                                const int subsampling_y) {
258   int i;
259   int64_t x, y, Z;
260   int64_t xp, yp;
261   for (i = 0; i < n; ++i) {
262     x = *(points++), y = *(points++);
263     x = (subsampling_x ? 4 * x + 1 : 2 * x);
264     y = (subsampling_y ? 4 * y + 1 : 2 * y);
265 
266     Z = (mat[6] * x + mat[7] * y + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
267     xp = (mat[2] * x + mat[3] * y + 2 * mat[0]) *
268          (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
269                 WARPEDMODEL_PREC_BITS));
270     yp = (mat[4] * x + mat[5] * y + 2 * mat[1]) *
271          (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
272                 WARPEDMODEL_PREC_BITS));
273 
274     xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
275     yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;
276 
277     if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
278     if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
279     *(proj++) = (int)xp;
280     *(proj++) = (int)yp;
281 
282     points += stride_points - 2;
283     proj += stride_proj - 2;
284   }
285 }
286 
// 6-tap sub-pel interpolation filters, one row per fractional position in
// [0, 1). Each row of taps sums to 128 (unity gain at the filter's fixed
// precision); the second half of the table mirrors the first with the taps
// reversed.
static const int16_t
    filter_ntap[WARPEDPIXEL_PREC_SHIFTS][WARPEDPIXEL_FILTER_TAPS] = {
#if WARPEDPIXEL_PREC_BITS == 6
      { 0, 0, 128, 0, 0, 0 },      { 0, -1, 128, 2, -1, 0 },
      { 1, -3, 127, 4, -1, 0 },    { 1, -4, 126, 6, -2, 1 },
      { 1, -5, 126, 8, -3, 1 },    { 1, -6, 125, 11, -4, 1 },
      { 1, -7, 124, 13, -4, 1 },   { 2, -8, 123, 15, -5, 1 },
      { 2, -9, 122, 18, -6, 1 },   { 2, -10, 121, 20, -6, 1 },
      { 2, -11, 120, 22, -7, 2 },  { 2, -12, 119, 25, -8, 2 },
      { 3, -13, 117, 27, -8, 2 },  { 3, -13, 116, 29, -9, 2 },
      { 3, -14, 114, 32, -10, 3 }, { 3, -15, 113, 35, -10, 2 },
      { 3, -15, 111, 37, -11, 3 }, { 3, -16, 109, 40, -11, 3 },
      { 3, -16, 108, 42, -12, 3 }, { 4, -17, 106, 45, -13, 3 },
      { 4, -17, 104, 47, -13, 3 }, { 4, -17, 102, 50, -14, 3 },
      { 4, -17, 100, 52, -14, 3 }, { 4, -18, 98, 55, -15, 4 },
      { 4, -18, 96, 58, -15, 3 },  { 4, -18, 94, 60, -16, 4 },
      { 4, -18, 91, 63, -16, 4 },  { 4, -18, 89, 65, -16, 4 },
      { 4, -18, 87, 68, -17, 4 },  { 4, -18, 85, 70, -17, 4 },
      { 4, -18, 82, 73, -17, 4 },  { 4, -18, 80, 75, -17, 4 },
      { 4, -18, 78, 78, -18, 4 },  { 4, -17, 75, 80, -18, 4 },
      { 4, -17, 73, 82, -18, 4 },  { 4, -17, 70, 85, -18, 4 },
      { 4, -17, 68, 87, -18, 4 },  { 4, -16, 65, 89, -18, 4 },
      { 4, -16, 63, 91, -18, 4 },  { 4, -16, 60, 94, -18, 4 },
      { 3, -15, 58, 96, -18, 4 },  { 4, -15, 55, 98, -18, 4 },
      { 3, -14, 52, 100, -17, 4 }, { 3, -14, 50, 102, -17, 4 },
      { 3, -13, 47, 104, -17, 4 }, { 3, -13, 45, 106, -17, 4 },
      { 3, -12, 42, 108, -16, 3 }, { 3, -11, 40, 109, -16, 3 },
      { 3, -11, 37, 111, -15, 3 }, { 2, -10, 35, 113, -15, 3 },
      { 3, -10, 32, 114, -14, 3 }, { 2, -9, 29, 116, -13, 3 },
      { 2, -8, 27, 117, -13, 3 },  { 2, -8, 25, 119, -12, 2 },
      { 2, -7, 22, 120, -11, 2 },  { 1, -6, 20, 121, -10, 2 },
      { 1, -6, 18, 122, -9, 2 },   { 1, -5, 15, 123, -8, 2 },
      { 1, -4, 13, 124, -7, 1 },   { 1, -4, 11, 125, -6, 1 },
      { 1, -3, 8, 126, -5, 1 },    { 1, -2, 6, 126, -4, 1 },
      { 0, -1, 4, 127, -3, 1 },    { 0, -1, 2, 128, -1, 0 },
#elif WARPEDPIXEL_PREC_BITS == 5
      { 0, 0, 128, 0, 0, 0 },      { 1, -3, 127, 4, -1, 0 },
      { 1, -5, 126, 8, -3, 1 },    { 1, -7, 124, 13, -4, 1 },
      { 2, -9, 122, 18, -6, 1 },   { 2, -11, 120, 22, -7, 2 },
      { 3, -13, 117, 27, -8, 2 },  { 3, -14, 114, 32, -10, 3 },
      { 3, -15, 111, 37, -11, 3 }, { 3, -16, 108, 42, -12, 3 },
      { 4, -17, 104, 47, -13, 3 }, { 4, -17, 100, 52, -14, 3 },
      { 4, -18, 96, 58, -15, 3 },  { 4, -18, 91, 63, -16, 4 },
      { 4, -18, 87, 68, -17, 4 },  { 4, -18, 82, 73, -17, 4 },
      { 4, -18, 78, 78, -18, 4 },  { 4, -17, 73, 82, -18, 4 },
      { 4, -17, 68, 87, -18, 4 },  { 4, -16, 63, 91, -18, 4 },
      { 3, -15, 58, 96, -18, 4 },  { 3, -14, 52, 100, -17, 4 },
      { 3, -13, 47, 104, -17, 4 }, { 3, -12, 42, 108, -16, 3 },
      { 3, -11, 37, 111, -15, 3 }, { 3, -10, 32, 114, -14, 3 },
      { 2, -8, 27, 117, -13, 3 },  { 2, -7, 22, 120, -11, 2 },
      { 1, -6, 18, 122, -9, 2 },   { 1, -4, 13, 124, -7, 1 },
      { 1, -3, 8, 126, -5, 1 },    { 0, -1, 4, 127, -3, 1 },
#endif  // WARPEDPIXEL_PREC_BITS == 6
    };
341 
do_ntap_filter(const int32_t * const p,int x)342 static int32_t do_ntap_filter(const int32_t *const p, int x) {
343   int i;
344   int32_t sum = 0;
345   for (i = 0; i < WARPEDPIXEL_FILTER_TAPS; ++i) {
346     sum += p[i - WARPEDPIXEL_FILTER_TAPS / 2 + 1] * filter_ntap[x][i];
347   }
348   return sum;
349 }
350 
do_cubic_filter(const int32_t * const p,int x)351 static int32_t do_cubic_filter(const int32_t *const p, int x) {
352   if (x == 0) {
353     return p[0] * (1 << WARPEDPIXEL_FILTER_BITS);
354   } else if (x == (1 << WARPEDPIXEL_PREC_BITS)) {
355     return p[1] * (1 << WARPEDPIXEL_FILTER_BITS);
356   } else {
357     const int64_t v1 = (int64_t)x * x * x * (3 * (p[0] - p[1]) + p[2] - p[-1]);
358     const int64_t v2 =
359         (int64_t)x * x * (2 * p[-1] - 5 * p[0] + 4 * p[1] - p[2]);
360     const int64_t v3 = x * (p[1] - p[-1]);
361     const int64_t v4 = 2 * p[0];
362     return (int32_t)ROUND_POWER_OF_TWO_SIGNED(
363         (v4 * (1 << (3 * WARPEDPIXEL_PREC_BITS))) +
364             (v3 * (1 << (2 * WARPEDPIXEL_PREC_BITS))) +
365             (v2 * (1 << WARPEDPIXEL_PREC_BITS)) + v1,
366         3 * WARPEDPIXEL_PREC_BITS + 1 - WARPEDPIXEL_FILTER_BITS);
367   }
368 }
369 
get_subcolumn(int taps,const uint8_t * const ref,int32_t * col,int stride,int x,int y_start)370 static INLINE void get_subcolumn(int taps, const uint8_t *const ref,
371                                  int32_t *col, int stride, int x, int y_start) {
372   int i;
373   for (i = 0; i < taps; ++i) {
374     col[i] = ref[(i + y_start) * stride + x];
375   }
376 }
377 
bi_ntap_filter(const uint8_t * const ref,int x,int y,int stride)378 static uint8_t bi_ntap_filter(const uint8_t *const ref, int x, int y,
379                               int stride) {
380   int32_t val, arr[WARPEDPIXEL_FILTER_TAPS];
381   int k;
382   const int i = (int)x >> WARPEDPIXEL_PREC_BITS;
383   const int j = (int)y >> WARPEDPIXEL_PREC_BITS;
384   for (k = 0; k < WARPEDPIXEL_FILTER_TAPS; ++k) {
385     int32_t arr_temp[WARPEDPIXEL_FILTER_TAPS];
386     get_subcolumn(WARPEDPIXEL_FILTER_TAPS, ref, arr_temp, stride,
387                   i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2,
388                   j + 1 - WARPEDPIXEL_FILTER_TAPS / 2);
389     arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
390                             y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
391   }
392   val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
393                        x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
394   val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
395   return (uint8_t)clip_pixel(val);
396 }
397 
bi_cubic_filter(const uint8_t * const ref,int x,int y,int stride)398 static uint8_t bi_cubic_filter(const uint8_t *const ref, int x, int y,
399                                int stride) {
400   int32_t val, arr[4];
401   int k;
402   const int i = (int)x >> WARPEDPIXEL_PREC_BITS;
403   const int j = (int)y >> WARPEDPIXEL_PREC_BITS;
404   for (k = 0; k < 4; ++k) {
405     int32_t arr_temp[4];
406     get_subcolumn(4, ref, arr_temp, stride, i + k - 1, j - 1);
407     arr[k] =
408         do_cubic_filter(arr_temp + 1, y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
409   }
410   val = do_cubic_filter(arr + 1, x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
411   val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
412   return (uint8_t)clip_pixel(val);
413 }
414 
bi_linear_filter(const uint8_t * const ref,int x,int y,int stride)415 static uint8_t bi_linear_filter(const uint8_t *const ref, int x, int y,
416                                 int stride) {
417   const int ix = x >> WARPEDPIXEL_PREC_BITS;
418   const int iy = y >> WARPEDPIXEL_PREC_BITS;
419   const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
420   const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
421   int32_t val;
422   val = ROUND_POWER_OF_TWO_SIGNED(
423       ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) *
424               (WARPEDPIXEL_PREC_SHIFTS - sx) +
425           ref[iy * stride + ix + 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) * sx +
426           ref[(iy + 1) * stride + ix] * sy * (WARPEDPIXEL_PREC_SHIFTS - sx) +
427           ref[(iy + 1) * stride + ix + 1] * sy * sx,
428       WARPEDPIXEL_PREC_BITS * 2);
429   return (uint8_t)clip_pixel(val);
430 }
431 
// Interpolate one sample of `ref` at sub-pel position (x, y), given in
// 1/(1 << WARPEDPIXEL_PREC_BITS)-pel units, for a width x height plane.
// Positions outside the frame are clamped: corners return the corner pixel,
// edges interpolate along the edge only. Interior positions use the highest-
// quality filter whose support fits inside the frame (6-tap, then bicubic,
// then bilinear as fallback near the borders).
// NOTE: the order of the boundary tests below is significant — each branch
// assumes all earlier cases have been excluded.
static uint8_t warp_interpolate(const uint8_t *const ref, int x, int y,
                                int width, int height, int stride) {
  const int ix = x >> WARPEDPIXEL_PREC_BITS;  // integer pixel position
  const int iy = y >> WARPEDPIXEL_PREC_BITS;
  const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));  // fractional part
  const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
  int32_t v;

  // Corner cases: clamp to the nearest corner pixel.
  if (ix < 0 && iy < 0)
    return ref[0];
  else if (ix < 0 && iy >= height - 1)
    return ref[(height - 1) * stride];
  else if (ix >= width - 1 && iy < 0)
    return ref[width - 1];
  else if (ix >= width - 1 && iy >= height - 1)
    return ref[(height - 1) * stride + (width - 1)];
  else if (ix < 0) {
    // Off the left edge: interpolate vertically along column 0.
    v = ROUND_POWER_OF_TWO_SIGNED(
        ref[iy * stride] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
            ref[(iy + 1) * stride] * sy,
        WARPEDPIXEL_PREC_BITS);
    return clip_pixel(v);
  } else if (iy < 0) {
    // Off the top edge: interpolate horizontally along row 0.
    v = ROUND_POWER_OF_TWO_SIGNED(
        ref[ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) + ref[ix + 1] * sx,
        WARPEDPIXEL_PREC_BITS);
    return clip_pixel(v);
  } else if (ix >= width - 1) {
    // Off the right edge: interpolate vertically along the last column.
    v = ROUND_POWER_OF_TWO_SIGNED(
        ref[iy * stride + width - 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
            ref[(iy + 1) * stride + width - 1] * sy,
        WARPEDPIXEL_PREC_BITS);
    return clip_pixel(v);
  } else if (iy >= height - 1) {
    // Off the bottom edge: interpolate horizontally along the last row.
    v = ROUND_POWER_OF_TWO_SIGNED(
        ref[(height - 1) * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) +
            ref[(height - 1) * stride + ix + 1] * sx,
        WARPEDPIXEL_PREC_BITS);
    return clip_pixel(v);
  } else if (ix >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
             iy >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
             ix < width - WARPEDPIXEL_FILTER_TAPS / 2 &&
             iy < height - WARPEDPIXEL_FILTER_TAPS / 2) {
    // Full 6-tap support available on all sides.
    return bi_ntap_filter(ref, x, y, stride);
  } else if (ix >= 1 && iy >= 1 && ix < width - 2 && iy < height - 2) {
    // 4x4 bicubic support available.
    return bi_cubic_filter(ref, x, y, stride);
  } else {
    // Near the border: fall back to 2x2 bilinear.
    return bi_linear_filter(ref, x, y, stride);
  }
}
482 
483 // For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
484 // at a time. The zoom/rotation/shear in the model are applied to the
485 // "fractional" position of each pixel, which therefore varies within
486 // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
487 // We need an extra 2 taps to fit this in, for a total of 8 taps.
488 /* clang-format off */
489 const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
490 #if WARPEDPIXEL_PREC_BITS == 6
491   // [-1, 0)
492   { 0,   0, 127,   1,   0, 0, 0, 0 }, { 0, - 1, 127,   2,   0, 0, 0, 0 },
493   { 1, - 3, 127,   4, - 1, 0, 0, 0 }, { 1, - 4, 126,   6, - 2, 1, 0, 0 },
494   { 1, - 5, 126,   8, - 3, 1, 0, 0 }, { 1, - 6, 125,  11, - 4, 1, 0, 0 },
495   { 1, - 7, 124,  13, - 4, 1, 0, 0 }, { 2, - 8, 123,  15, - 5, 1, 0, 0 },
496   { 2, - 9, 122,  18, - 6, 1, 0, 0 }, { 2, -10, 121,  20, - 6, 1, 0, 0 },
497   { 2, -11, 120,  22, - 7, 2, 0, 0 }, { 2, -12, 119,  25, - 8, 2, 0, 0 },
498   { 3, -13, 117,  27, - 8, 2, 0, 0 }, { 3, -13, 116,  29, - 9, 2, 0, 0 },
499   { 3, -14, 114,  32, -10, 3, 0, 0 }, { 3, -15, 113,  35, -10, 2, 0, 0 },
500   { 3, -15, 111,  37, -11, 3, 0, 0 }, { 3, -16, 109,  40, -11, 3, 0, 0 },
501   { 3, -16, 108,  42, -12, 3, 0, 0 }, { 4, -17, 106,  45, -13, 3, 0, 0 },
502   { 4, -17, 104,  47, -13, 3, 0, 0 }, { 4, -17, 102,  50, -14, 3, 0, 0 },
503   { 4, -17, 100,  52, -14, 3, 0, 0 }, { 4, -18,  98,  55, -15, 4, 0, 0 },
504   { 4, -18,  96,  58, -15, 3, 0, 0 }, { 4, -18,  94,  60, -16, 4, 0, 0 },
505   { 4, -18,  91,  63, -16, 4, 0, 0 }, { 4, -18,  89,  65, -16, 4, 0, 0 },
506   { 4, -18,  87,  68, -17, 4, 0, 0 }, { 4, -18,  85,  70, -17, 4, 0, 0 },
507   { 4, -18,  82,  73, -17, 4, 0, 0 }, { 4, -18,  80,  75, -17, 4, 0, 0 },
508   { 4, -18,  78,  78, -18, 4, 0, 0 }, { 4, -17,  75,  80, -18, 4, 0, 0 },
509   { 4, -17,  73,  82, -18, 4, 0, 0 }, { 4, -17,  70,  85, -18, 4, 0, 0 },
510   { 4, -17,  68,  87, -18, 4, 0, 0 }, { 4, -16,  65,  89, -18, 4, 0, 0 },
511   { 4, -16,  63,  91, -18, 4, 0, 0 }, { 4, -16,  60,  94, -18, 4, 0, 0 },
512   { 3, -15,  58,  96, -18, 4, 0, 0 }, { 4, -15,  55,  98, -18, 4, 0, 0 },
513   { 3, -14,  52, 100, -17, 4, 0, 0 }, { 3, -14,  50, 102, -17, 4, 0, 0 },
514   { 3, -13,  47, 104, -17, 4, 0, 0 }, { 3, -13,  45, 106, -17, 4, 0, 0 },
515   { 3, -12,  42, 108, -16, 3, 0, 0 }, { 3, -11,  40, 109, -16, 3, 0, 0 },
516   { 3, -11,  37, 111, -15, 3, 0, 0 }, { 2, -10,  35, 113, -15, 3, 0, 0 },
517   { 3, -10,  32, 114, -14, 3, 0, 0 }, { 2, - 9,  29, 116, -13, 3, 0, 0 },
518   { 2, - 8,  27, 117, -13, 3, 0, 0 }, { 2, - 8,  25, 119, -12, 2, 0, 0 },
519   { 2, - 7,  22, 120, -11, 2, 0, 0 }, { 1, - 6,  20, 121, -10, 2, 0, 0 },
520   { 1, - 6,  18, 122, - 9, 2, 0, 0 }, { 1, - 5,  15, 123, - 8, 2, 0, 0 },
521   { 1, - 4,  13, 124, - 7, 1, 0, 0 }, { 1, - 4,  11, 125, - 6, 1, 0, 0 },
522   { 1, - 3,   8, 126, - 5, 1, 0, 0 }, { 1, - 2,   6, 126, - 4, 1, 0, 0 },
523   { 0, - 1,   4, 127, - 3, 1, 0, 0 }, { 0,   0,   2, 127, - 1, 0, 0, 0 },
524 
525   // [0, 1)
526   { 0,  0,   0, 127,   1,   0,  0,  0}, { 0,  0,  -1, 127,   2,   0,  0,  0},
527   { 0,  1,  -3, 127,   4,  -2,  1,  0}, { 0,  1,  -5, 127,   6,  -2,  1,  0},
528   { 0,  2,  -6, 126,   8,  -3,  1,  0}, {-1,  2,  -7, 126,  11,  -4,  2, -1},
529   {-1,  3,  -8, 125,  13,  -5,  2, -1}, {-1,  3, -10, 124,  16,  -6,  3, -1},
530   {-1,  4, -11, 123,  18,  -7,  3, -1}, {-1,  4, -12, 122,  20,  -7,  3, -1},
531   {-1,  4, -13, 121,  23,  -8,  3, -1}, {-2,  5, -14, 120,  25,  -9,  4, -1},
532   {-1,  5, -15, 119,  27, -10,  4, -1}, {-1,  5, -16, 118,  30, -11,  4, -1},
533   {-2,  6, -17, 116,  33, -12,  5, -1}, {-2,  6, -17, 114,  35, -12,  5, -1},
534   {-2,  6, -18, 113,  38, -13,  5, -1}, {-2,  7, -19, 111,  41, -14,  6, -2},
535   {-2,  7, -19, 110,  43, -15,  6, -2}, {-2,  7, -20, 108,  46, -15,  6, -2},
536   {-2,  7, -20, 106,  49, -16,  6, -2}, {-2,  7, -21, 104,  51, -16,  7, -2},
537   {-2,  7, -21, 102,  54, -17,  7, -2}, {-2,  8, -21, 100,  56, -18,  7, -2},
538   {-2,  8, -22,  98,  59, -18,  7, -2}, {-2,  8, -22,  96,  62, -19,  7, -2},
539   {-2,  8, -22,  94,  64, -19,  7, -2}, {-2,  8, -22,  91,  67, -20,  8, -2},
540   {-2,  8, -22,  89,  69, -20,  8, -2}, {-2,  8, -22,  87,  72, -21,  8, -2},
541   {-2,  8, -21,  84,  74, -21,  8, -2}, {-2,  8, -22,  82,  77, -21,  8, -2},
542   {-2,  8, -21,  79,  79, -21,  8, -2}, {-2,  8, -21,  77,  82, -22,  8, -2},
543   {-2,  8, -21,  74,  84, -21,  8, -2}, {-2,  8, -21,  72,  87, -22,  8, -2},
544   {-2,  8, -20,  69,  89, -22,  8, -2}, {-2,  8, -20,  67,  91, -22,  8, -2},
545   {-2,  7, -19,  64,  94, -22,  8, -2}, {-2,  7, -19,  62,  96, -22,  8, -2},
546   {-2,  7, -18,  59,  98, -22,  8, -2}, {-2,  7, -18,  56, 100, -21,  8, -2},
547   {-2,  7, -17,  54, 102, -21,  7, -2}, {-2,  7, -16,  51, 104, -21,  7, -2},
548   {-2,  6, -16,  49, 106, -20,  7, -2}, {-2,  6, -15,  46, 108, -20,  7, -2},
549   {-2,  6, -15,  43, 110, -19,  7, -2}, {-2,  6, -14,  41, 111, -19,  7, -2},
550   {-1,  5, -13,  38, 113, -18,  6, -2}, {-1,  5, -12,  35, 114, -17,  6, -2},
551   {-1,  5, -12,  33, 116, -17,  6, -2}, {-1,  4, -11,  30, 118, -16,  5, -1},
552   {-1,  4, -10,  27, 119, -15,  5, -1}, {-1,  4,  -9,  25, 120, -14,  5, -2},
553   {-1,  3,  -8,  23, 121, -13,  4, -1}, {-1,  3,  -7,  20, 122, -12,  4, -1},
554   {-1,  3,  -7,  18, 123, -11,  4, -1}, {-1,  3,  -6,  16, 124, -10,  3, -1},
555   {-1,  2,  -5,  13, 125,  -8,  3, -1}, {-1,  2,  -4,  11, 126,  -7,  2, -1},
556   { 0,  1,  -3,   8, 126,  -6,  2,  0}, { 0,  1,  -2,   6, 127,  -5,  1,  0},
557   { 0,  1,  -2,   4, 127,  -3,  1,  0}, { 0,  0,   0,   2, 127,  -1,  0,  0},
558 
559   // [1, 2)
560   { 0, 0, 0,   1, 127,   0,   0, 0 }, { 0, 0, 0, - 1, 127,   2,   0, 0 },
561   { 0, 0, 1, - 3, 127,   4, - 1, 0 }, { 0, 0, 1, - 4, 126,   6, - 2, 1 },
562   { 0, 0, 1, - 5, 126,   8, - 3, 1 }, { 0, 0, 1, - 6, 125,  11, - 4, 1 },
563   { 0, 0, 1, - 7, 124,  13, - 4, 1 }, { 0, 0, 2, - 8, 123,  15, - 5, 1 },
564   { 0, 0, 2, - 9, 122,  18, - 6, 1 }, { 0, 0, 2, -10, 121,  20, - 6, 1 },
565   { 0, 0, 2, -11, 120,  22, - 7, 2 }, { 0, 0, 2, -12, 119,  25, - 8, 2 },
566   { 0, 0, 3, -13, 117,  27, - 8, 2 }, { 0, 0, 3, -13, 116,  29, - 9, 2 },
567   { 0, 0, 3, -14, 114,  32, -10, 3 }, { 0, 0, 3, -15, 113,  35, -10, 2 },
568   { 0, 0, 3, -15, 111,  37, -11, 3 }, { 0, 0, 3, -16, 109,  40, -11, 3 },
569   { 0, 0, 3, -16, 108,  42, -12, 3 }, { 0, 0, 4, -17, 106,  45, -13, 3 },
570   { 0, 0, 4, -17, 104,  47, -13, 3 }, { 0, 0, 4, -17, 102,  50, -14, 3 },
571   { 0, 0, 4, -17, 100,  52, -14, 3 }, { 0, 0, 4, -18,  98,  55, -15, 4 },
572   { 0, 0, 4, -18,  96,  58, -15, 3 }, { 0, 0, 4, -18,  94,  60, -16, 4 },
573   { 0, 0, 4, -18,  91,  63, -16, 4 }, { 0, 0, 4, -18,  89,  65, -16, 4 },
574   { 0, 0, 4, -18,  87,  68, -17, 4 }, { 0, 0, 4, -18,  85,  70, -17, 4 },
575   { 0, 0, 4, -18,  82,  73, -17, 4 }, { 0, 0, 4, -18,  80,  75, -17, 4 },
576   { 0, 0, 4, -18,  78,  78, -18, 4 }, { 0, 0, 4, -17,  75,  80, -18, 4 },
577   { 0, 0, 4, -17,  73,  82, -18, 4 }, { 0, 0, 4, -17,  70,  85, -18, 4 },
578   { 0, 0, 4, -17,  68,  87, -18, 4 }, { 0, 0, 4, -16,  65,  89, -18, 4 },
579   { 0, 0, 4, -16,  63,  91, -18, 4 }, { 0, 0, 4, -16,  60,  94, -18, 4 },
580   { 0, 0, 3, -15,  58,  96, -18, 4 }, { 0, 0, 4, -15,  55,  98, -18, 4 },
581   { 0, 0, 3, -14,  52, 100, -17, 4 }, { 0, 0, 3, -14,  50, 102, -17, 4 },
582   { 0, 0, 3, -13,  47, 104, -17, 4 }, { 0, 0, 3, -13,  45, 106, -17, 4 },
583   { 0, 0, 3, -12,  42, 108, -16, 3 }, { 0, 0, 3, -11,  40, 109, -16, 3 },
584   { 0, 0, 3, -11,  37, 111, -15, 3 }, { 0, 0, 2, -10,  35, 113, -15, 3 },
585   { 0, 0, 3, -10,  32, 114, -14, 3 }, { 0, 0, 2, - 9,  29, 116, -13, 3 },
586   { 0, 0, 2, - 8,  27, 117, -13, 3 }, { 0, 0, 2, - 8,  25, 119, -12, 2 },
587   { 0, 0, 2, - 7,  22, 120, -11, 2 }, { 0, 0, 1, - 6,  20, 121, -10, 2 },
588   { 0, 0, 1, - 6,  18, 122, - 9, 2 }, { 0, 0, 1, - 5,  15, 123, - 8, 2 },
589   { 0, 0, 1, - 4,  13, 124, - 7, 1 }, { 0, 0, 1, - 4,  11, 125, - 6, 1 },
590   { 0, 0, 1, - 3,   8, 126, - 5, 1 }, { 0, 0, 1, - 2,   6, 126, - 4, 1 },
591   { 0, 0, 0, - 1,   4, 127, - 3, 1 }, { 0, 0, 0,   0,   2, 127, - 1, 0 },
592   // dummy (replicate row index 191)
593   { 0, 0, 0,   0,   2, 127, - 1, 0 },
594 
595 #elif WARPEDPIXEL_PREC_BITS == 5
596   // [-1, 0)
597   {0,   0, 127,   1,   0, 0, 0, 0}, {1,  -3, 127,   4,  -1, 0, 0, 0},
598   {1,  -5, 126,   8,  -3, 1, 0, 0}, {1,  -7, 124,  13,  -4, 1, 0, 0},
599   {2,  -9, 122,  18,  -6, 1, 0, 0}, {2, -11, 120,  22,  -7, 2, 0, 0},
600   {3, -13, 117,  27,  -8, 2, 0, 0}, {3, -14, 114,  32, -10, 3, 0, 0},
601   {3, -15, 111,  37, -11, 3, 0, 0}, {3, -16, 108,  42, -12, 3, 0, 0},
602   {4, -17, 104,  47, -13, 3, 0, 0}, {4, -17, 100,  52, -14, 3, 0, 0},
603   {4, -18,  96,  58, -15, 3, 0, 0}, {4, -18,  91,  63, -16, 4, 0, 0},
604   {4, -18,  87,  68, -17, 4, 0, 0}, {4, -18,  82,  73, -17, 4, 0, 0},
605   {4, -18,  78,  78, -18, 4, 0, 0}, {4, -17,  73,  82, -18, 4, 0, 0},
606   {4, -17,  68,  87, -18, 4, 0, 0}, {4, -16,  63,  91, -18, 4, 0, 0},
607   {3, -15,  58,  96, -18, 4, 0, 0}, {3, -14,  52, 100, -17, 4, 0, 0},
608   {3, -13,  47, 104, -17, 4, 0, 0}, {3, -12,  42, 108, -16, 3, 0, 0},
609   {3, -11,  37, 111, -15, 3, 0, 0}, {3, -10,  32, 114, -14, 3, 0, 0},
610   {2,  -8,  27, 117, -13, 3, 0, 0}, {2,  -7,  22, 120, -11, 2, 0, 0},
611   {1,  -6,  18, 122,  -9, 2, 0, 0}, {1,  -4,  13, 124,  -7, 1, 0, 0},
612   {1,  -3,   8, 126,  -5, 1, 0, 0}, {0,  -1,   4, 127,  -3, 1, 0, 0},
613   // [0, 1)
614   { 0,  0,   0, 127,   1,   0,   0,  0}, { 0,  1,  -3, 127,   4,  -2,   1,  0},
615   { 0,  2,  -6, 126,   8,  -3,   1,  0}, {-1,  3,  -8, 125,  13,  -5,   2, -1},
616   {-1,  4, -11, 123,  18,  -7,   3, -1}, {-1,  4, -13, 121,  23,  -8,   3, -1},
617   {-1,  5, -15, 119,  27, -10,   4, -1}, {-2,  6, -17, 116,  33, -12,   5, -1},
618   {-2,  6, -18, 113,  38, -13,   5, -1}, {-2,  7, -19, 110,  43, -15,   6, -2},
619   {-2,  7, -20, 106,  49, -16,   6, -2}, {-2,  7, -21, 102,  54, -17,   7, -2},
620   {-2,  8, -22,  98,  59, -18,   7, -2}, {-2,  8, -22,  94,  64, -19,   7, -2},
621   {-2,  8, -22,  89,  69, -20,   8, -2}, {-2,  8, -21,  84,  74, -21,   8, -2},
622   {-2,  8, -21,  79,  79, -21,   8, -2}, {-2,  8, -21,  74,  84, -21,   8, -2},
623   {-2,  8, -20,  69,  89, -22,   8, -2}, {-2,  7, -19,  64,  94, -22,   8, -2},
624   {-2,  7, -18,  59,  98, -22,   8, -2}, {-2,  7, -17,  54, 102, -21,   7, -2},
625   {-2,  6, -16,  49, 106, -20,   7, -2}, {-2,  6, -15,  43, 110, -19,   7, -2},
626   {-1,  5, -13,  38, 113, -18,   6, -2}, {-1,  5, -12,  33, 116, -17,   6, -2},
627   {-1,  4, -10,  27, 119, -15,   5, -1}, {-1,  3,  -8,  23, 121, -13,   4, -1},
628   {-1,  3,  -7,  18, 123, -11,   4, -1}, {-1,  2,  -5,  13, 125,  -8,   3, -1},
629   { 0,  1,  -3,   8, 126,  -6,   2,  0}, { 0,  1,  -2,   4, 127,  -3,   1,  0},
630   // [1, 2)
631   {0, 0, 0,   1, 127,   0,   0, 0}, {0, 0, 1,  -3, 127,   4,  -1, 0},
632   {0, 0, 1,  -5, 126,   8,  -3, 1}, {0, 0, 1,  -7, 124,  13,  -4, 1},
633   {0, 0, 2,  -9, 122,  18,  -6, 1}, {0, 0, 2, -11, 120,  22,  -7, 2},
634   {0, 0, 3, -13, 117,  27,  -8, 2}, {0, 0, 3, -14, 114,  32, -10, 3},
635   {0, 0, 3, -15, 111,  37, -11, 3}, {0, 0, 3, -16, 108,  42, -12, 3},
636   {0, 0, 4, -17, 104,  47, -13, 3}, {0, 0, 4, -17, 100,  52, -14, 3},
637   {0, 0, 4, -18,  96,  58, -15, 3}, {0, 0, 4, -18,  91,  63, -16, 4},
638   {0, 0, 4, -18,  87,  68, -17, 4}, {0, 0, 4, -18,  82,  73, -17, 4},
639   {0, 0, 4, -18,  78,  78, -18, 4}, {0, 0, 4, -17,  73,  82, -18, 4},
640   {0, 0, 4, -17,  68,  87, -18, 4}, {0, 0, 4, -16,  63,  91, -18, 4},
641   {0, 0, 3, -15,  58,  96, -18, 4}, {0, 0, 3, -14,  52, 100, -17, 4},
642   {0, 0, 3, -13,  47, 104, -17, 4}, {0, 0, 3, -12,  42, 108, -16, 3},
643   {0, 0, 3, -11,  37, 111, -15, 3}, {0, 0, 3, -10,  32, 114, -14, 3},
644   {0, 0, 2,  -8,  27, 117, -13, 3}, {0, 0, 2,  -7,  22, 120, -11, 2},
645   {0, 0, 1,  -6,  18, 122,  -9, 2}, {0, 0, 1,  -4,  13, 124,  -7, 1},
646   {0, 0, 1,  -3,   8, 126,  -5, 1}, {0, 0, 0,  -1,   4, 127,  -3, 1},
647   // dummy (replicate row index 95)
648   {0, 0, 0,  -1,   4, 127,  -3, 1},
649 
650 #endif  // WARPEDPIXEL_PREC_BITS == 6
651 };
652 
653 /* clang-format on */
654 
655 #define DIV_LUT_PREC_BITS 14
656 #define DIV_LUT_BITS 8
657 #define DIV_LUT_NUM (1 << DIV_LUT_BITS)
658 
// Reciprocal lookup table for the divisor's top DIV_LUT_BITS mantissa bits:
// entry f holds round(2^DIV_LUT_PREC_BITS * 2^DIV_LUT_BITS /
// (2^DIV_LUT_BITS + f)), i.e. round(16384 * 256 / (256 + f)), so
// div_lut[0] == 16384 and div_lut[256] == 8192. The extra (+1) entry exists
// because rounding in resolve_divisor_{32,64} can produce f == DIV_LUT_NUM.
static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
  16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
  15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
  15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
  14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
  13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
  13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
  13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
  12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
  12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
  11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
  11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
  11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
  10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
  10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
  10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
  9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
  9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,
  9468,  9447,  9425,  9404,  9383,  9362,  9341,  9321,  9300,  9279,  9259,
  9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,  9039,
  9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,
  8812,  8793,  8775,  8756,  8738,  8720,  8702,  8684,  8666,  8648,  8630,
  8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,  8456,  8439,
  8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,
  8240,  8224,  8208,  8192,
};
685 
686 #if CONFIG_WARPED_MOTION
687 // Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
688 // at precision of DIV_LUT_PREC_BITS along with the shift.
resolve_divisor_64(uint64_t D,int16_t * shift)689 static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
690   int64_t e, f;
691   *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
692                                : get_msb((unsigned int)D));
693   // e is obtained from D after resetting the most significant 1 bit.
694   e = D - ((uint64_t)1 << *shift);
695   // Get the most significant DIV_LUT_BITS (8) bits of e into f
696   if (*shift > DIV_LUT_BITS)
697     f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
698   else
699     f = e << (DIV_LUT_BITS - *shift);
700   assert(f <= DIV_LUT_NUM);
701   *shift += DIV_LUT_PREC_BITS;
702   // Use f as lookup into the precomputed table of multipliers
703   return div_lut[f];
704 }
705 #endif  // CONFIG_WARPED_MOTION
706 
resolve_divisor_32(uint32_t D,int16_t * shift)707 static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
708   int32_t e, f;
709   *shift = get_msb(D);
710   // e is obtained from D after resetting the most significant 1 bit.
711   e = D - ((uint32_t)1 << *shift);
712   // Get the most significant DIV_LUT_BITS (8) bits of e into f
713   if (*shift > DIV_LUT_BITS)
714     f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
715   else
716     f = e << (DIV_LUT_BITS - *shift);
717   assert(f <= DIV_LUT_NUM);
718   *shift += DIV_LUT_PREC_BITS;
719   // Use f as lookup into the precomputed table of multipliers
720   return div_lut[f];
721 }
722 
is_affine_valid(const WarpedMotionParams * const wm)723 static int is_affine_valid(const WarpedMotionParams *const wm) {
724   const int32_t *mat = wm->wmmat;
725   return (mat[2] > 0);
726 }
727 
is_affine_shear_allowed(int16_t alpha,int16_t beta,int16_t gamma,int16_t delta)728 static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
729                                    int16_t delta) {
730   if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
731       (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
732     return 0;
733   else
734     return 1;
735 }
736 
737 // Returns 1 on success or 0 on an invalid affine set
// Derives the shear parameters (alpha, beta, gamma, delta) of the warp
// matrix factorization described above av1_warp_affine_c, storing them in
// wm. Returns 1 on success or 0 on an invalid affine set.
int get_shear_params(WarpedMotionParams *wm) {
  const int32_t *mat = wm->wmmat;
  if (!is_affine_valid(wm)) return 0;
  // Horizontal shear: alpha = a - 1, beta = b (WARPEDMODEL_PREC_BITS
  // fixed point, where a..d are wmmat[2..5]).
  wm->alpha =
      clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
  wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
  int16_t shift;
  // y ~= 2^shift / mat[2] at DIV_LUT_PREC_BITS precision, sign restored.
  // abs() is safe here: is_affine_valid guarantees mat[2] > 0.
  int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
  int64_t v;
  // Vertical shear: gamma = c / a.
  v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
  wm->gamma =
      clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
  // delta = d - b*c/a - 1.
  v = ((int64_t)mat[3] * mat[4]) * y;
  wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
                        (1 << WARPEDMODEL_PREC_BITS),
                    INT16_MIN, INT16_MAX);
  if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
    return 0;

  // Zero the low WARP_PARAM_REDUCE_BITS of each parameter with round-to-
  // nearest — presumably to reduce the number of distinct filter phases an
  // implementation must handle (NOTE(review): confirm intent upstream).
  wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
             (1 << WARP_PARAM_REDUCE_BITS);
  wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  return 1;
}
767 
768 #if CONFIG_HIGHBITDEPTH
highbd_get_subcolumn(int taps,const uint16_t * const ref,int32_t * col,int stride,int x,int y_start)769 static INLINE void highbd_get_subcolumn(int taps, const uint16_t *const ref,
770                                         int32_t *col, int stride, int x,
771                                         int y_start) {
772   int i;
773   for (i = 0; i < taps; ++i) {
774     col[i] = ref[(i + y_start) * stride + x];
775   }
776 }
777 
highbd_bi_ntap_filter(const uint16_t * const ref,int x,int y,int stride,int bd)778 static uint16_t highbd_bi_ntap_filter(const uint16_t *const ref, int x, int y,
779                                       int stride, int bd) {
780   int32_t val, arr[WARPEDPIXEL_FILTER_TAPS];
781   int k;
782   const int i = (int)x >> WARPEDPIXEL_PREC_BITS;
783   const int j = (int)y >> WARPEDPIXEL_PREC_BITS;
784   for (k = 0; k < WARPEDPIXEL_FILTER_TAPS; ++k) {
785     int32_t arr_temp[WARPEDPIXEL_FILTER_TAPS];
786     highbd_get_subcolumn(WARPEDPIXEL_FILTER_TAPS, ref, arr_temp, stride,
787                          i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2,
788                          j + 1 - WARPEDPIXEL_FILTER_TAPS / 2);
789     arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
790                             y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
791   }
792   val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
793                        x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
794   val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
795   return (uint16_t)clip_pixel_highbd(val, bd);
796 }
797 
highbd_bi_cubic_filter(const uint16_t * const ref,int x,int y,int stride,int bd)798 static uint16_t highbd_bi_cubic_filter(const uint16_t *const ref, int x, int y,
799                                        int stride, int bd) {
800   int32_t val, arr[4];
801   int k;
802   const int i = (int)x >> WARPEDPIXEL_PREC_BITS;
803   const int j = (int)y >> WARPEDPIXEL_PREC_BITS;
804   for (k = 0; k < 4; ++k) {
805     int32_t arr_temp[4];
806     highbd_get_subcolumn(4, ref, arr_temp, stride, i + k - 1, j - 1);
807     arr[k] =
808         do_cubic_filter(arr_temp + 1, y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
809   }
810   val = do_cubic_filter(arr + 1, x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
811   val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
812   return (uint16_t)clip_pixel_highbd(val, bd);
813 }
814 
highbd_bi_linear_filter(const uint16_t * const ref,int x,int y,int stride,int bd)815 static uint16_t highbd_bi_linear_filter(const uint16_t *const ref, int x, int y,
816                                         int stride, int bd) {
817   const int ix = x >> WARPEDPIXEL_PREC_BITS;
818   const int iy = y >> WARPEDPIXEL_PREC_BITS;
819   const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
820   const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
821   int32_t val;
822   val = ROUND_POWER_OF_TWO_SIGNED(
823       ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) *
824               (WARPEDPIXEL_PREC_SHIFTS - sx) +
825           ref[iy * stride + ix + 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) * sx +
826           ref[(iy + 1) * stride + ix] * sy * (WARPEDPIXEL_PREC_SHIFTS - sx) +
827           ref[(iy + 1) * stride + ix + 1] * sy * sx,
828       WARPEDPIXEL_PREC_BITS * 2);
829   return (uint16_t)clip_pixel_highbd(val, bd);
830 }
831 
// Interpolates the reference at sub-pixel position (x, y), given in
// WARPEDPIXEL_PREC_BITS fixed point. Positions outside the frame replicate
// the nearest edge/corner sample; interior positions use the n-tap filter
// when its full support fits in the frame, else bicubic, else bilinear.
static uint16_t highbd_warp_interpolate(const uint16_t *const ref, int x, int y,
                                        int width, int height, int stride,
                                        int bd) {
  const int ix = x >> WARPEDPIXEL_PREC_BITS;
  const int iy = y >> WARPEDPIXEL_PREC_BITS;
  // Fractional offsets within the pixel, in [0, 1 << WARPEDPIXEL_PREC_BITS).
  const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
  const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
  int32_t v;

  // Outside both axes: replicate the corresponding corner pixel.
  if (ix < 0 && iy < 0)
    return ref[0];
  else if (ix < 0 && iy > height - 1)
    return ref[(height - 1) * stride];
  else if (ix > width - 1 && iy < 0)
    return ref[width - 1];
  else if (ix > width - 1 && iy > height - 1)
    return ref[(height - 1) * stride + (width - 1)];
  // Outside one axis only: linear interpolation along the frame edge.
  else if (ix < 0) {
    v = ROUND_POWER_OF_TWO_SIGNED(
        ref[iy * stride] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
            ref[(iy + 1) * stride] * sy,
        WARPEDPIXEL_PREC_BITS);
    return clip_pixel_highbd(v, bd);
  } else if (iy < 0) {
    v = ROUND_POWER_OF_TWO_SIGNED(
        ref[ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) + ref[ix + 1] * sx,
        WARPEDPIXEL_PREC_BITS);
    return clip_pixel_highbd(v, bd);
  } else if (ix > width - 1) {
    v = ROUND_POWER_OF_TWO_SIGNED(
        ref[iy * stride + width - 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
            ref[(iy + 1) * stride + width - 1] * sy,
        WARPEDPIXEL_PREC_BITS);
    return clip_pixel_highbd(v, bd);
  } else if (iy > height - 1) {
    v = ROUND_POWER_OF_TWO_SIGNED(
        ref[(height - 1) * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) +
            ref[(height - 1) * stride + ix + 1] * sx,
        WARPEDPIXEL_PREC_BITS);
    return clip_pixel_highbd(v, bd);
  // Interior: pick the widest filter whose taps stay inside the frame.
  } else if (ix >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
             iy >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
             ix < width - WARPEDPIXEL_FILTER_TAPS / 2 &&
             iy < height - WARPEDPIXEL_FILTER_TAPS / 2) {
    return highbd_bi_ntap_filter(ref, x, y, stride, bd);
  } else if (ix >= 1 && iy >= 1 && ix < width - 2 && iy < height - 2) {
    return highbd_bi_cubic_filter(ref, x, y, stride, bd);
  } else {
    return highbd_bi_linear_filter(ref, x, y, stride, bd);
  }
}
883 
highbd_error_measure(int err,int bd)884 static INLINE int highbd_error_measure(int err, int bd) {
885   const int b = bd - 8;
886   const int bmask = (1 << b) - 1;
887   const int v = (1 << b);
888   int e1, e2;
889   err = abs(err);
890   e1 = err >> b;
891   e2 = err & bmask;
892   return error_measure_lut[255 + e1] * (v - e2) +
893          error_measure_lut[256 + e1] * e2;
894 }
895 
// Reference (slow-path) high-bitdepth warp: projects every destination
// pixel back into the reference frame and interpolates it individually.
// Used when the model type or scale factors rule out the fast affine path.
static void highbd_warp_plane_old(const WarpedMotionParams *const wm,
                                  const uint8_t *const ref8, int width,
                                  int height, int stride,
                                  const uint8_t *const pred8, int p_col,
                                  int p_row, int p_width, int p_height,
                                  int p_stride, int subsampling_x,
                                  int subsampling_y, int x_scale, int y_scale,
                                  int bd, ConvolveParams *conv_params) {
  ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype);
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
  if (projectpoints == NULL) return;
  for (int i = p_row; i < p_row + p_height; ++i) {
    for (int j = p_col; j < p_col + p_width; ++j) {
      int in[2] = { j, i };
      int out[2];
      projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
      // Apply the reference frame scale factors.
      out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, SCALE_SUBPEL_BITS);
      out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, SCALE_SUBPEL_BITS);
      uint16_t *const dst = &pred[(j - p_col) + (i - p_row) * p_stride];
      const uint16_t warped = highbd_warp_interpolate(ref, out[0], out[1],
                                                      width, height, stride,
                                                      bd);
      // Either average into the existing prediction or overwrite it.
      if (conv_params->do_average)
        *dst = ROUND_POWER_OF_TWO(*dst + warped, 1);
      else
        *dst = warped;
    }
  }
}
929 
930 /* Note: For an explanation of the warp algorithm, and some notes on bit widths
931     for hardware implementations, see the comments above av1_warp_affine_c
932 */
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
                              int width, int height, int stride, uint16_t *pred,
                              int p_col, int p_row, int p_width, int p_height,
                              int p_stride, int subsampling_x,
                              int subsampling_y, int bd,
                              ConvolveParams *conv_params, int16_t alpha,
                              int16_t beta, int16_t gamma, int16_t delta) {
  // Intermediate buffer: 15 rows of horizontally-filtered samples (8 output
  // rows plus 7 extra rows of vertical-filter support), 8 columns each.
  int32_t tmp[15 * 8];
  int i, j, k, l, m;
#if CONFIG_CONVOLVE_ROUND
  // In no-round mode the rounding amounts and offsets come from
  // conv_params; otherwise use the warp filter's fixed precision reduction.
  const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
  const int reduce_bits_horiz =
      use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
  const int max_bits_horiz =
      use_conv_params
          ? bd + FILTER_BITS + 1 - conv_params->round_0
          : bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
  const int offset_bits_horiz =
      use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
  const int offset_bits_vert =
      use_conv_params
          ? bd + 2 * FILTER_BITS - conv_params->round_0
          : bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
  if (use_conv_params) {
    conv_params->do_post_rounding = 1;
  }
  assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
#else
  const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
  const int max_bits_horiz =
      bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
  const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
  const int offset_bits_vert =
      bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
#endif
  // max_bits_horiz is only consumed by the assert below.
  (void)max_bits_horiz;

  for (i = p_row; i < p_row + p_height; i += 8) {
    for (j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      // Split into integer position and WARPEDMODEL_PREC_BITS fraction.
      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Move the fractional offsets from the block center to its top-left
      // sample (the shears accumulate per row/column from there).
      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      // Keep only the precision retained by get_shear_params' rounding.
      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (k = -7; k < 8; ++k) {
        // Clamp the source row to the frame (edge replication).
        int iy = iy4 + k;
        if (iy < 0)
          iy = 0;
        else if (iy > height - 1)
          iy = height - 1;

        int sx = sx4 + beta * (k + 4);
        for (l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          // Select the filter phase for this pixel's fractional offset.
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = warped_filter[offs];

          // Initial offset keeps the accumulator non-negative (see the
          // hardware notes above av1_warp_affine_c).
          int32_t sum = 1 << offset_bits_horiz;
          for (m = 0; m < 8; ++m) {
            // Clamp the source column to the frame (edge replication).
            int sample_x = ix + m;
            if (sample_x < 0)
              sample_x = 0;
            else if (sample_x > width - 1)
              sample_x = width - 1;
            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter (cropped to the destination size if needed)
      for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }
#if CONFIG_CONVOLVE_ROUND
          if (use_conv_params) {
            // No-round path: write into the high-precision convolve buffer,
            // removing the offsets that were added to keep sums unsigned.
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
                  (1 << (offset_bits_horiz + FILTER_BITS -
                         conv_params->round_0 - conv_params->round_1)) -
                  (1 << (offset_bits_vert - conv_params->round_1));
            if (conv_params->do_average)
              *p += sum;
            else
              *p = sum;
          } else {
#else
          {
#endif
            // Standard path: reduce to pixel precision, remove the offsets,
            // clip to bd bits and store (optionally averaging).
            uint16_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            uint16_t px =
                clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
            if (conv_params->do_average)
              *p = ROUND_POWER_OF_TWO(*p + px, 1);
            else
              *p = px;
          }
          sy += gamma;
        }
      }
    }
  }
}
1074 
1075 static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8,
1076                               int width, int height, int stride,
1077                               const uint8_t *const pred8, int p_col, int p_row,
1078                               int p_width, int p_height, int p_stride,
1079                               int subsampling_x, int subsampling_y, int x_scale,
1080                               int y_scale, int bd,
1081                               ConvolveParams *conv_params) {
1082   if (wm->wmtype == ROTZOOM) {
1083     wm->wmmat[5] = wm->wmmat[2];
1084     wm->wmmat[4] = -wm->wmmat[3];
1085   }
1086   if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) &&
1087       x_scale == SCALE_SUBPEL_SHIFTS && y_scale == SCALE_SUBPEL_SHIFTS) {
1088     const int32_t *const mat = wm->wmmat;
1089     const int16_t alpha = wm->alpha;
1090     const int16_t beta = wm->beta;
1091     const int16_t gamma = wm->gamma;
1092     const int16_t delta = wm->delta;
1093 
1094     const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
1095     uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
1096     av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
1097                            p_width, p_height, p_stride, subsampling_x,
1098                            subsampling_y, bd, conv_params, alpha, beta, gamma,
1099                            delta);
1100   } else {
1101     highbd_warp_plane_old(wm, ref8, width, height, stride, pred8, p_col, p_row,
1102                           p_width, p_height, p_stride, subsampling_x,
1103                           subsampling_y, x_scale, y_scale, bd, conv_params);
1104   }
1105 }
1106 
// Accumulates the LUT-weighted error between a p_width x p_height region of
// dst and the corresponding region of ref.
static int64_t highbd_frame_error(const uint16_t *const ref, int stride,
                                  const uint16_t *const dst, int p_width,
                                  int p_height, int p_stride, int bd) {
  int64_t sum_error = 0;
  for (int row = 0; row < p_height; ++row) {
    const uint16_t *const ref_row = &ref[row * stride];
    const uint16_t *const dst_row = &dst[row * p_stride];
    for (int col = 0; col < p_width; ++col) {
      sum_error += highbd_error_measure(dst_row[col] - ref_row[col], bd);
    }
  }
  return sum_error;
}
1119 
1120 static int64_t highbd_warp_error(
1121     WarpedMotionParams *wm, const uint8_t *const ref8, int width, int height,
1122     int stride, const uint8_t *const dst8, int p_col, int p_row, int p_width,
1123     int p_height, int p_stride, int subsampling_x, int subsampling_y,
1124     int x_scale, int y_scale, int bd, int64_t best_error) {
1125   int64_t gm_sumerr = 0;
1126   int warp_w, warp_h;
1127   int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
1128   int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
1129   uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
1130 
1131   ConvolveParams conv_params = get_conv_params(0, 0, 0);
1132   for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
1133     for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
1134       // avoid warping extra 8x8 blocks in the padded region of the frame
1135       // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
1136       warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
1137       warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
1138       highbd_warp_plane(wm, ref8, width, height, stride,
1139                         CONVERT_TO_BYTEPTR(tmp), j, i, warp_w, warp_h,
1140                         WARP_ERROR_BLOCK, subsampling_x, subsampling_y, x_scale,
1141                         y_scale, bd, &conv_params);
1142 
1143       gm_sumerr += highbd_frame_error(
1144           tmp, WARP_ERROR_BLOCK, CONVERT_TO_SHORTPTR(dst8) + j + i * p_stride,
1145           warp_w, warp_h, p_stride, bd);
1146       if (gm_sumerr > best_error) return gm_sumerr;
1147     }
1148   }
1149   return gm_sumerr;
1150 }
1151 #endif  // CONFIG_HIGHBITDEPTH
1152 
1153 static INLINE int error_measure(int err) {
1154   return error_measure_lut[255 + err];
1155 }
1156 
1157 static void warp_plane_old(const WarpedMotionParams *const wm,
1158                            const uint8_t *const ref, int width, int height,
1159                            int stride, uint8_t *pred, int p_col, int p_row,
1160                            int p_width, int p_height, int p_stride,
1161                            int subsampling_x, int subsampling_y, int x_scale,
1162                            int y_scale, ConvolveParams *conv_params) {
1163   int i, j;
1164   ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype);
1165   if (projectpoints == NULL) return;
1166   for (i = p_row; i < p_row + p_height; ++i) {
1167     for (j = p_col; j < p_col + p_width; ++j) {
1168       int in[2], out[2];
1169       in[0] = j;
1170       in[1] = i;
1171       projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
1172       out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, SCALE_SUBPEL_BITS);
1173       out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, SCALE_SUBPEL_BITS);
1174       if (conv_params->do_average)
1175         pred[(j - p_col) + (i - p_row) * p_stride] = ROUND_POWER_OF_TWO(
1176             pred[(j - p_col) + (i - p_row) * p_stride] +
1177                 warp_interpolate(ref, out[0], out[1], width, height, stride),
1178             1);
1179       else
1180         pred[(j - p_col) + (i - p_row) * p_stride] =
1181             warp_interpolate(ref, out[0], out[1], width, height, stride);
1182     }
1183   }
1184 }
1185 
1186 /* The warp filter for ROTZOOM and AFFINE models works as follows:
1187    * Split the input into 8x8 blocks
1188    * For each block, project the point (4, 4) within the block, to get the
1189      overall block position. Split into integer and fractional coordinates,
1190      maintaining full WARPEDMODEL precision
1191    * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
1192      variable horizontal offset. This means that, while the rows of the
1193      intermediate buffer align with the rows of the *reference* image, the
1194      columns align with the columns of the *destination* image.
1195    * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
1196      destination is too small we crop the output at this stage). Each pixel has
1197      a variable vertical offset, so that the resulting rows are aligned with
1198      the rows of the destination image.
1199 
1200    To accomplish these alignments, we factor the warp matrix as a
1201    product of two shear / asymmetric zoom matrices:
1202    / a b \  = /   1       0    \ * / 1+alpha  beta \
1203    \ c d /    \ gamma  1+delta /   \    0      1   /
1204    where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
1205    The horizontal shear (with alpha and beta) is applied first,
1206    then the vertical shear (with gamma and delta) is applied second.
1207 
1208    The only limitation is that, to fit this in a fixed 8-tap filter size,
1209    the fractional pixel offsets must be at most +-1. Since the horizontal filter
1210    generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
1211    within the block, the parameters must satisfy
1212    4 * |alpha| + 7 * |beta| <= 1   and   4 * |gamma| + 4 * |delta| <= 1
1213    for this filter to be applicable.
1214 
   Note: This function assumes that the caller has done all of the relevant
   checks, i.e. that we have a ROTZOOM or AFFINE model, that wmmat[4] and
   wmmat[5] are set appropriately (if using a ROTZOOM model), and that alpha,
   beta, gamma, delta are all in range.
1219 
1220    TODO(david.barker): Maybe support scaled references?
1221 */
1222 /* A note on hardware implementation:
1223     The warp filter is intended to be implementable using the same hardware as
1224     the high-precision convolve filters from the loop-restoration and
1225     convolve-round experiments.
1226 
1227     For a single filter stage, considering all of the coefficient sets for the
1228     warp filter and the regular convolution filter, an input in the range
1229     [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
1230     before rounding.
1231 
1232     Allowing for some changes to the filter coefficient sets, call the range
1233     [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
1234     we can replace this by the range [0, 256 * 2^k], which can be stored in an
1235     unsigned value with 8 + k bits.
1236 
1237     This allows the derivation of the appropriate bit widths and offsets for
1238     the various intermediate values: If
1239 
1240     F := WARPEDPIXEL_FILTER_BITS = 7 (or else the above ranges need adjusting)
1241          So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
1242          intermediate value.
1243     H := HORSHEAR_REDUCE_PREC_BITS
1244     V := VERSHEAR_REDUCE_PREC_BITS
1245     (and note that we must have H + V = 2*F for the output to have the same
1246      scale as the input)
1247 
1248     then we end up with the following offsets and ranges:
1249     Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
1250                        uint{bd + F + 1}
1251     After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
1252     Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
1253                      uint{bd + 2*F + 2 - H}
1254     After rounding: The final value, before undoing the offset, fits into a
1255                     uint{bd + 2}.
1256 
1257     Then we need to undo the offsets before clamping to a pixel. Note that,
1258     if we do this at the end, the amount to subtract is actually independent
1259     of H and V:
1260 
1261     offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
1262                          (1 << ((bd + 2*F - H) - V))
1263                       == (1 << (bd - 1)) + (1 << bd)
1264 
1265     This allows us to entirely avoid clamping in both the warp filter and
1266     the convolve-round experiment. As of the time of writing, the Wiener filter
1267     from loop-restoration can encode a central coefficient up to 216, which
1268     leads to a maximum value of about 282 * 2^k after applying the offset.
1269     So in that case we still need to clamp.
1270 */
// C reference implementation of the shear-based affine warp filter.  See the
// two comment blocks above for the factorization of the warp matrix into
// horizontal/vertical shears and for the bit-width/offset analysis that the
// offset constants below follow.
//
// mat            : warp parameters; mat[0], mat[1] are the translation and
//                  mat[2..5] the 2x2 affine part, in WARPEDMODEL precision.
// ref, width, height, stride : reference plane to sample from.
// pred           : output plane; the p_width x p_height block whose top-left
//                  corner is (p_col, p_row) is written with row stride
//                  p_stride.
// subsampling_x/y: nonzero for subsampled chroma planes; block centers are
//                  projected in luma coordinates and converted back.
// conv_params    : controls the rounding/averaging behavior of the output.
// alpha..delta   : shear parameters derived from 'mat'; the caller
//                  guarantees they are in range (see the note above).
void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
                       int height, int stride, uint8_t *pred, int p_col,
                       int p_row, int p_width, int p_height, int p_stride,
                       int subsampling_x, int subsampling_y,
                       ConvolveParams *conv_params, int16_t alpha, int16_t beta,
                       int16_t gamma, int16_t delta) {
  int32_t tmp[15 * 8];  // Intermediate: 15 rows x 8 cols of horizontal sums.
  int i, j, k, l, m;
  const int bd = 8;  // Bit depth of this 8-bit path.
#if CONFIG_CONVOLVE_ROUND
  // When the caller requests the no-round convolve path, use its
  // round_0/round_1 amounts in place of the fixed HORSHEAR/VERSHEAR
  // precision reductions.
  const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
  const int reduce_bits_horiz =
      use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
  const int max_bits_horiz =
      use_conv_params
          ? bd + FILTER_BITS + 1 - conv_params->round_0
          : bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
  const int offset_bits_horiz =
      use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
  const int offset_bits_vert =
      use_conv_params
          ? bd + 2 * FILTER_BITS - conv_params->round_0
          : bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
  if (use_conv_params) {
    conv_params->do_post_rounding = 1;
  }
  assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
#else
  const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
  const int max_bits_horiz =
      bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
  const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
  const int offset_bits_vert =
      bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
#endif
  (void)max_bits_horiz;  // Only referenced inside an assert below.

  for (i = p_row; i < p_row + p_height; i += 8) {
    for (j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      // Split the projected center into integer and fractional parts,
      // keeping full WARPEDMODEL precision in the fractional part.
      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Move the fractional offsets from the block center (4, 4) back to
      // the block's top-left sample...
      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      // ...and quantize them to WARP_PARAM_REDUCE_BITS granularity.
      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (k = -7; k < 8; ++k) {
        // Clamp to top/bottom edge of the frame
        int iy = iy4 + k;
        if (iy < 0)
          iy = 0;
        else if (iy > height - 1)
          iy = height - 1;

        int sx = sx4 + beta * (k + 4);

        for (l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          // At this point, sx = sx4 + alpha * l + beta * k
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = warped_filter[offs];

          // Offset the accumulator so the sum stays non-negative (see the
          // bit-width analysis in the comment block above the function).
          int32_t sum = 1 << offset_bits_horiz;
          for (m = 0; m < 8; ++m) {
            // Clamp to left/right edge of the frame
            int sample_x = ix + m;
            if (sample_x < 0)
              sample_x = 0;
            else if (sample_x > width - 1)
              sample_x = width - 1;

            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter
      for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          // At this point, sy = sy4 + gamma * l + delta * k
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }
#if CONFIG_CONVOLVE_ROUND
          if (use_conv_params) {
            // No-round path: store the offset-corrected sum into the
            // high-precision convolve buffer (optionally accumulating for
            // compound averaging).
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
                  (1 << (offset_bits_horiz + FILTER_BITS -
                         conv_params->round_0 - conv_params->round_1)) -
                  (1 << (offset_bits_vert - conv_params->round_1));
            if (conv_params->do_average)
              *p += sum;
            else
              *p = sum;
          } else {
#else
          {
#endif
            uint8_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            // Undo both stage offsets ((1 << (bd - 1)) + (1 << bd), per the
            // derivation above) before clamping to a pixel value.
            uint8_t px = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
            if (conv_params->do_average)
              *p = ROUND_POWER_OF_TWO(*p + px, 1);
            else
              *p = px;
          }
          sy += gamma;
        }
      }
    }
  }
}
1417 
1418 static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref,
1419                        int width, int height, int stride, uint8_t *pred,
1420                        int p_col, int p_row, int p_width, int p_height,
1421                        int p_stride, int subsampling_x, int subsampling_y,
1422                        int x_scale, int y_scale, ConvolveParams *conv_params) {
1423   if (wm->wmtype == ROTZOOM) {
1424     wm->wmmat[5] = wm->wmmat[2];
1425     wm->wmmat[4] = -wm->wmmat[3];
1426   }
1427   if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) &&
1428       x_scale == SCALE_SUBPEL_SHIFTS && y_scale == SCALE_SUBPEL_SHIFTS) {
1429     const int32_t *const mat = wm->wmmat;
1430     const int16_t alpha = wm->alpha;
1431     const int16_t beta = wm->beta;
1432     const int16_t gamma = wm->gamma;
1433     const int16_t delta = wm->delta;
1434 
1435     av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
1436                     p_width, p_height, p_stride, subsampling_x, subsampling_y,
1437                     conv_params, alpha, beta, gamma, delta);
1438   } else {
1439     warp_plane_old(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
1440                    p_height, p_stride, subsampling_x, subsampling_y, x_scale,
1441                    y_scale, conv_params);
1442   }
1443 }
1444 
// Sums the perceptually-weighted per-pixel error between a warped block
// ('ref', row stride 'stride') and the source block ('dst', row stride
// 'p_stride') over a p_width x p_height region.  Each pixel difference is
// mapped through error_measure() before accumulation.
static int64_t frame_error(const uint8_t *const ref, int stride,
                           const uint8_t *const dst, int p_width, int p_height,
                           int p_stride) {
  int64_t total = 0;
  const uint8_t *ref_row = ref;
  const uint8_t *dst_row = dst;
  for (int row = 0; row < p_height; ++row) {
    for (int col = 0; col < p_width; ++col) {
      total += (int64_t)error_measure(dst_row[col] - ref_row[col]);
    }
    ref_row += stride;
    dst_row += p_stride;
  }
  return total;
}
1457 
1458 static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
1459                           int width, int height, int stride,
1460                           const uint8_t *const dst, int p_col, int p_row,
1461                           int p_width, int p_height, int p_stride,
1462                           int subsampling_x, int subsampling_y, int x_scale,
1463                           int y_scale, int64_t best_error) {
1464   int64_t gm_sumerr = 0;
1465   int warp_w, warp_h;
1466   int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
1467   int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
1468   uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
1469   ConvolveParams conv_params = get_conv_params(0, 0, 0);
1470 
1471   for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
1472     for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
1473       // avoid warping extra 8x8 blocks in the padded region of the frame
1474       // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
1475       warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
1476       warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
1477       warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h,
1478                  WARP_ERROR_BLOCK, subsampling_x, subsampling_y, x_scale,
1479                  y_scale, &conv_params);
1480 
1481       gm_sumerr += frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride,
1482                                warp_w, warp_h, p_stride);
1483       if (gm_sumerr > best_error) return gm_sumerr;
1484     }
1485   }
1486   return gm_sumerr;
1487 }
1488 
// Public entry point for frame error computation.  When built with
// high-bit-depth support and use_hbd is set, ref/dst are treated as
// CONVERT_TO_SHORTPTR-style 16-bit buffers and dispatched to the
// high-bit-depth implementation; otherwise the 8-bit path is used.
int64_t av1_frame_error(
#if CONFIG_HIGHBITDEPTH
    int use_hbd, int bd,
#endif  // CONFIG_HIGHBITDEPTH
    const uint8_t *ref, int stride, uint8_t *dst, int p_width, int p_height,
    int p_stride) {
#if CONFIG_HIGHBITDEPTH
  if (use_hbd) {
    return highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
                              CONVERT_TO_SHORTPTR(dst), p_width, p_height,
                              p_stride, bd);
  }
#endif  // CONFIG_HIGHBITDEPTH
  return frame_error(ref, stride, dst, p_width, p_height, p_stride);
}
1504 
// Computes the accumulated error of warping 'ref' with model 'wm' against
// 'dst', with early termination once 'best_error' is exceeded.  For
// shearable model types (wmtype <= AFFINE) the shear parameters are
// (re)derived first.
// NOTE(review): when get_shear_params() fails this returns 1, which reads
// as a near-zero error rather than a large one -- confirm that callers
// validate shear-ability separately before trusting the return value.
int64_t av1_warp_error(WarpedMotionParams *wm,
#if CONFIG_HIGHBITDEPTH
                       int use_hbd, int bd,
#endif  // CONFIG_HIGHBITDEPTH
                       const uint8_t *ref, int width, int height, int stride,
                       uint8_t *dst, int p_col, int p_row, int p_width,
                       int p_height, int p_stride, int subsampling_x,
                       int subsampling_y, int x_scale, int y_scale,
                       int64_t best_error) {
  // Derive alpha/beta/gamma/delta for the model; bail out if it cannot be
  // decomposed into in-range shears.
  if (wm->wmtype <= AFFINE)
    if (!get_shear_params(wm)) return 1;
#if CONFIG_HIGHBITDEPTH
  if (use_hbd)
    return highbd_warp_error(wm, ref, width, height, stride, dst, p_col, p_row,
                             p_width, p_height, p_stride, subsampling_x,
                             subsampling_y, x_scale, y_scale, bd, best_error);
#endif  // CONFIG_HIGHBITDEPTH
  return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width,
                    p_height, p_stride, subsampling_x, subsampling_y, x_scale,
                    y_scale, best_error);
}
1526 
// Public entry point for warping a plane with motion model 'wm'.  When
// built with high-bit-depth support and use_hbd is set, dispatches to the
// 16-bit implementation at bit depth 'bd'; otherwise warps the 8-bit plane.
void av1_warp_plane(WarpedMotionParams *wm,
#if CONFIG_HIGHBITDEPTH
                    int use_hbd, int bd,
#endif  // CONFIG_HIGHBITDEPTH
                    const uint8_t *ref, int width, int height, int stride,
                    uint8_t *pred, int p_col, int p_row, int p_width,
                    int p_height, int p_stride, int subsampling_x,
                    int subsampling_y, int x_scale, int y_scale,
                    ConvolveParams *conv_params) {
#if CONFIG_HIGHBITDEPTH
  if (use_hbd)
    highbd_warp_plane(wm, ref, width, height, stride, pred, p_col, p_row,
                      p_width, p_height, p_stride, subsampling_x, subsampling_y,
                      x_scale, y_scale, bd, conv_params);
  else
#endif  // CONFIG_HIGHBITDEPTH
    warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
               p_height, p_stride, subsampling_x, subsampling_y, x_scale,
               y_scale, conv_params);
}
1547 
#if CONFIG_WARPED_MOTION
// Number of unknowns solved per output axis ([h1 h2] for x' and [h3 h4]
// for y' in the least-squares setup inside find_affine_int below).
#define LEAST_SQUARES_ORDER 2

#define LS_MV_MAX 256  // max mv in 1/8-pel
// Spacing between the paired sample positions folded into each LS_*
// accumulation macro below.
#define LS_STEP 2

// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
// the precision needed is:
//   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
//   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
//   1 [for sign] +
//   LEAST_SQUARES_SAMPLES_MAX_BITS
//        [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
// The value is 23
#define LS_MAT_RANGE_BITS \
  ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)

// Bit-depth reduction from the full-range
#define LS_MAT_DOWN_BITS 2

// bits range of A, Bx and By after downshifting
#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)

// Accumulation helpers for the normal equations.  The LS_STEP terms shift
// the effective sample position by LS_STEP / 2; with LS_STEP == 2 the
// expressions reduce to products taken at the offset positions, e.g.
//   LS_PRODUCT1(a, b) == (a + 1) * (b + 1)
//   LS_PRODUCT2(a, b) == (a + 1) * (b + 1) + 1
//   LS_SQUARE(a)      == (a + 1) * (a + 1) + 1
#define LS_SUM(a) ((a)*4 + LS_STEP * 2)
#define LS_SQUARE(a) \
  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2)
#define LS_PRODUCT1(a, b) \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> 2)
#define LS_PRODUCT2(a, b) \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2)

// Select the reduced-precision multiply path below (0 = full 64-bit path).
#define USE_LIMITED_PREC_MULT 0
1582 
1583 #if USE_LIMITED_PREC_MULT
1584 
#define MUL_PREC_BITS 16
// Reduces the 64-bit magnitude D to an unsigned multiplier with at most
// MUL_PREC_BITS significant bits.  *shift receives the number of bits D was
// rounded down by (0 when D already fits, or when D == 0).
static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
  int msb = 0;
  uint16_t mult = 0;
  *shift = 0;
  if (D != 0) {
    // Locate the most significant set bit of D; get_msb operates on 32-bit
    // values, so the high word is handled separately.
    msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
                              : get_msb((unsigned int)D));
    if (msb >= MUL_PREC_BITS) {
      // Keep the top MUL_PREC_BITS bits of D, rounded.
      mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
      *shift = msb + 1 - MUL_PREC_BITS;
    } else {
      mult = (uint16_t)D;
      *shift = 0;
    }
  }
  return mult;
}
1603 
1604 static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
1605   int32_t ret;
1606   int16_t mshift;
1607   uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
1608   int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
1609   shift -= mshift;
1610   if (shift > 0) {
1611     return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
1612                           -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
1613                           WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
1614   } else {
1615     return (int32_t)clamp(v * (1 << (-shift)),
1616                           -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
1617                           WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
1618   }
1619   return ret;
1620 }
1621 
// Computes (Px / Det) for a diagonal affine parameter under the
// limited-precision multiply scheme (see resolve_multiplier_64), clamping
// the result to a window around the identity value 1 << WARPEDMODEL_PREC_BITS.
static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
  int16_t mshift;
  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
  shift -= mshift;
  if (shift > 0) {
    return (int32_t)clamp(
        ROUND_POWER_OF_TWO_SIGNED(v, shift),
        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  } else {
    // Non-positive net shift: scale up instead of rounding down.
    return (int32_t)clamp(
        v * (1 << (-shift)),
        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  }
}
1639 
1640 #else
1641 
1642 static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
1643   int64_t v = Px * (int64_t)iDet;
1644   return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
1645                           -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
1646                           WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
1647 }
1648 
1649 static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
1650   int64_t v = Px * (int64_t)iDet;
1651   return (int32_t)clamp64(
1652       ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
1653       (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
1654       (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
1655 }
1656 #endif  // USE_LIMITED_PREC_MULT
1657 
// Fits an integerized affine model to the point correspondences
// (pts1[i] -> pts2[i]) via least squares, anchored so that the block center
// moves exactly by the transmitted motion vector (mvx, mvy).  Point
// coordinates appear to be in 1/8-pel units (the block center is scaled by
// 8 below -- confirm at call sites).  Fills wm->wmmat[0..7] and returns 0
// on success, or 1 if the normal matrix is singular.
static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize,
                           int mvy, int mvx, WarpedMotionParams *wm, int mi_row,
                           int mi_col) {
  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };  // Normal matrix P'P (symmetric;
                                             // A[1][0] is never stored).
  int32_t Bx[2] = { 0, 0 };  // P'q: right-hand side for the x mapping.
  int32_t By[2] = { 0, 0 };  // P'r: right-hand side for the y mapping.
  int i, n = 0;

  // Block center in full-pel units (isux, isuy) and 1/8-pel units
  // (sux, suy); (dux, duy) is the center displaced by the transmitted MV.
  const int bw = block_size_wide[bsize];
  const int bh = block_size_high[bsize];
  const int isuy = (mi_row * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1);
  const int isux = (mi_col * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1);
  const int suy = isuy * 8;
  const int sux = isux * 8;
  const int duy = suy + mvy;
  const int dux = sux + mvx;

  // Assume the center pixel of the block has exactly the same motion vector
  // as transmitted for the block. First shift the origin of the source
  // points to the block center, and the origin of the destination points to
  // the block center added to the motion vector transmitted.
  // Let (xi, yi) denote the source points and (xi', yi') denote destination
  // points after origin shifting, for i = 0, 1, 2, .... n-1.
  // Then if  P = [x0, y0,
  //               x1, y1
  //               x2, y2,
  //                ....
  //              ]
  //          q = [x0', x1', x2', ... ]'
  //          r = [y0', y1', y2', ... ]'
  // the least squares problems that need to be solved are:
  //          [h1, h2]' = inv(P'P)P'q and
  //          [h3, h4]' = inv(P'P)P'r
  // where the affine transformation is given by:
  //          x' = h1.x + h2.y
  //          y' = h3.x + h4.y
  //
  // The loop below computes: A = P'P, Bx = P'q, By = P'r
  // We need to just compute inv(A).Bx and inv(A).By for the solutions.
  int sx, sy, dx, dy;
  // Contribution from neighbor block
  for (i = 0; i < np && n < LEAST_SQUARES_SAMPLES_MAX; i++) {
    dx = pts2[i * 2] - dux;
    dy = pts2[i * 2 + 1] - duy;
    sx = pts1[i * 2] - sux;
    sy = pts1[i * 2 + 1] - suy;
    // Only keep samples whose implied motion stays below LS_MV_MAX on both
    // axes; outliers are discarded.
    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
      A[0][0] += LS_SQUARE(sx);
      A[0][1] += LS_PRODUCT1(sx, sy);
      A[1][1] += LS_SQUARE(sy);
      Bx[0] += LS_PRODUCT2(sx, dx);
      Bx[1] += LS_PRODUCT1(sy, dx);
      By[0] += LS_PRODUCT1(sx, dy);
      By[1] += LS_PRODUCT2(sy, dy);
      n++;
    }
  }
  // Downshift less aggressively when few samples were accumulated, to
  // retain precision in the (smaller) sums.
  int downshift;
  if (n >= 4)
    downshift = LS_MAT_DOWN_BITS;
  else if (n >= 2)
    downshift = LS_MAT_DOWN_BITS - 1;
  else
    downshift = LS_MAT_DOWN_BITS - 2;

  // Reduce precision by downshift bits
  A[0][0] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[0][0], downshift), LS_MAT_MIN,
                  LS_MAT_MAX);
  A[0][1] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[0][1], downshift), LS_MAT_MIN,
                  LS_MAT_MAX);
  A[1][1] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[1][1], downshift), LS_MAT_MIN,
                  LS_MAT_MAX);
  Bx[0] = clamp(ROUND_POWER_OF_TWO_SIGNED(Bx[0], downshift), LS_MAT_MIN,
                LS_MAT_MAX);
  Bx[1] = clamp(ROUND_POWER_OF_TWO_SIGNED(Bx[1], downshift), LS_MAT_MIN,
                LS_MAT_MAX);
  By[0] = clamp(ROUND_POWER_OF_TWO_SIGNED(By[0], downshift), LS_MAT_MIN,
                LS_MAT_MAX);
  By[1] = clamp(ROUND_POWER_OF_TWO_SIGNED(By[1], downshift), LS_MAT_MIN,
                LS_MAT_MAX);

  int64_t Px[2], Py[2], Det;
  int16_t iDet, shift;

  // These divided by the Det, are the least squares solutions
  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];

  // Compute Determinant of A
  Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
  if (Det == 0) return 1;
  // Replace the division by Det with a multiply by its fixed-point inverse.
  iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
  shift -= WARPEDMODEL_PREC_BITS;
  if (shift < 0) {
    iDet <<= (-shift);
    shift = 0;
  }

  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);

  // Derive the translation so that the block center maps exactly to
  // center + (mvx, mvy), given the 2x2 part just computed.
  // Note: In the vx, vy expressions below, the max value of each of the
  // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room
  // for the first term so that the overall sum in the worst case fits
  // within 32 bits overall.
  int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
               (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
                isuy * wm->wmmat[3]);
  int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
               (isux * wm->wmmat[4] +
                isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
  wm->wmmat[0] =
      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  wm->wmmat[1] =
      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);

  // An affine model has no third-row (perspective) parameters.
  wm->wmmat[6] = wm->wmmat[7] = 0;
  return 0;
}
1781 
1782 int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
1783                     int mvx, WarpedMotionParams *wm_params, int mi_row,
1784                     int mi_col) {
1785   assert(wm_params->wmtype == AFFINE);
1786   const int result = find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params,
1787                                      mi_row, mi_col);
1788   if (result == 0) {
1789     // check compatibility with the fast warp filter
1790     if (!get_shear_params(wm_params)) return 1;
1791   }
1792 
1793   return result;
1794 }
1795 #endif  // CONFIG_WARPED_MOTION
1796