1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <memory.h>
15 #include <math.h>
16 #include <assert.h>
17
18 #include "./av1_rtcd.h"
19 #include "av1/common/warped_motion.h"
20 #include "av1/common/scale.h"
21
22 #define WARP_ERROR_BLOCK 32
23
/* clang-format off */
// Perceptual error-weight lookup table, 512 entries, labeled "pow 0.7":
// values follow approximately |e|^0.7 scaled so the largest entry is 16384.
// The entry at index 255 is 0 and the table is symmetric about it, so it is
// presumably indexed by (pixel difference + 255) — verify against callers.
static const int error_measure_lut[512] = {
  // pow 0.7
  16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
  16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
  15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
  15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
  14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
  14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
  14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
  13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
  13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
  12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
  12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
  12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
  11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
  11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
  10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
  10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
  10058, 10002, 9947, 9891, 9835, 9779, 9723, 9666,
  9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
  9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
  8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
  8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
  7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
  7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
  6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
  6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
  5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
  5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
  4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
  3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
  3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
  2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
  1323, 1187, 1045, 894, 731, 550, 339, 0,
  339, 550, 731, 894, 1045, 1187, 1323, 1452,
  1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
  2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
  3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
  3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
  4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
  5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
  5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
  6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
  6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
  7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
  7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
  8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
  8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
  9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
  9723, 9779, 9835, 9891, 9947, 10002, 10058, 10113,
  10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
  10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
  11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
  11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
  11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
  12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
  12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
  13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
  13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
  13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
  14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
  14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
  15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
  15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
  15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
  16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
};
/* clang-format on */
93
get_project_points_type(TransformationType type)94 static ProjectPointsFunc get_project_points_type(TransformationType type) {
95 switch (type) {
96 case VERTRAPEZOID: return project_points_vertrapezoid;
97 case HORTRAPEZOID: return project_points_hortrapezoid;
98 case HOMOGRAPHY: return project_points_homography;
99 case AFFINE: return project_points_affine;
100 case ROTZOOM: return project_points_rotzoom;
101 case TRANSLATION: return project_points_translation;
102 default: assert(0); return NULL;
103 }
104 }
105
project_points_translation(const int32_t * mat,int * points,int * proj,const int n,const int stride_points,const int stride_proj,const int subsampling_x,const int subsampling_y)106 void project_points_translation(const int32_t *mat, int *points, int *proj,
107 const int n, const int stride_points,
108 const int stride_proj, const int subsampling_x,
109 const int subsampling_y) {
110 int i;
111 for (i = 0; i < n; ++i) {
112 const int x = *(points++), y = *(points++);
113 if (subsampling_x)
114 *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
115 ((x * (1 << (WARPEDMODEL_PREC_BITS + 1))) + mat[0]),
116 WARPEDDIFF_PREC_BITS + 1);
117 else
118 *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
119 ((x * (1 << WARPEDMODEL_PREC_BITS)) + mat[0]), WARPEDDIFF_PREC_BITS);
120 if (subsampling_y)
121 *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
122 ((y * (1 << (WARPEDMODEL_PREC_BITS + 1))) + mat[1]),
123 WARPEDDIFF_PREC_BITS + 1);
124 else
125 *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
126 ((y * (1 << WARPEDMODEL_PREC_BITS))) + mat[1], WARPEDDIFF_PREC_BITS);
127 points += stride_points - 2;
128 proj += stride_proj - 2;
129 }
130 }
131
project_points_rotzoom(const int32_t * mat,int * points,int * proj,const int n,const int stride_points,const int stride_proj,const int subsampling_x,const int subsampling_y)132 void project_points_rotzoom(const int32_t *mat, int *points, int *proj,
133 const int n, const int stride_points,
134 const int stride_proj, const int subsampling_x,
135 const int subsampling_y) {
136 int i;
137 for (i = 0; i < n; ++i) {
138 const int x = *(points++), y = *(points++);
139 if (subsampling_x)
140 *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
141 mat[2] * 2 * x + mat[3] * 2 * y + mat[0] +
142 (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
143 WARPEDDIFF_PREC_BITS + 1);
144 else
145 *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[0],
146 WARPEDDIFF_PREC_BITS);
147 if (subsampling_y)
148 *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
149 -mat[3] * 2 * x + mat[2] * 2 * y + mat[1] +
150 (-mat[3] + mat[2] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
151 WARPEDDIFF_PREC_BITS + 1);
152 else
153 *(proj++) = ROUND_POWER_OF_TWO_SIGNED(-mat[3] * x + mat[2] * y + mat[1],
154 WARPEDDIFF_PREC_BITS);
155 points += stride_points - 2;
156 proj += stride_proj - 2;
157 }
158 }
159
project_points_affine(const int32_t * mat,int * points,int * proj,const int n,const int stride_points,const int stride_proj,const int subsampling_x,const int subsampling_y)160 void project_points_affine(const int32_t *mat, int *points, int *proj,
161 const int n, const int stride_points,
162 const int stride_proj, const int subsampling_x,
163 const int subsampling_y) {
164 int i;
165 for (i = 0; i < n; ++i) {
166 const int x = *(points++), y = *(points++);
167 if (subsampling_x)
168 *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
169 mat[2] * 2 * x + mat[3] * 2 * y + mat[0] +
170 (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
171 WARPEDDIFF_PREC_BITS + 1);
172 else
173 *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[0],
174 WARPEDDIFF_PREC_BITS);
175 if (subsampling_y)
176 *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
177 mat[4] * 2 * x + mat[5] * 2 * y + mat[1] +
178 (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
179 WARPEDDIFF_PREC_BITS + 1);
180 else
181 *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[4] * x + mat[5] * y + mat[1],
182 WARPEDDIFF_PREC_BITS);
183 points += stride_points - 2;
184 proj += stride_proj - 2;
185 }
186 }
187
// Projects n (x, y) point pairs through a horizontal-trapezoid model: an
// affine-like numerator divided by a denominator that depends on y only
// (mat[7]); note the y' row has no mat[4] * x term. Inputs are read with
// stride_points, fixed-point outputs written with stride_proj;
// subsampling_x/y select the chroma-plane coordinate mapping.
void project_points_hortrapezoid(const int32_t *mat, int *points, int *proj,
                                 const int n, const int stride_points,
                                 const int stride_proj, const int subsampling_x,
                                 const int subsampling_y) {
  int i;
  int64_t x, y, Z;
  int64_t xp, yp;
  for (i = 0; i < n; ++i) {
    x = *(points++), y = *(points++);
    // Map to a doubled-resolution grid; the +1 on a subsampled axis
    // presumably centers the chroma sample — confirm against the AV1 spec.
    x = (subsampling_x ? 4 * x + 1 : 2 * x);
    y = (subsampling_y ? 4 * y + 1 : 2 * y);

    // Homogeneous denominator (the +1 in the shift matches the doubled grid).
    Z = (mat[7] * y + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
    // Numerators, pre-scaled so the division below lands directly in
    // WARPEDPIXEL_PREC_BITS fixed point.
    xp = (mat[2] * x + mat[3] * y + 2 * mat[0]) *
         (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
                WARPEDMODEL_PREC_BITS));
    yp = (mat[5] * y + 2 * mat[1]) *
         (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
                WARPEDMODEL_PREC_BITS));

    // Divide with rounding to nearest (bias by half the divisor toward the
    // sign of the numerator before truncating division).
    xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
    yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;

    // Map back down to the subsampled grid where applicable.
    if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
    if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
    *(proj++) = (int)xp;
    *(proj++) = (int)yp;

    points += stride_points - 2;
    proj += stride_proj - 2;
  }
}
220
project_points_vertrapezoid(const int32_t * mat,int * points,int * proj,const int n,const int stride_points,const int stride_proj,const int subsampling_x,const int subsampling_y)221 void project_points_vertrapezoid(const int32_t *mat, int *points, int *proj,
222 const int n, const int stride_points,
223 const int stride_proj, const int subsampling_x,
224 const int subsampling_y) {
225 int i;
226 int64_t x, y, Z;
227 int64_t xp, yp;
228 for (i = 0; i < n; ++i) {
229 x = *(points++), y = *(points++);
230 x = (subsampling_x ? 4 * x + 1 : 2 * x);
231 y = (subsampling_y ? 4 * y + 1 : 2 * y);
232
233 Z = (mat[6] * x + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
234 xp = (mat[2] * x + 2 * mat[0]) *
235 (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
236 WARPEDMODEL_PREC_BITS));
237 yp = (mat[4] * x + mat[5] * y + 2 * mat[1]) *
238 (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
239 WARPEDMODEL_PREC_BITS));
240
241 xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
242 yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;
243
244 if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
245 if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
246 *(proj++) = (int)xp;
247 *(proj++) = (int)yp;
248
249 points += stride_points - 2;
250 proj += stride_proj - 2;
251 }
252 }
253
// Projects n (x, y) point pairs through a full 8-parameter homography:
// numerators from mat[0..5] divided by a denominator from mat[6..7].
// Inputs are read with stride_points, fixed-point outputs written with
// stride_proj; subsampling_x/y select the chroma-plane coordinate mapping.
void project_points_homography(const int32_t *mat, int *points, int *proj,
                               const int n, const int stride_points,
                               const int stride_proj, const int subsampling_x,
                               const int subsampling_y) {
  int i;
  int64_t x, y, Z;
  int64_t xp, yp;
  for (i = 0; i < n; ++i) {
    x = *(points++), y = *(points++);
    // Map to a doubled-resolution grid; the +1 on a subsampled axis
    // presumably centers the chroma sample — confirm against the AV1 spec.
    x = (subsampling_x ? 4 * x + 1 : 2 * x);
    y = (subsampling_y ? 4 * y + 1 : 2 * y);

    // Homogeneous denominator (the +1 in the shift matches the doubled grid).
    Z = (mat[6] * x + mat[7] * y + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
    // Numerators, pre-scaled so the division below lands directly in
    // WARPEDPIXEL_PREC_BITS fixed point.
    xp = (mat[2] * x + mat[3] * y + 2 * mat[0]) *
         (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
                WARPEDMODEL_PREC_BITS));
    yp = (mat[4] * x + mat[5] * y + 2 * mat[1]) *
         (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
                WARPEDMODEL_PREC_BITS));

    // Divide with rounding to nearest.
    xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
    yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;

    // Map back down to the subsampled grid where applicable.
    if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
    if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
    *(proj++) = (int)xp;
    *(proj++) = (int)yp;

    points += stride_points - 2;
    proj += stride_proj - 2;
  }
}
286
// 6-tap sub-pel interpolation filters for warped prediction, one row per
// fractional position (WARPEDPIXEL_PREC_SHIFTS entries). Each row's taps
// sum to 128 (unity DC gain at a scale of 128), and the table is
// symmetric: row k reversed equals row (SHIFTS - k).
static const int16_t
    filter_ntap[WARPEDPIXEL_PREC_SHIFTS][WARPEDPIXEL_FILTER_TAPS] = {
#if WARPEDPIXEL_PREC_BITS == 6
      { 0,   0, 128,   0,   0, 0 }, { 0,  -1, 128,   2,  -1, 0 },
      { 1,  -3, 127,   4,  -1, 0 }, { 1,  -4, 126,   6,  -2, 1 },
      { 1,  -5, 126,   8,  -3, 1 }, { 1,  -6, 125,  11,  -4, 1 },
      { 1,  -7, 124,  13,  -4, 1 }, { 2,  -8, 123,  15,  -5, 1 },
      { 2,  -9, 122,  18,  -6, 1 }, { 2, -10, 121,  20,  -6, 1 },
      { 2, -11, 120,  22,  -7, 2 }, { 2, -12, 119,  25,  -8, 2 },
      { 3, -13, 117,  27,  -8, 2 }, { 3, -13, 116,  29,  -9, 2 },
      { 3, -14, 114,  32, -10, 3 }, { 3, -15, 113,  35, -10, 2 },
      { 3, -15, 111,  37, -11, 3 }, { 3, -16, 109,  40, -11, 3 },
      { 3, -16, 108,  42, -12, 3 }, { 4, -17, 106,  45, -13, 3 },
      { 4, -17, 104,  47, -13, 3 }, { 4, -17, 102,  50, -14, 3 },
      { 4, -17, 100,  52, -14, 3 }, { 4, -18,  98,  55, -15, 4 },
      { 4, -18,  96,  58, -15, 3 }, { 4, -18,  94,  60, -16, 4 },
      { 4, -18,  91,  63, -16, 4 }, { 4, -18,  89,  65, -16, 4 },
      { 4, -18,  87,  68, -17, 4 }, { 4, -18,  85,  70, -17, 4 },
      { 4, -18,  82,  73, -17, 4 }, { 4, -18,  80,  75, -17, 4 },
      { 4, -18,  78,  78, -18, 4 }, { 4, -17,  75,  80, -18, 4 },
      { 4, -17,  73,  82, -18, 4 }, { 4, -17,  70,  85, -18, 4 },
      { 4, -17,  68,  87, -18, 4 }, { 4, -16,  65,  89, -18, 4 },
      { 4, -16,  63,  91, -18, 4 }, { 4, -16,  60,  94, -18, 4 },
      { 3, -15,  58,  96, -18, 4 }, { 4, -15,  55,  98, -18, 4 },
      { 3, -14,  52, 100, -17, 4 }, { 3, -14,  50, 102, -17, 4 },
      { 3, -13,  47, 104, -17, 4 }, { 3, -13,  45, 106, -17, 4 },
      { 3, -12,  42, 108, -16, 3 }, { 3, -11,  40, 109, -16, 3 },
      { 3, -11,  37, 111, -15, 3 }, { 2, -10,  35, 113, -15, 3 },
      { 3, -10,  32, 114, -14, 3 }, { 2,  -9,  29, 116, -13, 3 },
      { 2,  -8,  27, 117, -13, 3 }, { 2,  -8,  25, 119, -12, 2 },
      { 2,  -7,  22, 120, -11, 2 }, { 1,  -6,  20, 121, -10, 2 },
      { 1,  -6,  18, 122,  -9, 2 }, { 1,  -5,  15, 123,  -8, 2 },
      { 1,  -4,  13, 124,  -7, 1 }, { 1,  -4,  11, 125,  -6, 1 },
      { 1,  -3,   8, 126,  -5, 1 }, { 1,  -2,   6, 126,  -4, 1 },
      { 0,  -1,   4, 127,  -3, 1 }, { 0,  -1,   2, 128,  -1, 0 },
#elif WARPEDPIXEL_PREC_BITS == 5
      { 0,   0, 128,   0,   0, 0 }, { 1,  -3, 127,   4,  -1, 0 },
      { 1,  -5, 126,   8,  -3, 1 }, { 1,  -7, 124,  13,  -4, 1 },
      { 2,  -9, 122,  18,  -6, 1 }, { 2, -11, 120,  22,  -7, 2 },
      { 3, -13, 117,  27,  -8, 2 }, { 3, -14, 114,  32, -10, 3 },
      { 3, -15, 111,  37, -11, 3 }, { 3, -16, 108,  42, -12, 3 },
      { 4, -17, 104,  47, -13, 3 }, { 4, -17, 100,  52, -14, 3 },
      { 4, -18,  96,  58, -15, 3 }, { 4, -18,  91,  63, -16, 4 },
      { 4, -18,  87,  68, -17, 4 }, { 4, -18,  82,  73, -17, 4 },
      { 4, -18,  78,  78, -18, 4 }, { 4, -17,  73,  82, -18, 4 },
      { 4, -17,  68,  87, -18, 4 }, { 4, -16,  63,  91, -18, 4 },
      { 3, -15,  58,  96, -18, 4 }, { 3, -14,  52, 100, -17, 4 },
      { 3, -13,  47, 104, -17, 4 }, { 3, -12,  42, 108, -16, 3 },
      { 3, -11,  37, 111, -15, 3 }, { 3, -10,  32, 114, -14, 3 },
      { 2,  -8,  27, 117, -13, 3 }, { 2,  -7,  22, 120, -11, 2 },
      { 1,  -6,  18, 122,  -9, 2 }, { 1,  -4,  13, 124,  -7, 1 },
      { 1,  -3,   8, 126,  -5, 1 }, { 0,  -1,   4, 127,  -3, 1 },
#endif  // WARPEDPIXEL_PREC_BITS == 6
    };
341
do_ntap_filter(const int32_t * const p,int x)342 static int32_t do_ntap_filter(const int32_t *const p, int x) {
343 int i;
344 int32_t sum = 0;
345 for (i = 0; i < WARPEDPIXEL_FILTER_TAPS; ++i) {
346 sum += p[i - WARPEDPIXEL_FILTER_TAPS / 2 + 1] * filter_ntap[x][i];
347 }
348 return sum;
349 }
350
// Cubic (Catmull-Rom-style) interpolation between p[0] and p[1], with
// p[-1] and p[2] as outer support. x is the fractional position in
// [0, 1 << WARPEDPIXEL_PREC_BITS]; the result is scaled up by
// (1 << WARPEDPIXEL_FILTER_BITS).
static int32_t do_cubic_filter(const int32_t *const p, int x) {
  if (x == 0) {
    // Exactly on p[0]: pass it through at filter scale.
    return p[0] * (1 << WARPEDPIXEL_FILTER_BITS);
  } else if (x == (1 << WARPEDPIXEL_PREC_BITS)) {
    // Exactly on p[1].
    return p[1] * (1 << WARPEDPIXEL_FILTER_BITS);
  } else {
    // Evaluate 2*cubic(x) = v1*x^3 + v2*x^2 + v3*x + v4 in 64-bit fixed
    // point: each term is aligned to 3 * WARPEDPIXEL_PREC_BITS, then one
    // rounding shift (the +1 cancels the factor of 2) brings the result
    // back to WARPEDPIXEL_FILTER_BITS scale.
    const int64_t v1 = (int64_t)x * x * x * (3 * (p[0] - p[1]) + p[2] - p[-1]);
    const int64_t v2 =
        (int64_t)x * x * (2 * p[-1] - 5 * p[0] + 4 * p[1] - p[2]);
    const int64_t v3 = x * (p[1] - p[-1]);
    const int64_t v4 = 2 * p[0];
    return (int32_t)ROUND_POWER_OF_TWO_SIGNED(
        (v4 * (1 << (3 * WARPEDPIXEL_PREC_BITS))) +
        (v3 * (1 << (2 * WARPEDPIXEL_PREC_BITS))) +
        (v2 * (1 << WARPEDPIXEL_PREC_BITS)) + v1,
        3 * WARPEDPIXEL_PREC_BITS + 1 - WARPEDPIXEL_FILTER_BITS);
  }
}
369
get_subcolumn(int taps,const uint8_t * const ref,int32_t * col,int stride,int x,int y_start)370 static INLINE void get_subcolumn(int taps, const uint8_t *const ref,
371 int32_t *col, int stride, int x, int y_start) {
372 int i;
373 for (i = 0; i < taps; ++i) {
374 col[i] = ref[(i + y_start) * stride + x];
375 }
376 }
377
// Samples ref at fractional position (x, y), both in WARPEDPIXEL_PREC_BITS
// fixed point, using the separable 6-tap filter_ntap filter: a vertical
// pass over each column of the support window, then a horizontal pass over
// the intermediate results. The combined filter gain (2 x
// WARPEDPIXEL_FILTER_BITS) is removed with one rounding shift.
static uint8_t bi_ntap_filter(const uint8_t *const ref, int x, int y,
                              int stride) {
  int32_t val, arr[WARPEDPIXEL_FILTER_TAPS];
  int k;
  const int i = (int)x >> WARPEDPIXEL_PREC_BITS;  // integer pixel column
  const int j = (int)y >> WARPEDPIXEL_PREC_BITS;  // integer pixel row
  for (k = 0; k < WARPEDPIXEL_FILTER_TAPS; ++k) {
    int32_t arr_temp[WARPEDPIXEL_FILTER_TAPS];
    // Fetch the k-th column of the filter support around (i, j).
    get_subcolumn(WARPEDPIXEL_FILTER_TAPS, ref, arr_temp, stride,
                  i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2,
                  j + 1 - WARPEDPIXEL_FILTER_TAPS / 2);
    // Vertical pass at the fractional y offset.
    arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
                            y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
  }
  // Horizontal pass at the fractional x offset.
  val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
                       x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
  val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
  return (uint8_t)clip_pixel(val);
}
397
bi_cubic_filter(const uint8_t * const ref,int x,int y,int stride)398 static uint8_t bi_cubic_filter(const uint8_t *const ref, int x, int y,
399 int stride) {
400 int32_t val, arr[4];
401 int k;
402 const int i = (int)x >> WARPEDPIXEL_PREC_BITS;
403 const int j = (int)y >> WARPEDPIXEL_PREC_BITS;
404 for (k = 0; k < 4; ++k) {
405 int32_t arr_temp[4];
406 get_subcolumn(4, ref, arr_temp, stride, i + k - 1, j - 1);
407 arr[k] =
408 do_cubic_filter(arr_temp + 1, y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
409 }
410 val = do_cubic_filter(arr + 1, x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
411 val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
412 return (uint8_t)clip_pixel(val);
413 }
414
bi_linear_filter(const uint8_t * const ref,int x,int y,int stride)415 static uint8_t bi_linear_filter(const uint8_t *const ref, int x, int y,
416 int stride) {
417 const int ix = x >> WARPEDPIXEL_PREC_BITS;
418 const int iy = y >> WARPEDPIXEL_PREC_BITS;
419 const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
420 const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
421 int32_t val;
422 val = ROUND_POWER_OF_TWO_SIGNED(
423 ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) *
424 (WARPEDPIXEL_PREC_SHIFTS - sx) +
425 ref[iy * stride + ix + 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) * sx +
426 ref[(iy + 1) * stride + ix] * sy * (WARPEDPIXEL_PREC_SHIFTS - sx) +
427 ref[(iy + 1) * stride + ix + 1] * sy * sx,
428 WARPEDPIXEL_PREC_BITS * 2);
429 return (uint8_t)clip_pixel(val);
430 }
431
// Samples the width x height image ref (row stride `stride`) at the
// fractional position (x, y), given in WARPEDPIXEL_PREC_BITS fixed point.
// Positions outside the image are edge-extended (corner/edge replication
// or 1-D interpolation along the border). Interior positions use, from
// best to simplest support: the 6-tap filter, bicubic, or bilinear,
// depending on how much filter support fits inside the image. The branch
// order matters: corners, then single-axis out-of-range, then interior.
static uint8_t warp_interpolate(const uint8_t *const ref, int x, int y,
                                int width, int height, int stride) {
  const int ix = x >> WARPEDPIXEL_PREC_BITS;
  const int iy = y >> WARPEDPIXEL_PREC_BITS;
  const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));  // fractional x
  const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));  // fractional y
  int32_t v;

  // Off both axes: replicate the nearest corner pixel.
  if (ix < 0 && iy < 0)
    return ref[0];
  else if (ix < 0 && iy >= height - 1)
    return ref[(height - 1) * stride];
  else if (ix >= width - 1 && iy < 0)
    return ref[width - 1];
  else if (ix >= width - 1 && iy >= height - 1)
    return ref[(height - 1) * stride + (width - 1)];
  else if (ix < 0) {
    // Left of the image: interpolate vertically along column 0.
    v = ROUND_POWER_OF_TWO_SIGNED(
        ref[iy * stride] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
            ref[(iy + 1) * stride] * sy,
        WARPEDPIXEL_PREC_BITS);
    return clip_pixel(v);
  } else if (iy < 0) {
    // Above the image: interpolate horizontally along row 0.
    v = ROUND_POWER_OF_TWO_SIGNED(
        ref[ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) + ref[ix + 1] * sx,
        WARPEDPIXEL_PREC_BITS);
    return clip_pixel(v);
  } else if (ix >= width - 1) {
    // Right of the image: interpolate vertically along the last column.
    v = ROUND_POWER_OF_TWO_SIGNED(
        ref[iy * stride + width - 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
            ref[(iy + 1) * stride + width - 1] * sy,
        WARPEDPIXEL_PREC_BITS);
    return clip_pixel(v);
  } else if (iy >= height - 1) {
    // Below the image: interpolate horizontally along the last row.
    v = ROUND_POWER_OF_TWO_SIGNED(
        ref[(height - 1) * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) +
            ref[(height - 1) * stride + ix + 1] * sx,
        WARPEDPIXEL_PREC_BITS);
    return clip_pixel(v);
  } else if (ix >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
             iy >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
             ix < width - WARPEDPIXEL_FILTER_TAPS / 2 &&
             iy < height - WARPEDPIXEL_FILTER_TAPS / 2) {
    // Full 6-tap support available on all sides.
    return bi_ntap_filter(ref, x, y, stride);
  } else if (ix >= 1 && iy >= 1 && ix < width - 2 && iy < height - 2) {
    // Only 4x4 support fits: bicubic.
    return bi_cubic_filter(ref, x, y, stride);
  } else {
    // Only the immediate 2x2 neighborhood fits: bilinear.
    return bi_linear_filter(ref, x, y, stride);
  }
}
482
// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
// at a time. The zoom/rotation/shear in the model are applied to the
// "fractional" position of each pixel, which therefore varies within
// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
// We need an extra 2 taps to fit this in, for a total of 8 taps.
// Each 8-tap row sums to 128 (unity DC gain); the final row duplicates
// the last real entry so an index one past the range is safe.
/* clang-format off */
const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
#if WARPEDPIXEL_PREC_BITS == 6
  // [-1, 0)
  { 0,   0, 127,   1,   0, 0, 0, 0 }, { 0, - 1, 127,   2,   0, 0, 0, 0 },
  { 1, - 3, 127,   4, - 1, 0, 0, 0 }, { 1, - 4, 126,   6, - 2, 1, 0, 0 },
  { 1, - 5, 126,   8, - 3, 1, 0, 0 }, { 1, - 6, 125,  11, - 4, 1, 0, 0 },
  { 1, - 7, 124,  13, - 4, 1, 0, 0 }, { 2, - 8, 123,  15, - 5, 1, 0, 0 },
  { 2, - 9, 122,  18, - 6, 1, 0, 0 }, { 2, -10, 121,  20, - 6, 1, 0, 0 },
  { 2, -11, 120,  22, - 7, 2, 0, 0 }, { 2, -12, 119,  25, - 8, 2, 0, 0 },
  { 3, -13, 117,  27, - 8, 2, 0, 0 }, { 3, -13, 116,  29, - 9, 2, 0, 0 },
  { 3, -14, 114,  32, -10, 3, 0, 0 }, { 3, -15, 113,  35, -10, 2, 0, 0 },
  { 3, -15, 111,  37, -11, 3, 0, 0 }, { 3, -16, 109,  40, -11, 3, 0, 0 },
  { 3, -16, 108,  42, -12, 3, 0, 0 }, { 4, -17, 106,  45, -13, 3, 0, 0 },
  { 4, -17, 104,  47, -13, 3, 0, 0 }, { 4, -17, 102,  50, -14, 3, 0, 0 },
  { 4, -17, 100,  52, -14, 3, 0, 0 }, { 4, -18,  98,  55, -15, 4, 0, 0 },
  { 4, -18,  96,  58, -15, 3, 0, 0 }, { 4, -18,  94,  60, -16, 4, 0, 0 },
  { 4, -18,  91,  63, -16, 4, 0, 0 }, { 4, -18,  89,  65, -16, 4, 0, 0 },
  { 4, -18,  87,  68, -17, 4, 0, 0 }, { 4, -18,  85,  70, -17, 4, 0, 0 },
  { 4, -18,  82,  73, -17, 4, 0, 0 }, { 4, -18,  80,  75, -17, 4, 0, 0 },
  { 4, -18,  78,  78, -18, 4, 0, 0 }, { 4, -17,  75,  80, -18, 4, 0, 0 },
  { 4, -17,  73,  82, -18, 4, 0, 0 }, { 4, -17,  70,  85, -18, 4, 0, 0 },
  { 4, -17,  68,  87, -18, 4, 0, 0 }, { 4, -16,  65,  89, -18, 4, 0, 0 },
  { 4, -16,  63,  91, -18, 4, 0, 0 }, { 4, -16,  60,  94, -18, 4, 0, 0 },
  { 3, -15,  58,  96, -18, 4, 0, 0 }, { 4, -15,  55,  98, -18, 4, 0, 0 },
  { 3, -14,  52, 100, -17, 4, 0, 0 }, { 3, -14,  50, 102, -17, 4, 0, 0 },
  { 3, -13,  47, 104, -17, 4, 0, 0 }, { 3, -13,  45, 106, -17, 4, 0, 0 },
  { 3, -12,  42, 108, -16, 3, 0, 0 }, { 3, -11,  40, 109, -16, 3, 0, 0 },
  { 3, -11,  37, 111, -15, 3, 0, 0 }, { 2, -10,  35, 113, -15, 3, 0, 0 },
  { 3, -10,  32, 114, -14, 3, 0, 0 }, { 2, - 9,  29, 116, -13, 3, 0, 0 },
  { 2, - 8,  27, 117, -13, 3, 0, 0 }, { 2, - 8,  25, 119, -12, 2, 0, 0 },
  { 2, - 7,  22, 120, -11, 2, 0, 0 }, { 1, - 6,  20, 121, -10, 2, 0, 0 },
  { 1, - 6,  18, 122, - 9, 2, 0, 0 }, { 1, - 5,  15, 123, - 8, 2, 0, 0 },
  { 1, - 4,  13, 124, - 7, 1, 0, 0 }, { 1, - 4,  11, 125, - 6, 1, 0, 0 },
  { 1, - 3,   8, 126, - 5, 1, 0, 0 }, { 1, - 2,   6, 126, - 4, 1, 0, 0 },
  { 0, - 1,   4, 127, - 3, 1, 0, 0 }, { 0,   0,   2, 127, - 1, 0, 0, 0 },

  // [0, 1)
  { 0,  0,   0, 127,   1,   0,  0,  0}, { 0,  0,  -1, 127,   2,   0,  0,  0},
  { 0,  1,  -3, 127,   4,  -2,  1,  0}, { 0,  1,  -5, 127,   6,  -2,  1,  0},
  { 0,  2,  -6, 126,   8,  -3,  1,  0}, {-1,  2,  -7, 126,  11,  -4,  2, -1},
  {-1,  3,  -8, 125,  13,  -5,  2, -1}, {-1,  3, -10, 124,  16,  -6,  3, -1},
  {-1,  4, -11, 123,  18,  -7,  3, -1}, {-1,  4, -12, 122,  20,  -7,  3, -1},
  {-1,  4, -13, 121,  23,  -8,  3, -1}, {-2,  5, -14, 120,  25,  -9,  4, -1},
  {-1,  5, -15, 119,  27, -10,  4, -1}, {-1,  5, -16, 118,  30, -11,  4, -1},
  {-2,  6, -17, 116,  33, -12,  5, -1}, {-2,  6, -17, 114,  35, -12,  5, -1},
  {-2,  6, -18, 113,  38, -13,  5, -1}, {-2,  7, -19, 111,  41, -14,  6, -2},
  {-2,  7, -19, 110,  43, -15,  6, -2}, {-2,  7, -20, 108,  46, -15,  6, -2},
  {-2,  7, -20, 106,  49, -16,  6, -2}, {-2,  7, -21, 104,  51, -16,  7, -2},
  {-2,  7, -21, 102,  54, -17,  7, -2}, {-2,  8, -21, 100,  56, -18,  7, -2},
  {-2,  8, -22,  98,  59, -18,  7, -2}, {-2,  8, -22,  96,  62, -19,  7, -2},
  {-2,  8, -22,  94,  64, -19,  7, -2}, {-2,  8, -22,  91,  67, -20,  8, -2},
  {-2,  8, -22,  89,  69, -20,  8, -2}, {-2,  8, -22,  87,  72, -21,  8, -2},
  {-2,  8, -21,  84,  74, -21,  8, -2}, {-2,  8, -22,  82,  77, -21,  8, -2},
  {-2,  8, -21,  79,  79, -21,  8, -2}, {-2,  8, -21,  77,  82, -22,  8, -2},
  {-2,  8, -21,  74,  84, -21,  8, -2}, {-2,  8, -21,  72,  87, -22,  8, -2},
  {-2,  8, -20,  69,  89, -22,  8, -2}, {-2,  8, -20,  67,  91, -22,  8, -2},
  {-2,  7, -19,  64,  94, -22,  8, -2}, {-2,  7, -19,  62,  96, -22,  8, -2},
  {-2,  7, -18,  59,  98, -22,  8, -2}, {-2,  7, -18,  56, 100, -21,  8, -2},
  {-2,  7, -17,  54, 102, -21,  7, -2}, {-2,  7, -16,  51, 104, -21,  7, -2},
  {-2,  6, -16,  49, 106, -20,  7, -2}, {-2,  6, -15,  46, 108, -20,  7, -2},
  {-2,  6, -15,  43, 110, -19,  7, -2}, {-2,  6, -14,  41, 111, -19,  7, -2},
  {-1,  5, -13,  38, 113, -18,  6, -2}, {-1,  5, -12,  35, 114, -17,  6, -2},
  {-1,  5, -12,  33, 116, -17,  6, -2}, {-1,  4, -11,  30, 118, -16,  5, -1},
  {-1,  4, -10,  27, 119, -15,  5, -1}, {-1,  4,  -9,  25, 120, -14,  5, -2},
  {-1,  3,  -8,  23, 121, -13,  4, -1}, {-1,  3,  -7,  20, 122, -12,  4, -1},
  {-1,  3,  -7,  18, 123, -11,  4, -1}, {-1,  3,  -6,  16, 124, -10,  3, -1},
  {-1,  2,  -5,  13, 125,  -8,  3, -1}, {-1,  2,  -4,  11, 126,  -7,  2, -1},
  { 0,  1,  -3,   8, 126,  -6,  2,  0}, { 0,  1,  -2,   6, 127,  -5,  1,  0},
  { 0,  1,  -2,   4, 127,  -3,  1,  0}, { 0,  0,   0,   2, 127,  -1,  0,  0},

  // [1, 2)
  { 0, 0, 0,   1, 127,   0,   0, 0 }, { 0, 0, 0, - 1, 127,   2,   0, 0 },
  { 0, 0, 1, - 3, 127,   4, - 1, 0 }, { 0, 0, 1, - 4, 126,   6, - 2, 1 },
  { 0, 0, 1, - 5, 126,   8, - 3, 1 }, { 0, 0, 1, - 6, 125,  11, - 4, 1 },
  { 0, 0, 1, - 7, 124,  13, - 4, 1 }, { 0, 0, 2, - 8, 123,  15, - 5, 1 },
  { 0, 0, 2, - 9, 122,  18, - 6, 1 }, { 0, 0, 2, -10, 121,  20, - 6, 1 },
  { 0, 0, 2, -11, 120,  22, - 7, 2 }, { 0, 0, 2, -12, 119,  25, - 8, 2 },
  { 0, 0, 3, -13, 117,  27, - 8, 2 }, { 0, 0, 3, -13, 116,  29, - 9, 2 },
  { 0, 0, 3, -14, 114,  32, -10, 3 }, { 0, 0, 3, -15, 113,  35, -10, 2 },
  { 0, 0, 3, -15, 111,  37, -11, 3 }, { 0, 0, 3, -16, 109,  40, -11, 3 },
  { 0, 0, 3, -16, 108,  42, -12, 3 }, { 0, 0, 4, -17, 106,  45, -13, 3 },
  { 0, 0, 4, -17, 104,  47, -13, 3 }, { 0, 0, 4, -17, 102,  50, -14, 3 },
  { 0, 0, 4, -17, 100,  52, -14, 3 }, { 0, 0, 4, -18,  98,  55, -15, 4 },
  { 0, 0, 4, -18,  96,  58, -15, 3 }, { 0, 0, 4, -18,  94,  60, -16, 4 },
  { 0, 0, 4, -18,  91,  63, -16, 4 }, { 0, 0, 4, -18,  89,  65, -16, 4 },
  { 0, 0, 4, -18,  87,  68, -17, 4 }, { 0, 0, 4, -18,  85,  70, -17, 4 },
  { 0, 0, 4, -18,  82,  73, -17, 4 }, { 0, 0, 4, -18,  80,  75, -17, 4 },
  { 0, 0, 4, -18,  78,  78, -18, 4 }, { 0, 0, 4, -17,  75,  80, -18, 4 },
  { 0, 0, 4, -17,  73,  82, -18, 4 }, { 0, 0, 4, -17,  70,  85, -18, 4 },
  { 0, 0, 4, -17,  68,  87, -18, 4 }, { 0, 0, 4, -16,  65,  89, -18, 4 },
  { 0, 0, 4, -16,  63,  91, -18, 4 }, { 0, 0, 4, -16,  60,  94, -18, 4 },
  { 0, 0, 3, -15,  58,  96, -18, 4 }, { 0, 0, 4, -15,  55,  98, -18, 4 },
  { 0, 0, 3, -14,  52, 100, -17, 4 }, { 0, 0, 3, -14,  50, 102, -17, 4 },
  { 0, 0, 3, -13,  47, 104, -17, 4 }, { 0, 0, 3, -13,  45, 106, -17, 4 },
  { 0, 0, 3, -12,  42, 108, -16, 3 }, { 0, 0, 3, -11,  40, 109, -16, 3 },
  { 0, 0, 3, -11,  37, 111, -15, 3 }, { 0, 0, 2, -10,  35, 113, -15, 3 },
  { 0, 0, 3, -10,  32, 114, -14, 3 }, { 0, 0, 2, - 9,  29, 116, -13, 3 },
  { 0, 0, 2, - 8,  27, 117, -13, 3 }, { 0, 0, 2, - 8,  25, 119, -12, 2 },
  { 0, 0, 2, - 7,  22, 120, -11, 2 }, { 0, 0, 1, - 6,  20, 121, -10, 2 },
  { 0, 0, 1, - 6,  18, 122, - 9, 2 }, { 0, 0, 1, - 5,  15, 123, - 8, 2 },
  { 0, 0, 1, - 4,  13, 124, - 7, 1 }, { 0, 0, 1, - 4,  11, 125, - 6, 1 },
  { 0, 0, 1, - 3,   8, 126, - 5, 1 }, { 0, 0, 1, - 2,   6, 126, - 4, 1 },
  { 0, 0, 0, - 1,   4, 127, - 3, 1 }, { 0, 0, 0,   0,   2, 127, - 1, 0 },
  // dummy (replicate row index 191)
  { 0, 0, 0,   0,   2, 127, - 1, 0 },

#elif WARPEDPIXEL_PREC_BITS == 5
  // [-1, 0)
  {0,   0, 127,   1,   0, 0, 0, 0}, {1,  -3, 127,   4,  -1, 0, 0, 0},
  {1,  -5, 126,   8,  -3, 1, 0, 0}, {1,  -7, 124,  13,  -4, 1, 0, 0},
  {2,  -9, 122,  18,  -6, 1, 0, 0}, {2, -11, 120,  22,  -7, 2, 0, 0},
  {3, -13, 117,  27,  -8, 2, 0, 0}, {3, -14, 114,  32, -10, 3, 0, 0},
  {3, -15, 111,  37, -11, 3, 0, 0}, {3, -16, 108,  42, -12, 3, 0, 0},
  {4, -17, 104,  47, -13, 3, 0, 0}, {4, -17, 100,  52, -14, 3, 0, 0},
  {4, -18,  96,  58, -15, 3, 0, 0}, {4, -18,  91,  63, -16, 4, 0, 0},
  {4, -18,  87,  68, -17, 4, 0, 0}, {4, -18,  82,  73, -17, 4, 0, 0},
  {4, -18,  78,  78, -18, 4, 0, 0}, {4, -17,  73,  82, -18, 4, 0, 0},
  {4, -17,  68,  87, -18, 4, 0, 0}, {4, -16,  63,  91, -18, 4, 0, 0},
  {3, -15,  58,  96, -18, 4, 0, 0}, {3, -14,  52, 100, -17, 4, 0, 0},
  {3, -13,  47, 104, -17, 4, 0, 0}, {3, -12,  42, 108, -16, 3, 0, 0},
  {3, -11,  37, 111, -15, 3, 0, 0}, {3, -10,  32, 114, -14, 3, 0, 0},
  {2,  -8,  27, 117, -13, 3, 0, 0}, {2,  -7,  22, 120, -11, 2, 0, 0},
  {1,  -6,  18, 122,  -9, 2, 0, 0}, {1,  -4,  13, 124,  -7, 1, 0, 0},
  {1,  -3,   8, 126,  -5, 1, 0, 0}, {0,  -1,   4, 127,  -3, 1, 0, 0},
  // [0, 1)
  { 0,  0,   0, 127,   1,   0,  0,  0}, { 0,  1,  -3, 127,   4,  -2,  1,  0},
  { 0,  2,  -6, 126,   8,  -3,  1,  0}, {-1,  3,  -8, 125,  13,  -5,  2, -1},
  {-1,  4, -11, 123,  18,  -7,  3, -1}, {-1,  4, -13, 121,  23,  -8,  3, -1},
  {-1,  5, -15, 119,  27, -10,  4, -1}, {-2,  6, -17, 116,  33, -12,  5, -1},
  {-2,  6, -18, 113,  38, -13,  5, -1}, {-2,  7, -19, 110,  43, -15,  6, -2},
  {-2,  7, -20, 106,  49, -16,  6, -2}, {-2,  7, -21, 102,  54, -17,  7, -2},
  {-2,  8, -22,  98,  59, -18,  7, -2}, {-2,  8, -22,  94,  64, -19,  7, -2},
  {-2,  8, -22,  89,  69, -20,  8, -2}, {-2,  8, -21,  84,  74, -21,  8, -2},
  {-2,  8, -21,  79,  79, -21,  8, -2}, {-2,  8, -21,  74,  84, -21,  8, -2},
  {-2,  8, -20,  69,  89, -22,  8, -2}, {-2,  7, -19,  64,  94, -22,  8, -2},
  {-2,  7, -18,  59,  98, -22,  8, -2}, {-2,  7, -17,  54, 102, -21,  7, -2},
  {-2,  6, -16,  49, 106, -20,  7, -2}, {-2,  6, -15,  43, 110, -19,  7, -2},
  {-1,  5, -13,  38, 113, -18,  6, -2}, {-1,  5, -12,  33, 116, -17,  6, -2},
  {-1,  4, -10,  27, 119, -15,  5, -1}, {-1,  3,  -8,  23, 121, -13,  4, -1},
  {-1,  3,  -7,  18, 123, -11,  4, -1}, {-1,  2,  -5,  13, 125,  -8,  3, -1},
  { 0,  1,  -3,   8, 126,  -6,  2,  0}, { 0,  1,  -2,   4, 127,  -3,  1,  0},
  // [1, 2)
  {0, 0, 0,   1, 127,   0,   0, 0}, {0, 0, 1,  -3, 127,   4,  -1, 0},
  {0, 0, 1,  -5, 126,   8,  -3, 1}, {0, 0, 1,  -7, 124,  13,  -4, 1},
  {0, 0, 2,  -9, 122,  18,  -6, 1}, {0, 0, 2, -11, 120,  22,  -7, 2},
  {0, 0, 3, -13, 117,  27,  -8, 2}, {0, 0, 3, -14, 114,  32, -10, 3},
  {0, 0, 3, -15, 111,  37, -11, 3}, {0, 0, 3, -16, 108,  42, -12, 3},
  {0, 0, 4, -17, 104,  47, -13, 3}, {0, 0, 4, -17, 100,  52, -14, 3},
  {0, 0, 4, -18,  96,  58, -15, 3}, {0, 0, 4, -18,  91,  63, -16, 4},
  {0, 0, 4, -18,  87,  68, -17, 4}, {0, 0, 4, -18,  82,  73, -17, 4},
  {0, 0, 4, -18,  78,  78, -18, 4}, {0, 0, 4, -17,  73,  82, -18, 4},
  {0, 0, 4, -17,  68,  87, -18, 4}, {0, 0, 4, -16,  63,  91, -18, 4},
  {0, 0, 3, -15,  58,  96, -18, 4}, {0, 0, 3, -14,  52, 100, -17, 4},
  {0, 0, 3, -13,  47, 104, -17, 4}, {0, 0, 3, -12,  42, 108, -16, 3},
  {0, 0, 3, -11,  37, 111, -15, 3}, {0, 0, 3, -10,  32, 114, -14, 3},
  {0, 0, 2,  -8,  27, 117, -13, 3}, {0, 0, 2,  -7,  22, 120, -11, 2},
  {0, 0, 1,  -6,  18, 122,  -9, 2}, {0, 0, 1,  -4,  13, 124,  -7, 1},
  {0, 0, 1,  -3,   8, 126,  -5, 1}, {0, 0, 0,  -1,   4, 127,  -3, 1},
  // dummy (replicate row index 95)
  {0, 0, 0,  -1,   4, 127,  -3, 1},

#endif  // WARPEDPIXEL_PREC_BITS == 6
};

/* clang-format on */
654
655 #define DIV_LUT_PREC_BITS 14
656 #define DIV_LUT_BITS 8
657 #define DIV_LUT_NUM (1 << DIV_LUT_BITS)
658
// Fixed-point reciprocal table used by resolve_divisor_32/resolve_divisor_64.
// div_lut[f] ~= 2^DIV_LUT_PREC_BITS / (1 + f/DIV_LUT_NUM) for f in
// [0, DIV_LUT_NUM], so div_lut[0] = 16384 (= 2^14) and
// div_lut[DIV_LUT_NUM] = 8192 (= 2^13).
static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
  16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
  15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
  15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
  14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
  13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
  13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
  13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
  12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
  12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
  11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
  11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
  11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
  10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
  10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
  10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
  9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
  9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,
  9468,  9447,  9425,  9404,  9383,  9362,  9341,  9321,  9300,  9279,  9259,
  9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,  9039,
  9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,
  8812,  8793,  8775,  8756,  8738,  8720,  8702,  8684,  8666,  8648,  8630,
  8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,  8456,  8439,
  8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,
  8240,  8224,  8208,  8192,
};
685
686 #if CONFIG_WARPED_MOTION
687 // Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
688 // at precision of DIV_LUT_PREC_BITS along with the shift.
resolve_divisor_64(uint64_t D,int16_t * shift)689 static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
690 int64_t e, f;
691 *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
692 : get_msb((unsigned int)D));
693 // e is obtained from D after resetting the most significant 1 bit.
694 e = D - ((uint64_t)1 << *shift);
695 // Get the most significant DIV_LUT_BITS (8) bits of e into f
696 if (*shift > DIV_LUT_BITS)
697 f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
698 else
699 f = e << (DIV_LUT_BITS - *shift);
700 assert(f <= DIV_LUT_NUM);
701 *shift += DIV_LUT_PREC_BITS;
702 // Use f as lookup into the precomputed table of multipliers
703 return div_lut[f];
704 }
705 #endif // CONFIG_WARPED_MOTION
706
resolve_divisor_32(uint32_t D,int16_t * shift)707 static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
708 int32_t e, f;
709 *shift = get_msb(D);
710 // e is obtained from D after resetting the most significant 1 bit.
711 e = D - ((uint32_t)1 << *shift);
712 // Get the most significant DIV_LUT_BITS (8) bits of e into f
713 if (*shift > DIV_LUT_BITS)
714 f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
715 else
716 f = e << (DIV_LUT_BITS - *shift);
717 assert(f <= DIV_LUT_NUM);
718 *shift += DIV_LUT_PREC_BITS;
719 // Use f as lookup into the precomputed table of multipliers
720 return div_lut[f];
721 }
722
is_affine_valid(const WarpedMotionParams * const wm)723 static int is_affine_valid(const WarpedMotionParams *const wm) {
724 const int32_t *mat = wm->wmmat;
725 return (mat[2] > 0);
726 }
727
is_affine_shear_allowed(int16_t alpha,int16_t beta,int16_t gamma,int16_t delta)728 static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
729 int16_t delta) {
730 if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
731 (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
732 return 0;
733 else
734 return 1;
735 }
736
737 // Returns 1 on success or 0 on an invalid affine set
// Derives the shear parameters (alpha, beta, gamma, delta) from the affine
// matrix in wm->wmmat, storing them in wm. Returns 1 on success or 0 if the
// matrix is invalid or the resulting shears are out of filterable range.
int get_shear_params(WarpedMotionParams *wm) {
  const int32_t *mat = wm->wmmat;
  if (!is_affine_valid(wm)) return 0;

  // alpha and beta come directly from the top row of the matrix.
  wm->alpha =
      clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
  wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);

  // gamma and delta require division by mat[2]; use the fixed-point
  // reciprocal y/2^shift in place of a true divide.
  int16_t shift;
  const int16_t y =
      resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);

  const int64_t vg = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
  wm->gamma =
      clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(vg, shift), INT16_MIN, INT16_MAX);
  const int64_t vd = ((int64_t)mat[3] * mat[4]) * y;
  wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(vd, shift) -
                        (1 << WARPEDMODEL_PREC_BITS),
                    INT16_MIN, INT16_MAX);

  if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
    return 0;

  // Quantize each shear parameter to WARP_PARAM_REDUCE_BITS precision.
  wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
             (1 << WARP_PARAM_REDUCE_BITS);
  wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  return 1;
}
767
768 #if CONFIG_HIGHBITDEPTH
highbd_get_subcolumn(int taps,const uint16_t * const ref,int32_t * col,int stride,int x,int y_start)769 static INLINE void highbd_get_subcolumn(int taps, const uint16_t *const ref,
770 int32_t *col, int stride, int x,
771 int y_start) {
772 int i;
773 for (i = 0; i < taps; ++i) {
774 col[i] = ref[(i + y_start) * stride + x];
775 }
776 }
777
highbd_bi_ntap_filter(const uint16_t * const ref,int x,int y,int stride,int bd)778 static uint16_t highbd_bi_ntap_filter(const uint16_t *const ref, int x, int y,
779 int stride, int bd) {
780 int32_t val, arr[WARPEDPIXEL_FILTER_TAPS];
781 int k;
782 const int i = (int)x >> WARPEDPIXEL_PREC_BITS;
783 const int j = (int)y >> WARPEDPIXEL_PREC_BITS;
784 for (k = 0; k < WARPEDPIXEL_FILTER_TAPS; ++k) {
785 int32_t arr_temp[WARPEDPIXEL_FILTER_TAPS];
786 highbd_get_subcolumn(WARPEDPIXEL_FILTER_TAPS, ref, arr_temp, stride,
787 i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2,
788 j + 1 - WARPEDPIXEL_FILTER_TAPS / 2);
789 arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
790 y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
791 }
792 val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
793 x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
794 val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
795 return (uint16_t)clip_pixel_highbd(val, bd);
796 }
797
highbd_bi_cubic_filter(const uint16_t * const ref,int x,int y,int stride,int bd)798 static uint16_t highbd_bi_cubic_filter(const uint16_t *const ref, int x, int y,
799 int stride, int bd) {
800 int32_t val, arr[4];
801 int k;
802 const int i = (int)x >> WARPEDPIXEL_PREC_BITS;
803 const int j = (int)y >> WARPEDPIXEL_PREC_BITS;
804 for (k = 0; k < 4; ++k) {
805 int32_t arr_temp[4];
806 highbd_get_subcolumn(4, ref, arr_temp, stride, i + k - 1, j - 1);
807 arr[k] =
808 do_cubic_filter(arr_temp + 1, y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
809 }
810 val = do_cubic_filter(arr + 1, x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
811 val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
812 return (uint16_t)clip_pixel_highbd(val, bd);
813 }
814
highbd_bi_linear_filter(const uint16_t * const ref,int x,int y,int stride,int bd)815 static uint16_t highbd_bi_linear_filter(const uint16_t *const ref, int x, int y,
816 int stride, int bd) {
817 const int ix = x >> WARPEDPIXEL_PREC_BITS;
818 const int iy = y >> WARPEDPIXEL_PREC_BITS;
819 const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
820 const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
821 int32_t val;
822 val = ROUND_POWER_OF_TWO_SIGNED(
823 ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) *
824 (WARPEDPIXEL_PREC_SHIFTS - sx) +
825 ref[iy * stride + ix + 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) * sx +
826 ref[(iy + 1) * stride + ix] * sy * (WARPEDPIXEL_PREC_SHIFTS - sx) +
827 ref[(iy + 1) * stride + ix + 1] * sy * sx,
828 WARPEDPIXEL_PREC_BITS * 2);
829 return (uint16_t)clip_pixel_highbd(val, bd);
830 }
831
highbd_warp_interpolate(const uint16_t * const ref,int x,int y,int width,int height,int stride,int bd)832 static uint16_t highbd_warp_interpolate(const uint16_t *const ref, int x, int y,
833 int width, int height, int stride,
834 int bd) {
835 const int ix = x >> WARPEDPIXEL_PREC_BITS;
836 const int iy = y >> WARPEDPIXEL_PREC_BITS;
837 const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
838 const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
839 int32_t v;
840
841 if (ix < 0 && iy < 0)
842 return ref[0];
843 else if (ix < 0 && iy > height - 1)
844 return ref[(height - 1) * stride];
845 else if (ix > width - 1 && iy < 0)
846 return ref[width - 1];
847 else if (ix > width - 1 && iy > height - 1)
848 return ref[(height - 1) * stride + (width - 1)];
849 else if (ix < 0) {
850 v = ROUND_POWER_OF_TWO_SIGNED(
851 ref[iy * stride] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
852 ref[(iy + 1) * stride] * sy,
853 WARPEDPIXEL_PREC_BITS);
854 return clip_pixel_highbd(v, bd);
855 } else if (iy < 0) {
856 v = ROUND_POWER_OF_TWO_SIGNED(
857 ref[ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) + ref[ix + 1] * sx,
858 WARPEDPIXEL_PREC_BITS);
859 return clip_pixel_highbd(v, bd);
860 } else if (ix > width - 1) {
861 v = ROUND_POWER_OF_TWO_SIGNED(
862 ref[iy * stride + width - 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
863 ref[(iy + 1) * stride + width - 1] * sy,
864 WARPEDPIXEL_PREC_BITS);
865 return clip_pixel_highbd(v, bd);
866 } else if (iy > height - 1) {
867 v = ROUND_POWER_OF_TWO_SIGNED(
868 ref[(height - 1) * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) +
869 ref[(height - 1) * stride + ix + 1] * sx,
870 WARPEDPIXEL_PREC_BITS);
871 return clip_pixel_highbd(v, bd);
872 } else if (ix >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
873 iy >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
874 ix < width - WARPEDPIXEL_FILTER_TAPS / 2 &&
875 iy < height - WARPEDPIXEL_FILTER_TAPS / 2) {
876 return highbd_bi_ntap_filter(ref, x, y, stride, bd);
877 } else if (ix >= 1 && iy >= 1 && ix < width - 2 && iy < height - 2) {
878 return highbd_bi_cubic_filter(ref, x, y, stride, bd);
879 } else {
880 return highbd_bi_linear_filter(ref, x, y, stride, bd);
881 }
882 }
883
highbd_error_measure(int err,int bd)884 static INLINE int highbd_error_measure(int err, int bd) {
885 const int b = bd - 8;
886 const int bmask = (1 << b) - 1;
887 const int v = (1 << b);
888 int e1, e2;
889 err = abs(err);
890 e1 = err >> b;
891 e2 = err & bmask;
892 return error_measure_lut[255 + e1] * (v - e2) +
893 error_measure_lut[256 + e1] * e2;
894 }
895
// Fallback high-bitdepth warp: projects every destination pixel through the
// motion model and interpolates the reference individually. Used when the
// fast block-based filter cannot be applied.
static void highbd_warp_plane_old(const WarpedMotionParams *const wm,
                                  const uint8_t *const ref8, int width,
                                  int height, int stride,
                                  const uint8_t *const pred8, int p_col,
                                  int p_row, int p_width, int p_height,
                                  int p_stride, int subsampling_x,
                                  int subsampling_y, int x_scale, int y_scale,
                                  int bd, ConvolveParams *conv_params) {
  const ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype);
  uint16_t *const pred = CONVERT_TO_SHORTPTR(pred8);
  const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
  if (projectpoints == NULL) return;
  for (int i = p_row; i < p_row + p_height; ++i) {
    for (int j = p_col; j < p_col + p_width; ++j) {
      int in[2] = { j, i };
      int out[2];
      projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
      // Apply the frame scale factors to the projected position.
      out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, SCALE_SUBPEL_BITS);
      out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, SCALE_SUBPEL_BITS);
      const uint16_t px = highbd_warp_interpolate(ref, out[0], out[1], width,
                                                  height, stride, bd);
      uint16_t *const dst = &pred[(j - p_col) + (i - p_row) * p_stride];
      // Either average into or overwrite the prediction buffer.
      *dst = conv_params->do_average ? ROUND_POWER_OF_TWO(*dst + px, 1) : px;
    }
  }
}
929
930 /* Note: For an explanation of the warp algorithm, and some notes on bit widths
931 for hardware implementations, see the comments above av1_warp_affine_c
932 */
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
                              int width, int height, int stride, uint16_t *pred,
                              int p_col, int p_row, int p_width, int p_height,
                              int p_stride, int subsampling_x,
                              int subsampling_y, int bd,
                              ConvolveParams *conv_params, int16_t alpha,
                              int16_t beta, int16_t gamma, int16_t delta) {
  // Intermediate buffer: 15 rows (8 output rows + 7 rows of vertical filter
  // support) of 8 horizontally-filtered samples each.
  int32_t tmp[15 * 8];
  int i, j, k, l, m;
#if CONFIG_CONVOLVE_ROUND
  // Under the convolve-round experiment, rounding amounts and offsets come
  // from conv_params instead of the fixed HORSHEAR/VERSHEAR values.
  const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
  const int reduce_bits_horiz =
      use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
  const int max_bits_horiz =
      use_conv_params
          ? bd + FILTER_BITS + 1 - conv_params->round_0
          : bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
  const int offset_bits_horiz =
      use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
  const int offset_bits_vert =
      use_conv_params
          ? bd + 2 * FILTER_BITS - conv_params->round_0
          : bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
  if (use_conv_params) {
    conv_params->do_post_rounding = 1;
  }
  assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
#else
  const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
  const int max_bits_horiz =
      bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
  const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
  const int offset_bits_vert =
      bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
#endif
  (void)max_bits_horiz;  // referenced only by the asserts below

  for (i = p_row; i < p_row + p_height; i += 8) {
    for (j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      // Split into integer position (ix4, iy4) and fractional offset
      // (sx4, sy4) at WARPEDMODEL precision.
      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Move the fractional offsets from the block center to its top-left
      // corner (position (-4, -4) relative to the center) ...
      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      // ... and quantize them to WARP_PARAM_REDUCE_BITS precision.
      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (k = -7; k < 8; ++k) {
        // Clamp to top/bottom edge of the frame
        int iy = iy4 + k;
        if (iy < 0)
          iy = 0;
        else if (iy > height - 1)
          iy = height - 1;

        int sx = sx4 + beta * (k + 4);
        for (l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          // At this point, sx = sx4 + alpha * l + beta * k
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = warped_filter[offs];

          // Offset so the accumulator stays non-negative (see the bit-width
          // notes above av1_warp_affine_c).
          int32_t sum = 1 << offset_bits_horiz;
          for (m = 0; m < 8; ++m) {
            // Clamp to left/right edge of the frame
            int sample_x = ix + m;
            if (sample_x < 0)
              sample_x = 0;
            else if (sample_x > width - 1)
              sample_x = width - 1;
            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter
      for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          // At this point, sy = sy4 + gamma * l + delta * k
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }
#if CONFIG_CONVOLVE_ROUND
          if (use_conv_params) {
            // No-round path: write the rounded sum, with both filter-stage
            // offsets removed, into the CONV_BUF_TYPE destination.
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
                  (1 << (offset_bits_horiz + FILTER_BITS -
                         conv_params->round_0 - conv_params->round_1)) -
                  (1 << (offset_bits_vert - conv_params->round_1));
            if (conv_params->do_average)
              *p += sum;
            else
              *p = sum;
          } else {
#else
          {
#endif
            uint16_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            // Undo the two filter-stage offsets (totalling
            // (1 << (bd - 1)) + (1 << bd)) and clamp to the pixel range.
            uint16_t px =
                clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
            if (conv_params->do_average)
              *p = ROUND_POWER_OF_TWO(*p + px, 1);
            else
              *p = px;
          }
          sy += gamma;
        }
      }
    }
  }
}
1074
1075 static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8,
1076 int width, int height, int stride,
1077 const uint8_t *const pred8, int p_col, int p_row,
1078 int p_width, int p_height, int p_stride,
1079 int subsampling_x, int subsampling_y, int x_scale,
1080 int y_scale, int bd,
1081 ConvolveParams *conv_params) {
1082 if (wm->wmtype == ROTZOOM) {
1083 wm->wmmat[5] = wm->wmmat[2];
1084 wm->wmmat[4] = -wm->wmmat[3];
1085 }
1086 if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) &&
1087 x_scale == SCALE_SUBPEL_SHIFTS && y_scale == SCALE_SUBPEL_SHIFTS) {
1088 const int32_t *const mat = wm->wmmat;
1089 const int16_t alpha = wm->alpha;
1090 const int16_t beta = wm->beta;
1091 const int16_t gamma = wm->gamma;
1092 const int16_t delta = wm->delta;
1093
1094 const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
1095 uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
1096 av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
1097 p_width, p_height, p_stride, subsampling_x,
1098 subsampling_y, bd, conv_params, alpha, beta, gamma,
1099 delta);
1100 } else {
1101 highbd_warp_plane_old(wm, ref8, width, height, stride, pred8, p_col, p_row,
1102 p_width, p_height, p_stride, subsampling_x,
1103 subsampling_y, x_scale, y_scale, bd, conv_params);
1104 }
1105 }
1106
// Sums the weighted per-pixel error between dst and ref over a
// p_width x p_height region.
static int64_t highbd_frame_error(const uint16_t *const ref, int stride,
                                  const uint16_t *const dst, int p_width,
                                  int p_height, int p_stride, int bd) {
  int64_t total = 0;
  for (int row = 0; row < p_height; ++row) {
    const uint16_t *const r = ref + row * stride;
    const uint16_t *const d = dst + row * p_stride;
    for (int col = 0; col < p_width; ++col)
      total += highbd_error_measure(d[col] - r[col], bd);
  }
  return total;
}
1119
1120 static int64_t highbd_warp_error(
1121 WarpedMotionParams *wm, const uint8_t *const ref8, int width, int height,
1122 int stride, const uint8_t *const dst8, int p_col, int p_row, int p_width,
1123 int p_height, int p_stride, int subsampling_x, int subsampling_y,
1124 int x_scale, int y_scale, int bd, int64_t best_error) {
1125 int64_t gm_sumerr = 0;
1126 int warp_w, warp_h;
1127 int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
1128 int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
1129 uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
1130
1131 ConvolveParams conv_params = get_conv_params(0, 0, 0);
1132 for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
1133 for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
1134 // avoid warping extra 8x8 blocks in the padded region of the frame
1135 // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
1136 warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
1137 warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
1138 highbd_warp_plane(wm, ref8, width, height, stride,
1139 CONVERT_TO_BYTEPTR(tmp), j, i, warp_w, warp_h,
1140 WARP_ERROR_BLOCK, subsampling_x, subsampling_y, x_scale,
1141 y_scale, bd, &conv_params);
1142
1143 gm_sumerr += highbd_frame_error(
1144 tmp, WARP_ERROR_BLOCK, CONVERT_TO_SHORTPTR(dst8) + j + i * p_stride,
1145 warp_w, warp_h, p_stride, bd);
1146 if (gm_sumerr > best_error) return gm_sumerr;
1147 }
1148 }
1149 return gm_sumerr;
1150 }
1151 #endif // CONFIG_HIGHBITDEPTH
1152
1153 static INLINE int error_measure(int err) {
1154 return error_measure_lut[255 + err];
1155 }
1156
1157 static void warp_plane_old(const WarpedMotionParams *const wm,
1158 const uint8_t *const ref, int width, int height,
1159 int stride, uint8_t *pred, int p_col, int p_row,
1160 int p_width, int p_height, int p_stride,
1161 int subsampling_x, int subsampling_y, int x_scale,
1162 int y_scale, ConvolveParams *conv_params) {
1163 int i, j;
1164 ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype);
1165 if (projectpoints == NULL) return;
1166 for (i = p_row; i < p_row + p_height; ++i) {
1167 for (j = p_col; j < p_col + p_width; ++j) {
1168 int in[2], out[2];
1169 in[0] = j;
1170 in[1] = i;
1171 projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
1172 out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, SCALE_SUBPEL_BITS);
1173 out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, SCALE_SUBPEL_BITS);
1174 if (conv_params->do_average)
1175 pred[(j - p_col) + (i - p_row) * p_stride] = ROUND_POWER_OF_TWO(
1176 pred[(j - p_col) + (i - p_row) * p_stride] +
1177 warp_interpolate(ref, out[0], out[1], width, height, stride),
1178 1);
1179 else
1180 pred[(j - p_col) + (i - p_row) * p_stride] =
1181 warp_interpolate(ref, out[0], out[1], width, height, stride);
1182 }
1183 }
1184 }
1185
1186 /* The warp filter for ROTZOOM and AFFINE models works as follows:
1187 * Split the input into 8x8 blocks
1188 * For each block, project the point (4, 4) within the block, to get the
1189 overall block position. Split into integer and fractional coordinates,
1190 maintaining full WARPEDMODEL precision
1191 * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
1192 variable horizontal offset. This means that, while the rows of the
1193 intermediate buffer align with the rows of the *reference* image, the
1194 columns align with the columns of the *destination* image.
1195 * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
1196 destination is too small we crop the output at this stage). Each pixel has
1197 a variable vertical offset, so that the resulting rows are aligned with
1198 the rows of the destination image.
1199
1200 To accomplish these alignments, we factor the warp matrix as a
1201 product of two shear / asymmetric zoom matrices:
1202 / a b \ = / 1 0 \ * / 1+alpha beta \
1203 \ c d / \ gamma 1+delta / \ 0 1 /
1204 where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
1205 The horizontal shear (with alpha and beta) is applied first,
1206 then the vertical shear (with gamma and delta) is applied second.
1207
1208 The only limitation is that, to fit this in a fixed 8-tap filter size,
1209 the fractional pixel offsets must be at most +-1. Since the horizontal filter
1210 generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
1211 within the block, the parameters must satisfy
1212 4 * |alpha| + 7 * |beta| <= 1 and 4 * |gamma| + 4 * |delta| <= 1
1213 for this filter to be applicable.
1214
1215 Note: This function assumes that the caller has done all of the relevant
1216 checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
1217 are set appropriately (if using a ROTZOOM model), and that alpha, beta,
1218 gamma, delta are all in range.
1219
1220 TODO(david.barker): Maybe support scaled references?
1221 */
1222 /* A note on hardware implementation:
1223 The warp filter is intended to be implementable using the same hardware as
1224 the high-precision convolve filters from the loop-restoration and
1225 convolve-round experiments.
1226
1227 For a single filter stage, considering all of the coefficient sets for the
1228 warp filter and the regular convolution filter, an input in the range
1229 [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
1230 before rounding.
1231
1232 Allowing for some changes to the filter coefficient sets, call the range
1233 [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
1234 we can replace this by the range [0, 256 * 2^k], which can be stored in an
1235 unsigned value with 8 + k bits.
1236
1237 This allows the derivation of the appropriate bit widths and offsets for
1238 the various intermediate values: If
1239
1240 F := WARPEDPIXEL_FILTER_BITS = 7 (or else the above ranges need adjusting)
1241 So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
1242 intermediate value.
1243 H := HORSHEAR_REDUCE_PREC_BITS
1244 V := VERSHEAR_REDUCE_PREC_BITS
1245 (and note that we must have H + V = 2*F for the output to have the same
1246 scale as the input)
1247
1248 then we end up with the following offsets and ranges:
1249 Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
1250 uint{bd + F + 1}
1251 After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
1252 Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
1253 uint{bd + 2*F + 2 - H}
1254 After rounding: The final value, before undoing the offset, fits into a
1255 uint{bd + 2}.
1256
1257 Then we need to undo the offsets before clamping to a pixel. Note that,
1258 if we do this at the end, the amount to subtract is actually independent
1259 of H and V:
1260
1261 offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
1262 (1 << ((bd + 2*F - H) - V))
1263 == (1 << (bd - 1)) + (1 << bd)
1264
1265 This allows us to entirely avoid clamping in both the warp filter and
1266 the convolve-round experiment. As of the time of writing, the Wiener filter
1267 from loop-restoration can encode a central coefficient up to 216, which
1268 leads to a maximum value of about 282 * 2^k after applying the offset.
1269 So in that case we still need to clamp.
1270 */
void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
                       int height, int stride, uint8_t *pred, int p_col,
                       int p_row, int p_width, int p_height, int p_stride,
                       int subsampling_x, int subsampling_y,
                       ConvolveParams *conv_params, int16_t alpha, int16_t beta,
                       int16_t gamma, int16_t delta) {
  // Intermediate buffer: 15 rows (8 output rows + 7 rows of vertical filter
  // support) of 8 horizontally-filtered samples each.
  int32_t tmp[15 * 8];
  int i, j, k, l, m;
  const int bd = 8;  // this path handles 8-bit pixels only
#if CONFIG_CONVOLVE_ROUND
  // Under the convolve-round experiment, rounding amounts and offsets come
  // from conv_params instead of the fixed HORSHEAR/VERSHEAR values.
  const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
  const int reduce_bits_horiz =
      use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
  const int max_bits_horiz =
      use_conv_params
          ? bd + FILTER_BITS + 1 - conv_params->round_0
          : bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
  const int offset_bits_horiz =
      use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
  const int offset_bits_vert =
      use_conv_params
          ? bd + 2 * FILTER_BITS - conv_params->round_0
          : bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
  if (use_conv_params) {
    conv_params->do_post_rounding = 1;
  }
  assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
#else
  const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
  const int max_bits_horiz =
      bd + WARPEDPIXEL_FILTER_BITS + 1 - HORSHEAR_REDUCE_PREC_BITS;
  const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
  const int offset_bits_vert =
      bd + 2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS;
#endif
  (void)max_bits_horiz;  // referenced only by the asserts below

  for (i = p_row; i < p_row + p_height; i += 8) {
    for (j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      // Split into integer position (ix4, iy4) and fractional offset
      // (sx4, sy4) at WARPEDMODEL precision.
      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Move the fractional offsets from the block center to its top-left
      // corner (position (-4, -4) relative to the center) ...
      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      // ... and quantize them to WARP_PARAM_REDUCE_BITS precision.
      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (k = -7; k < 8; ++k) {
        // Clamp to top/bottom edge of the frame
        int iy = iy4 + k;
        if (iy < 0)
          iy = 0;
        else if (iy > height - 1)
          iy = height - 1;

        int sx = sx4 + beta * (k + 4);

        for (l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          // At this point, sx = sx4 + alpha * l + beta * k
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = warped_filter[offs];

          // Offset so the accumulator stays non-negative (see the bit-width
          // notes in the comment block above this function).
          int32_t sum = 1 << offset_bits_horiz;
          for (m = 0; m < 8; ++m) {
            // Clamp to left/right edge of the frame
            int sample_x = ix + m;
            if (sample_x < 0)
              sample_x = 0;
            else if (sample_x > width - 1)
              sample_x = width - 1;

            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter
      for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          // At this point, sy = sy4 + gamma * l + delta * k
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }
#if CONFIG_CONVOLVE_ROUND
          if (use_conv_params) {
            // No-round path: write the rounded sum, with both filter-stage
            // offsets removed, into the CONV_BUF_TYPE destination.
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
                  (1 << (offset_bits_horiz + FILTER_BITS -
                         conv_params->round_0 - conv_params->round_1)) -
                  (1 << (offset_bits_vert - conv_params->round_1));
            if (conv_params->do_average)
              *p += sum;
            else
              *p = sum;
          } else {
#else
          {
#endif
            uint8_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            // Undo the two filter-stage offsets (totalling
            // (1 << (bd - 1)) + (1 << bd)) and clamp to 8-bit pixel range.
            uint8_t px = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
            if (conv_params->do_average)
              *p = ROUND_POWER_OF_TWO(*p + px, 1);
            else
              *p = px;
          }
          sy += gamma;
        }
      }
    }
  }
}
1417
1418 static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref,
1419 int width, int height, int stride, uint8_t *pred,
1420 int p_col, int p_row, int p_width, int p_height,
1421 int p_stride, int subsampling_x, int subsampling_y,
1422 int x_scale, int y_scale, ConvolveParams *conv_params) {
1423 if (wm->wmtype == ROTZOOM) {
1424 wm->wmmat[5] = wm->wmmat[2];
1425 wm->wmmat[4] = -wm->wmmat[3];
1426 }
1427 if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) &&
1428 x_scale == SCALE_SUBPEL_SHIFTS && y_scale == SCALE_SUBPEL_SHIFTS) {
1429 const int32_t *const mat = wm->wmmat;
1430 const int16_t alpha = wm->alpha;
1431 const int16_t beta = wm->beta;
1432 const int16_t gamma = wm->gamma;
1433 const int16_t delta = wm->delta;
1434
1435 av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
1436 p_width, p_height, p_stride, subsampling_x, subsampling_y,
1437 conv_params, alpha, beta, gamma, delta);
1438 } else {
1439 warp_plane_old(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
1440 p_height, p_stride, subsampling_x, subsampling_y, x_scale,
1441 y_scale, conv_params);
1442 }
1443 }
1444
// Sum of per-pixel prediction errors between `dst` and `ref` over a
// p_width x p_height region, weighting each pixel difference through
// error_measure().
static int64_t frame_error(const uint8_t *const ref, int stride,
                           const uint8_t *const dst, int p_width, int p_height,
                           int p_stride) {
  int64_t total = 0;
  for (int row = 0; row < p_height; ++row) {
    const uint8_t *const ref_row = ref + row * stride;
    const uint8_t *const dst_row = dst + row * p_stride;
    for (int col = 0; col < p_width; ++col) {
      total += (int64_t)error_measure(dst_row[col] - ref_row[col]);
    }
  }
  return total;
}
1457
1458 static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
1459 int width, int height, int stride,
1460 const uint8_t *const dst, int p_col, int p_row,
1461 int p_width, int p_height, int p_stride,
1462 int subsampling_x, int subsampling_y, int x_scale,
1463 int y_scale, int64_t best_error) {
1464 int64_t gm_sumerr = 0;
1465 int warp_w, warp_h;
1466 int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
1467 int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
1468 uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
1469 ConvolveParams conv_params = get_conv_params(0, 0, 0);
1470
1471 for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
1472 for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
1473 // avoid warping extra 8x8 blocks in the padded region of the frame
1474 // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
1475 warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
1476 warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
1477 warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h,
1478 WARP_ERROR_BLOCK, subsampling_x, subsampling_y, x_scale,
1479 y_scale, &conv_params);
1480
1481 gm_sumerr += frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride,
1482 warp_w, warp_h, p_stride);
1483 if (gm_sumerr > best_error) return gm_sumerr;
1484 }
1485 }
1486 return gm_sumerr;
1487 }
1488
// Public wrapper around frame_error(): accumulates the error_measure()-
// weighted pixel error between `dst` and `ref` over a p_width x p_height
// region, dispatching to the high-bitdepth implementation when use_hbd is
// set.
int64_t av1_frame_error(
#if CONFIG_HIGHBITDEPTH
    int use_hbd, int bd,
#endif  // CONFIG_HIGHBITDEPTH
    const uint8_t *ref, int stride, uint8_t *dst, int p_width, int p_height,
    int p_stride) {
#if CONFIG_HIGHBITDEPTH
  if (use_hbd) {
    // High-bitdepth planes are stored as uint16_t behind the uint8_t
    // pointers; CONVERT_TO_SHORTPTR recovers the real pointers.
    return highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
                              CONVERT_TO_SHORTPTR(dst), p_width, p_height,
                              p_stride, bd);
  }
#endif  // CONFIG_HIGHBITDEPTH
  return frame_error(ref, stride, dst, p_width, p_height, p_stride);
}
1504
1505 int64_t av1_warp_error(WarpedMotionParams *wm,
1506 #if CONFIG_HIGHBITDEPTH
1507 int use_hbd, int bd,
1508 #endif // CONFIG_HIGHBITDEPTH
1509 const uint8_t *ref, int width, int height, int stride,
1510 uint8_t *dst, int p_col, int p_row, int p_width,
1511 int p_height, int p_stride, int subsampling_x,
1512 int subsampling_y, int x_scale, int y_scale,
1513 int64_t best_error) {
1514 if (wm->wmtype <= AFFINE)
1515 if (!get_shear_params(wm)) return 1;
1516 #if CONFIG_HIGHBITDEPTH
1517 if (use_hbd)
1518 return highbd_warp_error(wm, ref, width, height, stride, dst, p_col, p_row,
1519 p_width, p_height, p_stride, subsampling_x,
1520 subsampling_y, x_scale, y_scale, bd, best_error);
1521 #endif // CONFIG_HIGHBITDEPTH
1522 return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width,
1523 p_height, p_stride, subsampling_x, subsampling_y, x_scale,
1524 y_scale, best_error);
1525 }
1526
// Public entry point for warping a plane with model `wm` into `pred`.
// Thin dispatcher: routes to the high-bitdepth implementation when use_hbd
// is set, otherwise to the 8-bit warp_plane().
void av1_warp_plane(WarpedMotionParams *wm,
#if CONFIG_HIGHBITDEPTH
                    int use_hbd, int bd,
#endif  // CONFIG_HIGHBITDEPTH
                    const uint8_t *ref, int width, int height, int stride,
                    uint8_t *pred, int p_col, int p_row, int p_width,
                    int p_height, int p_stride, int subsampling_x,
                    int subsampling_y, int x_scale, int y_scale,
                    ConvolveParams *conv_params) {
#if CONFIG_HIGHBITDEPTH
  if (use_hbd)
    highbd_warp_plane(wm, ref, width, height, stride, pred, p_col, p_row,
                      p_width, p_height, p_stride, subsampling_x, subsampling_y,
                      x_scale, y_scale, bd, conv_params);
  else
#endif  // CONFIG_HIGHBITDEPTH
    warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
               p_height, p_stride, subsampling_x, subsampling_y, x_scale,
               y_scale, conv_params);
}
1547
#if CONFIG_WARPED_MOTION
// Two unknowns per output coordinate in the least-squares fit (the two
// linear terms of the affine model; translation is derived separately from
// the block-center MV in find_affine_int()).
#define LEAST_SQUARES_ORDER 2

#define LS_MV_MAX 256  // max mv in 1/8-pel units
#define LS_STEP 2

// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
// the precision needed is:
//   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
//   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
//   1 [for sign] +
//   LEAST_SQUARES_SAMPLES_MAX_BITS
//      [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
// The value is 23
#define LS_MAT_RANGE_BITS \
  ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)

// Bit-depth reduction from the full-range
#define LS_MAT_DOWN_BITS 2

// bits range of A, Bx and By after downshifting
#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)

// Accumulation helpers for the normal equations in find_affine_int(). The
// extra LS_STEP terms bias the products before the >> 2 reduction —
// NOTE(review): presumably rounding correction for samples taken on an
// LS_STEP grid; confirm against the original derivation.
#define LS_SUM(a) ((a)*4 + LS_STEP * 2)
#define LS_SQUARE(a) \
  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2)
#define LS_PRODUCT1(a, b) \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> 2)
#define LS_PRODUCT2(a, b) \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> 2)

// When nonzero, the division by the determinant is done with a 16-bit
// multiplier (resolve_multiplier_64) instead of full 64-bit products.
#define USE_LIMITED_PREC_MULT 0
1582
1583 #if USE_LIMITED_PREC_MULT
1584
1585 #define MUL_PREC_BITS 16
1586 static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
1587 int msb = 0;
1588 uint16_t mult = 0;
1589 *shift = 0;
1590 if (D != 0) {
1591 msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
1592 : get_msb((unsigned int)D));
1593 if (msb >= MUL_PREC_BITS) {
1594 mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
1595 *shift = msb + 1 - MUL_PREC_BITS;
1596 } else {
1597 mult = (uint16_t)D;
1598 *shift = 0;
1599 }
1600 }
1601 return mult;
1602 }
1603
1604 static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
1605 int32_t ret;
1606 int16_t mshift;
1607 uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
1608 int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
1609 shift -= mshift;
1610 if (shift > 0) {
1611 return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
1612 -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
1613 WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
1614 } else {
1615 return (int32_t)clamp(v * (1 << (-shift)),
1616 -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
1617 WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
1618 }
1619 return ret;
1620 }
1621
1622 static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
1623 int16_t mshift;
1624 uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
1625 int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
1626 shift -= mshift;
1627 if (shift > 0) {
1628 return (int32_t)clamp(
1629 ROUND_POWER_OF_TWO_SIGNED(v, shift),
1630 (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
1631 (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
1632 } else {
1633 return (int32_t)clamp(
1634 v * (1 << (-shift)),
1635 (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
1636 (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
1637 }
1638 }
1639
1640 #else
1641
1642 static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
1643 int64_t v = Px * (int64_t)iDet;
1644 return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
1645 -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
1646 WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
1647 }
1648
1649 static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
1650 int64_t v = Px * (int64_t)iDet;
1651 return (int32_t)clamp64(
1652 ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
1653 (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
1654 (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
1655 }
1656 #endif // USE_LIMITED_PREC_MULT
1657
// Fit an affine model to the point correspondences pts1[i] -> pts2[i]
// (up to LEAST_SQUARES_SAMPLES_MAX of them) by integer least squares,
// anchored so that the block center moves exactly by the transmitted MV
// (mvx, mvy). On success fills wm->wmmat[0..5] (wmmat[6..7] forced to 0)
// and returns 0; returns 1 if the normal equations are singular.
static int find_affine_int(int np, int *pts1, int *pts2, BLOCK_SIZE bsize,
                           int mvy, int mvx, WarpedMotionParams *wm, int mi_row,
                           int mi_col) {
  // Normal-equation accumulators: A = P'P (symmetric, so A[1][0] is never
  // stored), Bx = P'q, By = P'r -- see the derivation comment below.
  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
  int32_t Bx[2] = { 0, 0 };
  int32_t By[2] = { 0, 0 };
  int i, n = 0;

  const int bw = block_size_wide[bsize];
  const int bh = block_size_high[bsize];
  // Block-center coordinates: isux/isuy in pel units, sux/suy in 1/8 pel.
  // The destination center adds the MV, which is in 1/8-pel units (see
  // LS_MV_MAX above).
  const int isuy = (mi_row * MI_SIZE + AOMMAX(bh, MI_SIZE) / 2 - 1);
  const int isux = (mi_col * MI_SIZE + AOMMAX(bw, MI_SIZE) / 2 - 1);
  const int suy = isuy * 8;
  const int sux = isux * 8;
  const int duy = suy + mvy;
  const int dux = sux + mvx;

  // Assume the center pixel of the block has exactly the same motion vector
  // as transmitted for the block. First shift the origin of the source
  // points to the block center, and the origin of the destination points to
  // the block center added to the motion vector transmitted.
  // Let (xi, yi) denote the source points and (xi', yi') denote destination
  // points after origin shifting, for i = 0, 1, 2, .... n-1.
  // Then if P = [x0, y0,
  //              x1, y1
  //              x2, y1,
  //               ....
  //             ]
  //      q = [x0', x1', x2', ... ]'
  //      r = [y0', y1', y2', ... ]'
  // the least squares problems that need to be solved are:
  //          [h1, h2]' = inv(P'P)P'q and
  //          [h3, h4]' = inv(P'P)P'r
  // where the affine transformation is given by:
  //          x' = h1.x + h2.y
  //          y' = h3.x + h4.y
  //
  // The loop below computes: A = P'P, Bx = P'q, By = P'r
  // We need to just compute inv(A).Bx and inv(A).By for the solutions.
  int sx, sy, dx, dy;
  // Contribution from neighbor block
  for (i = 0; i < np && n < LEAST_SQUARES_SAMPLES_MAX; i++) {
    dx = pts2[i * 2] - dux;
    dy = pts2[i * 2 + 1] - duy;
    sx = pts1[i * 2] - sux;
    sy = pts1[i * 2 + 1] - suy;
    // Reject outlier correspondences whose implied motion deviates from the
    // block MV by LS_MV_MAX (1/8 pel) or more in either direction.
    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
      A[0][0] += LS_SQUARE(sx);
      A[0][1] += LS_PRODUCT1(sx, sy);
      A[1][1] += LS_SQUARE(sy);
      Bx[0] += LS_PRODUCT2(sx, dx);
      Bx[1] += LS_PRODUCT1(sy, dx);
      By[0] += LS_PRODUCT1(sx, dy);
      By[1] += LS_PRODUCT2(sy, dy);
      n++;
    }
  }
  // Drop fewer bits when fewer samples were accumulated, since the sums are
  // correspondingly smaller.
  int downshift;
  if (n >= 4)
    downshift = LS_MAT_DOWN_BITS;
  else if (n >= 2)
    downshift = LS_MAT_DOWN_BITS - 1;
  else
    downshift = LS_MAT_DOWN_BITS - 2;

  // Reduce precision by downshift bits, clamping into the LS_MAT range so
  // the 32x32-bit products below cannot overflow int64_t.
  A[0][0] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[0][0], downshift), LS_MAT_MIN,
                  LS_MAT_MAX);
  A[0][1] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[0][1], downshift), LS_MAT_MIN,
                  LS_MAT_MAX);
  A[1][1] = clamp(ROUND_POWER_OF_TWO_SIGNED(A[1][1], downshift), LS_MAT_MIN,
                  LS_MAT_MAX);
  Bx[0] = clamp(ROUND_POWER_OF_TWO_SIGNED(Bx[0], downshift), LS_MAT_MIN,
                LS_MAT_MAX);
  Bx[1] = clamp(ROUND_POWER_OF_TWO_SIGNED(Bx[1], downshift), LS_MAT_MIN,
                LS_MAT_MAX);
  By[0] = clamp(ROUND_POWER_OF_TWO_SIGNED(By[0], downshift), LS_MAT_MIN,
                LS_MAT_MAX);
  By[1] = clamp(ROUND_POWER_OF_TWO_SIGNED(By[1], downshift), LS_MAT_MIN,
                LS_MAT_MAX);

  int64_t Px[2], Py[2], Det;
  int16_t iDet, shift;

  // These divided by the Det, are the least squares solutions
  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];

  // Compute Determinant of A
  Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
  if (Det == 0) return 1;  // singular system: no affine fit possible
  // Replace division by Det with multiplication by its resolved inverse
  // (iDet, shift), folding in the WARPEDMODEL_PREC_BITS output scaling.
  iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
  shift -= WARPEDMODEL_PREC_BITS;
  if (shift < 0) {
    iDet <<= (-shift);
    shift = 0;
  }

  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);

  // Derive the translation so the block center maps exactly per the MV.
  // Note: In the vx, vy expressions below, the max value of each of the
  // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room
  // for the first term so that the overall sum in the worst case fits
  // within 32 bits overall.
  int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
               (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
                isuy * wm->wmmat[3]);
  int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
               (isux * wm->wmmat[4] +
                isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
  wm->wmmat[0] =
      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  wm->wmmat[1] =
      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);

  wm->wmmat[6] = wm->wmmat[7] = 0;
  return 0;
}
1781
1782 int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
1783 int mvx, WarpedMotionParams *wm_params, int mi_row,
1784 int mi_col) {
1785 assert(wm_params->wmtype == AFFINE);
1786 const int result = find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params,
1787 mi_row, mi_col);
1788 if (result == 0) {
1789 // check compatibility with the fast warp filter
1790 if (!get_shear_params(wm_params)) return 1;
1791 }
1792
1793 return result;
1794 }
1795 #endif // CONFIG_WARPED_MOTION
1796