1 /*
2 *
3 * Copyright (c) 2020, Alliance for Open Media. All rights reserved
4 *
5 * This source code is subject to the terms of the BSD 2 Clause License and
6 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
7 * was not distributed with this source code in the LICENSE file, you can
8 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
9 * Media Patent License 1.0 was not distributed with this source code in the
10 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
11 */
12
13 #include <tmmintrin.h> // SSSE3
14 #include "config/av1_rtcd.h"
15 #include "config/aom_scale_rtcd.h"
16
17 #include "aom_dsp/x86/convolve_sse2.h"
18 #include "aom_dsp/x86/convolve_ssse3.h"
19 #include "aom_dsp/x86/mem_sse2.h"
20 #include "aom_dsp/x86/transpose_sse2.h"
21 #include "av1/common/resize.h"
22
scale_plane_2_to_1_phase_0_kernel(const uint8_t * const src,const __m128i * const mask)23 static INLINE __m128i scale_plane_2_to_1_phase_0_kernel(
24 const uint8_t *const src, const __m128i *const mask) {
25 const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0]));
26 const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16]));
27 const __m128i a_and = _mm_and_si128(a, *mask);
28 const __m128i b_and = _mm_and_si128(b, *mask);
29 return _mm_packus_epi16(a_and, b_and);
30 }
31
// Packs and duplicates the 8 16-bit filter taps into 5 byte-pair vectors for
// PMADDUBSW, shifted by half a tap relative to shuffle_filter_ssse3 (used for
// odd pixel offsets in the 4:3 scaler). The first and last pairs each contain
// one zero byte, borrowed from the high byte of filter[3], which the assert
// below guarantees is 0.
static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
                                            __m128i *const f) {
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  // It utilizes the fact that the high byte of filter[3] is always 0 to clean
  // half of f[0] and f[4].
  assert(filter[3] >= 0 && filter[3] < 256);
  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));  // {0, tap 0}
  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));  // {tap 1, tap 2}
  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));  // {tap 3, tap 4}
  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));  // {tap 5, tap 6}
  f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));  // {tap 7, 0}
}
45
// 8-tap convolution of 8 pixels for even pixel offsets. The caller has
// subtracted 64 from f[1] (see scale_plane_4_to_3_general) so the largest
// coefficient fits in a signed byte for PMADDUBSW; x4 adds the missing
// 64 * src term back. The saturating-add order is deliberate — keep it.
static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
                                                    const __m128i *const f) {
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i k_64 = _mm_set1_epi16(1 << 6);  // rounding term (0.5 in Q7)
  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
  // compensate the subtracted 64 in f[1]. x4 is always non negative.
  const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64));
  // add and saturate the results together
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, x1);
  temp = _mm_adds_epi16(temp, x2);
  temp = _mm_adds_epi16(temp, x4);
  // round and shift by 7 bit each 16 bit
  temp = _mm_adds_epi16(temp, k_64);
  temp = _mm_srai_epi16(temp, 7);
  return temp;
}
66
// 8-tap convolution of 8 pixels for odd pixel offsets, using the 5 byte-pair
// vectors produced by shuffle_filter_odd_ssse3. The caller has subtracted 64
// from f[2] (see scale_plane_4_to_3_general); x5 adds the missing 64 * src
// term back. The saturating-add order is deliberate — keep it.
static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s,
                                                   const __m128i *const f) {
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i k_64 = _mm_set1_epi16(1 << 6);  // rounding term (0.5 in Q7)
  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
  const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
  // compensate the subtracted 64 in f[2]. x5 is always non negative.
  const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64));
  __m128i temp;

  // add and saturate the results together
  temp = _mm_adds_epi16(x0, x1);
  temp = _mm_adds_epi16(temp, x2);
  temp = _mm_adds_epi16(temp, x3);
  temp = _mm_adds_epi16(temp, x4);
  temp = _mm_adds_epi16(temp, x5);
  // round and shift by 7 bit each 16 bit
  temp = _mm_adds_epi16(temp, k_64);
  temp = _mm_srai_epi16(temp, 7);
  return temp;
}
91
scale_plane_2_to_1_phase_0(const uint8_t * src,const ptrdiff_t src_stride,uint8_t * dst,const ptrdiff_t dst_stride,const int dst_w,const int dst_h)92 static void scale_plane_2_to_1_phase_0(const uint8_t *src,
93 const ptrdiff_t src_stride, uint8_t *dst,
94 const ptrdiff_t dst_stride,
95 const int dst_w, const int dst_h) {
96 const int max_width = (dst_w + 15) & ~15;
97 const __m128i mask = _mm_set1_epi16(0x00FF);
98 int y = dst_h;
99
100 do {
101 int x = max_width;
102 do {
103 const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask);
104 _mm_storeu_si128((__m128i *)dst, d);
105 src += 32;
106 dst += 16;
107 x -= 16;
108 } while (x);
109 src += 2 * (src_stride - max_width);
110 dst += dst_stride - max_width;
111 } while (--y);
112 }
113
scale_plane_4_to_1_phase_0(const uint8_t * src,const ptrdiff_t src_stride,uint8_t * dst,const ptrdiff_t dst_stride,const int dst_w,const int dst_h)114 static void scale_plane_4_to_1_phase_0(const uint8_t *src,
115 const ptrdiff_t src_stride, uint8_t *dst,
116 const ptrdiff_t dst_stride,
117 const int dst_w, const int dst_h) {
118 const int max_width = (dst_w + 15) & ~15;
119 const __m128i mask = _mm_set1_epi32(0x000000FF);
120 int y = dst_h;
121
122 do {
123 int x = max_width;
124 do {
125 const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask);
126 const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask);
127 const __m128i d2 = _mm_packus_epi16(d0, d1);
128 _mm_storeu_si128((__m128i *)dst, d2);
129 src += 64;
130 dst += 16;
131 x -= 16;
132 } while (x);
133 src += 4 * (src_stride - max_width);
134 dst += dst_stride - max_width;
135 } while (--y);
136 }
137
scale_plane_bilinear_kernel(const __m128i * const s,const __m128i c0c1)138 static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s,
139 const __m128i c0c1) {
140 const __m128i k_64 = _mm_set1_epi16(1 << 6);
141 const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1);
142 const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1);
143 // round and shift by 7 bit each 16 bit
144 const __m128i t2 = _mm_adds_epi16(t0, k_64);
145 const __m128i t3 = _mm_adds_epi16(t1, k_64);
146 const __m128i t4 = _mm_srai_epi16(t2, 7);
147 const __m128i t5 = _mm_srai_epi16(t3, 7);
148 return _mm_packus_epi16(t4, t5);
149 }
150
scale_plane_2_to_1_bilinear(const uint8_t * src,const ptrdiff_t src_stride,uint8_t * dst,const ptrdiff_t dst_stride,const int dst_w,const int dst_h,const __m128i c0c1)151 static void scale_plane_2_to_1_bilinear(const uint8_t *src,
152 const ptrdiff_t src_stride,
153 uint8_t *dst,
154 const ptrdiff_t dst_stride,
155 const int dst_w, const int dst_h,
156 const __m128i c0c1) {
157 const int max_width = (dst_w + 15) & ~15;
158 int y = dst_h;
159
160 do {
161 int x = max_width;
162 do {
163 __m128i s[2], d[2];
164
165 // Horizontal
166 // Even rows
167 s[0] = _mm_loadu_si128((const __m128i *)(src + 0));
168 s[1] = _mm_loadu_si128((const __m128i *)(src + 16));
169 d[0] = scale_plane_bilinear_kernel(s, c0c1);
170
171 // odd rows
172 s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
173 s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
174 d[1] = scale_plane_bilinear_kernel(s, c0c1);
175
176 // Vertical
177 s[0] = _mm_unpacklo_epi8(d[0], d[1]);
178 s[1] = _mm_unpackhi_epi8(d[0], d[1]);
179 d[0] = scale_plane_bilinear_kernel(s, c0c1);
180
181 _mm_storeu_si128((__m128i *)dst, d[0]);
182 src += 32;
183 dst += 16;
184 x -= 16;
185 } while (x);
186 src += 2 * (src_stride - max_width);
187 dst += dst_stride - max_width;
188 } while (--y);
189 }
190
// 4:1 bilinear downscale. Gathers the two relevant pixels of each 4-pixel
// group from two source rows via a sequence of unpacks (avoiding shuffle
// instructions, see note below), applies c0/c1 horizontally, then blends the
// two filtered rows vertically. Emits 16 output pixels (64 source bytes) per
// inner step; dst_w is rounded up to a multiple of 16, so the buffers must
// be padded accordingly.
static void scale_plane_4_to_1_bilinear(const uint8_t *src,
                                        const ptrdiff_t src_stride,
                                        uint8_t *dst,
                                        const ptrdiff_t dst_stride,
                                        const int dst_w, const int dst_h,
                                        const __m128i c0c1) {
  const int max_width = (dst_w + 15) & ~15;
  int y = dst_h;

  do {
    int x = max_width;
    do {
      __m128i s[8], d[8];

      // Note: Using _mm_packus_epi32() in SSE4.1 could be faster.
      // Here we tried to not use shuffle instructions which would be slow
      // on some x86 CPUs.

      // Horizontal
      // 000 001 xx xx 004 005 xx xx 008 009 xx xx 00C 00D xx xx
      // 010 011 xx xx 014 015 xx xx 018 019 xx xx 01C 01D xx xx
      // 020 021 xx xx 024 025 xx xx 028 029 xx xx 02C 02D xx xx
      // 030 031 xx xx 034 035 xx xx 038 039 xx xx 03C 03D xx xx
      // 100 101 xx xx 104 105 xx xx 108 109 xx xx 10C 10D xx xx
      // 110 111 xx xx 114 115 xx xx 118 119 xx xx 11C 11D xx xx
      // 120 121 xx xx 124 125 xx xx 128 129 xx xx 12C 12D xx xx
      // 130 131 xx xx 134 135 xx xx 138 139 xx xx 13C 13D xx xx
      s[0] = _mm_loadu_si128((const __m128i *)(&src[0]));
      s[1] = _mm_loadu_si128((const __m128i *)(&src[16]));
      s[2] = _mm_loadu_si128((const __m128i *)(&src[32]));
      s[3] = _mm_loadu_si128((const __m128i *)(&src[48]));
      s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
      s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
      s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32));
      s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48));

      // 000 001 100 101 xx xx xx xx 004 005 104 105 xx xx xx xx
      // 008 009 108 109 xx xx xx xx 00C 00D 10C 10D xx xx xx xx
      // 010 011 110 111 xx xx xx xx 014 015 114 115 xx xx xx xx
      // 018 019 118 119 xx xx xx xx 01C 01D 11C 11D xx xx xx xx
      // 020 021 120 121 xx xx xx xx 024 025 124 125 xx xx xx xx
      // 028 029 128 129 xx xx xx xx 02C 02D 12C 12D xx xx xx xx
      // 030 031 130 131 xx xx xx xx 034 035 134 135 xx xx xx xx
      // 038 039 138 139 xx xx xx xx 03C 03D 13C 13D xx xx xx xx
      d[0] = _mm_unpacklo_epi16(s[0], s[4]);
      d[1] = _mm_unpackhi_epi16(s[0], s[4]);
      d[2] = _mm_unpacklo_epi16(s[1], s[5]);
      d[3] = _mm_unpackhi_epi16(s[1], s[5]);
      d[4] = _mm_unpacklo_epi16(s[2], s[6]);
      d[5] = _mm_unpackhi_epi16(s[2], s[6]);
      d[6] = _mm_unpacklo_epi16(s[3], s[7]);
      d[7] = _mm_unpackhi_epi16(s[3], s[7]);

      // 000 001 100 101 008 009 108 109 xx xx xx xx xx xx xx xx
      // 004 005 104 105 00C 00D 10C 10D xx xx xx xx xx xx xx xx
      // 010 011 110 111 018 019 118 119 xx xx xx xx xx xx xx xx
      // 014 015 114 115 01C 01D 11C 11D xx xx xx xx xx xx xx xx
      // 020 021 120 121 028 029 128 129 xx xx xx xx xx xx xx xx
      // 024 025 124 125 02C 02D 12C 12D xx xx xx xx xx xx xx xx
      // 030 031 130 131 038 039 138 139 xx xx xx xx xx xx xx xx
      // 034 035 134 135 03C 03D 13C 13D xx xx xx xx xx xx xx xx
      s[0] = _mm_unpacklo_epi32(d[0], d[1]);
      s[1] = _mm_unpackhi_epi32(d[0], d[1]);
      s[2] = _mm_unpacklo_epi32(d[2], d[3]);
      s[3] = _mm_unpackhi_epi32(d[2], d[3]);
      s[4] = _mm_unpacklo_epi32(d[4], d[5]);
      s[5] = _mm_unpackhi_epi32(d[4], d[5]);
      s[6] = _mm_unpacklo_epi32(d[6], d[7]);
      s[7] = _mm_unpackhi_epi32(d[6], d[7]);

      // 000 001 100 101 004 005 104 105 008 009 108 109 00C 00D 10C 10D
      // 010 011 110 111 014 015 114 115 018 019 118 119 01C 01D 11C 11D
      // 020 021 120 121 024 025 124 125 028 029 128 129 02C 02D 12C 12D
      // 030 031 130 131 034 035 134 135 038 039 138 139 03C 03D 13C 13D
      d[0] = _mm_unpacklo_epi32(s[0], s[1]);
      d[1] = _mm_unpacklo_epi32(s[2], s[3]);
      d[2] = _mm_unpacklo_epi32(s[4], s[5]);
      d[3] = _mm_unpacklo_epi32(s[6], s[7]);

      d[0] = scale_plane_bilinear_kernel(&d[0], c0c1);
      d[1] = scale_plane_bilinear_kernel(&d[2], c0c1);

      // Vertical
      d[0] = scale_plane_bilinear_kernel(d, c0c1);

      _mm_storeu_si128((__m128i *)dst, d[0]);
      src += 64;
      dst += 16;
      x -= 16;
    } while (x);
    src += 4 * (src_stride - max_width);
    dst += dst_stride - max_width;
  } while (--y);
}
285
// General 4:1 downscale with an 8-tap filter, done as two separable passes
// through the caller-provided temp_buffer:
//   1) horizontal 2x8: filter 8 source rows at a time, 2 output columns per
//      step; results are stored transposed in temp_buffer.
//   2) vertical 8x2: filter the transposed data, emitting 8x2 output pixels
//      per step.
// coef points to the 8 16-bit filter taps; temp_buffer must be sized for
// width_hor x height_hor intermediate samples (caller's responsibility).
static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
                                       uint8_t *dst, const int dst_stride,
                                       const int w, const int h,
                                       const int16_t *const coef,
                                       uint8_t *const temp_buffer) {
  const int width_hor = (w + 1) & ~1;  // horizontal pass width, even
  const int width_ver = (w + 7) & ~7;  // vertical pass width, multiple of 8
  // 4 * h source rows are consumed, plus SUBPEL_TAPS - 2 rows of filter
  // support, rounded up to a multiple of 8 (rows processed per outer step).
  const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
  const int height_ver = (h + 1) & ~1;
  int x, y = height_hor;
  uint8_t *t = temp_buffer;
  __m128i s[11], d[4];
  __m128i f[4];

  assert(w && h);

  shuffle_filter_ssse3(coef, f);
  // Back up to the first sample the filter support requires.
  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;

  // horizontal 2x8
  do {
    load_8bit_8x8(src + 4, src_stride, s);
    // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
    // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
    // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 (overlapped)
    // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped)
    transpose_16bit_4x8(s, s);
    x = width_hor;

    do {
      src += 8;
      load_8bit_8x8(src, src_stride, &s[2]);
      // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
      // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
      // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79
      // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B
      transpose_16bit_4x8(&s[2], &s[2]);

      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
      d[1] = convolve8_8_ssse3(&s[2], f);  // 01 11 21 31 41 51 61 71

      // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
      // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
      d[0] = _mm_packus_epi16(d[0], d[0]);
      d[1] = _mm_packus_epi16(d[1], d[1]);
      // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71
      d[0] = _mm_unpacklo_epi16(d[0], d[1]);
      store_8bit_4x4_sse2(d[0], t, 2 * width_hor);

      // Slide the 8-tap window: the last two transposed vectors become the
      // first two of the next step.
      s[0] = s[4];
      s[1] = s[5];

      t += 4;
      x -= 2;
    } while (x);
    src += 8 * src_stride - 4 * width_hor;
    // Advance to the next 8-row group of the transposed temp buffer.
    t += 6 * width_hor;
    y -= 8;
  } while (y);

  // vertical 8x2
  x = width_ver;
  t = temp_buffer;
  do {
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
    t += 4 * width_hor;
    y = height_ver;

    do {
      // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
      // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
      // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97
      // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7
      loadu_8bit_16x4(t, 2 * width_hor, &s[2]);
      t += 8 * width_hor;

      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
      d[1] = convolve8_8_ssse3(&s[2], f);  // 10 11 12 13 14 15 16 17

      // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
      d[0] = _mm_packus_epi16(d[0], d[1]);
      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);

      // Slide the 8-tap window down by 2 output rows.
      s[0] = s[4];
      s[1] = s[5];

      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
    // Rewind to the top of the temp buffer and step 8 columns right.
    t -= width_hor * (4 * height_ver + 4);
    t += 16;
    dst -= height_ver * dst_stride;
    dst += 8;
    x -= 8;
  } while (x);
}
386
// General 2:1 downscale with an 8-tap filter, done as two separable passes
// through the caller-provided temp_buffer:
//   1) horizontal 4x8: filter 8 source rows at a time, 4 output columns per
//      step; results are stored transposed in temp_buffer.
//   2) vertical 8x4: filter the transposed data, emitting 8x4 output pixels
//      per step.
// coef points to the 8 16-bit filter taps; temp_buffer must be sized for
// width_hor x height_hor intermediate samples (caller's responsibility).
static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
                                       uint8_t *dst, const int dst_stride,
                                       const int w, const int h,
                                       const int16_t *const coef,
                                       uint8_t *const temp_buffer) {
  const int width_hor = (w + 3) & ~3;  // horizontal pass width, multiple of 4
  const int width_ver = (w + 7) & ~7;  // vertical pass width, multiple of 8
  // 2 * h source rows are consumed, plus SUBPEL_TAPS - 2 rows of filter
  // support, rounded up to a multiple of 8 (rows processed per outer step).
  const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
  const int height_ver = (h + 3) & ~3;
  int x, y = height_hor;
  uint8_t *t = temp_buffer;
  __m128i s[11], d[4];
  __m128i f[4];

  assert(w && h);

  shuffle_filter_ssse3(coef, f);
  // Back up to the first sample the filter support requires.
  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;

  // horizontal 4x8
  do {
    load_8bit_8x8(src + 2, src_stride, s);
    // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
    // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
    // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
    // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped)
    transpose_16bit_4x8(s, s);
    x = width_hor;

    do {
      src += 8;
      load_8bit_8x8(src, src_stride, &s[3]);
      // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
      // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79
      // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B
      // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D
      transpose_16bit_4x8(&s[3], &s[3]);

      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
      d[1] = convolve8_8_ssse3(&s[1], f);  // 01 11 21 31 41 51 61 71
      d[2] = convolve8_8_ssse3(&s[2], f);  // 02 12 22 32 42 52 62 72
      d[3] = convolve8_8_ssse3(&s[3], f);  // 03 13 23 33 43 53 63 73

      // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72
      // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73
      d[0] = _mm_packus_epi16(d[0], d[2]);
      d[1] = _mm_packus_epi16(d[1], d[3]);
      // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71
      // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73
      d[2] = _mm_unpacklo_epi16(d[0], d[1]);
      d[3] = _mm_unpackhi_epi16(d[0], d[1]);
      // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33
      // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73
      d[0] = _mm_unpacklo_epi32(d[2], d[3]);
      d[1] = _mm_unpackhi_epi32(d[2], d[3]);
      store_8bit_8x4_from_16x2(d, t, 2 * width_hor);

      // Slide the 8-tap window: the last three transposed vectors become
      // the first three of the next step.
      s[0] = s[4];
      s[1] = s[5];
      s[2] = s[6];

      t += 8;
      x -= 4;
    } while (x);
    src += 8 * src_stride - 2 * width_hor;
    // Advance to the next 8-row group of the transposed temp buffer.
    t += 6 * width_hor;
    y -= 8;
  } while (y);

  // vertical 8x4
  x = width_ver;
  t = temp_buffer;
  do {
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
    s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor));
    t += 6 * width_hor;
    y = height_ver;

    do {
      // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
      // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97
      // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7
      // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7
      loadu_8bit_16x4(t, 2 * width_hor, &s[3]);
      t += 8 * width_hor;

      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
      d[1] = convolve8_8_ssse3(&s[1], f);  // 10 11 12 13 14 15 16 17
      d[2] = convolve8_8_ssse3(&s[2], f);  // 20 21 22 23 24 25 26 27
      d[3] = convolve8_8_ssse3(&s[3], f);  // 30 31 32 33 34 35 36 37

      // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
      d[0] = _mm_packus_epi16(d[0], d[1]);
      d[1] = _mm_packus_epi16(d[2], d[3]);
      store_8bit_8x4_from_16x2(d, dst, dst_stride);

      // Slide the 8-tap window down by 4 output rows.
      s[0] = s[4];
      s[1] = s[5];
      s[2] = s[6];

      dst += 4 * dst_stride;
      y -= 4;
    } while (y);
    // Rewind to the top of the temp buffer and step 8 columns right.
    t -= width_hor * (2 * height_ver + 6);
    t += 16;
    dst -= height_ver * dst_stride;
    dst += 8;
    x -= 8;
  } while (x);
}
502
// Function-pointer types used by scale_plane_4_to_3_general to pick the
// even- or odd-offset variant of the filter shuffle and convolution at
// runtime, indexed by the parity of the pixel offset.
typedef void (*shuffle_filter_funcs)(const int16_t *const filter,
                                     __m128i *const f);

typedef __m128i (*convolve8_funcs)(const __m128i *const s,
                                   const __m128i *const f);
508
// General 4:3 downscale with 8-tap subpel filters. Each group of 3 output
// pixels uses 3 different filter phases (phase, phase + step, phase +
// 2*step, in q4 units); even phases use convolve8_8_even_offset_ssse3 and
// odd phases use convolve8_8_odd_offset_ssse3. Two separable passes run
// through temp_buffer: horizontal 6x8 (stored transposed, stride_hor bytes
// per 2 rows), then vertical 8x6 into dst.
static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
                                       uint8_t *dst, const int dst_stride,
                                       const int w, const int h,
                                       const InterpKernel *const coef,
                                       const int phase,
                                       uint8_t *const temp_buffer) {
  static const int step_q4 = 16 * 4 / 3;  // source step per output pixel, q4
  const int width_hor = (w + 5) - ((w + 5) % 6);  // multiple of 6
  const int stride_hor = 2 * width_hor + 4;       // store 4 extra pixels
  const int width_ver = (w + 7) & ~7;
  // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
  // above and (SUBPEL_TAPS / 2) extra rows below.
  const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
  const int height_ver = (h + 5) - ((h + 5) % 6);  // multiple of 6
  int x, y = height_hor;
  uint8_t *t = temp_buffer;
  __m128i s[12], d[6], dd[4];
  __m128i f0[4], f1[5], f2[5];
  // The offset of the first row is always less than 1 pixel.
  const int offset1_q4 = phase + 1 * step_q4;
  const int offset2_q4 = phase + 2 * step_q4;
  // offset_idxx indicates the pixel offset is even (0) or odd (1).
  // It's used to choose the src offset and filter coefficient offset.
  const int offset_idx1 = (offset1_q4 >> 4) & 1;
  const int offset_idx2 = (offset2_q4 >> 4) & 1;
  static const shuffle_filter_funcs shuffle_filter_func_list[2] = {
    shuffle_filter_ssse3, shuffle_filter_odd_ssse3
  };
  static const convolve8_funcs convolve8_func_list[2] = {
    convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3
  };

  assert(w && h);

  shuffle_filter_ssse3(coef[(phase + 0 * step_q4) & SUBPEL_MASK], f0);
  shuffle_filter_func_list[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1);
  shuffle_filter_func_list[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2);

  // Sub 64 to avoid overflow.
  // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here.
  // Coef 128 is in either fx[1] or fx[2] depending on the phase idx.
  // When filter phase idx is 1, the two biggest coefficients are shuffled
  // together, and the sum of them are always no less than 128. Sub 64 here.
  // After the subtraction, when the sum of all positive coefficients are no
  // larger than 128, and the sum of all negative coefficients are no
  // less than -128, there will be no overflow in the convolve8 functions.
  f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64));
  f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64));
  f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64));

  // Back up to the first sample the filter support requires.
  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1;

  // horizontal 6x8
  do {
    load_8bit_8x8(src, src_stride, s);
    // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
    // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
    // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
    // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
    transpose_16bit_4x8(s, s);
    x = width_hor;

    do {
      src += 8;
      load_8bit_8x8(src, src_stride, &s[4]);
      // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79
      // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B
      // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D
      // 0E 0F 1E 1F 2E 2F 3E 3F 4E 4F 5E 5F 6E 6F 7E 7F
      transpose_16bit_4x8(&s[4], &s[4]);

      // 00 10 20 30 40 50 60 70
      // 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72
      // 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74
      // 05 15 25 35 45 55 65 75
      d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
      d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1);
      d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2);
      d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
      d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
      d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);

      // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72
      // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx
      // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx
      dd[0] = _mm_packus_epi16(d[0], d[2]);
      dd[1] = _mm_packus_epi16(d[1], d[3]);
      dd[2] = _mm_packus_epi16(d[4], d[4]);
      dd[3] = _mm_packus_epi16(d[5], d[5]);

      // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71
      // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73
      // 04 14 05 15 24 34 25 35 44 54 45 55 64 74 65 75
      d[0] = _mm_unpacklo_epi16(dd[0], dd[1]);
      d[1] = _mm_unpackhi_epi16(dd[0], dd[1]);
      d[2] = _mm_unpacklo_epi16(dd[2], dd[3]);

      // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33
      // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73
      // 04 14 05 15 xx xx xx xx 24 34 25 35 xx xx xx xx
      // 44 54 45 55 xx xx xx xx 64 74 65 75 xx xx xx xx
      dd[0] = _mm_unpacklo_epi32(d[0], d[1]);
      dd[1] = _mm_unpackhi_epi32(d[0], d[1]);
      dd[2] = _mm_unpacklo_epi32(d[2], d[2]);
      dd[3] = _mm_unpackhi_epi32(d[2], d[2]);

      // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx
      // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx
      // 40 50 41 51 42 52 43 53 44 54 45 55 xx xx xx xx
      // 60 70 61 71 62 72 63 73 64 74 65 75 xx xx xx xx
      d[0] = _mm_unpacklo_epi64(dd[0], dd[2]);
      d[1] = _mm_unpackhi_epi64(dd[0], dd[2]);
      d[2] = _mm_unpacklo_epi64(dd[1], dd[3]);
      d[3] = _mm_unpackhi_epi64(dd[1], dd[3]);

      // store 4 extra pixels
      storeu_8bit_16x4(d, t, stride_hor);

      // Slide the filter window: the last four transposed vectors become
      // the first four of the next step.
      s[0] = s[4];
      s[1] = s[5];
      s[2] = s[6];
      s[3] = s[7];

      t += 12;
      x -= 6;
    } while (x);
    src += 8 * src_stride - 4 * width_hor / 3;
    t += 3 * stride_hor + 4;
    y -= 8;
  } while (y);

  // vertical 8x6
  x = width_ver;
  t = temp_buffer;
  do {
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    loadu_8bit_16x4(t, stride_hor, s);
    y = height_ver;

    do {
      // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97
      // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7
      // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7
      // E0 F0 E1 F1 E2 F2 E3 F3 E4 F4 E5 F5 E6 F6 E7 F7
      t += 4 * stride_hor;
      loadu_8bit_16x4(t, stride_hor, &s[4]);

      d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
      d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1);
      d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2);
      d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
      d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
      d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);

      // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
      // 40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57
      d[0] = _mm_packus_epi16(d[0], d[1]);
      d[2] = _mm_packus_epi16(d[2], d[3]);
      d[4] = _mm_packus_epi16(d[4], d[5]);

      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]);
      _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]);
      _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]);
      _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]);

      // Slide the filter window down by 6 output rows.
      s[0] = s[4];
      s[1] = s[5];
      s[2] = s[6];
      s[3] = s[7];

      dst += 6 * dst_stride;
      y -= 6;
    } while (y);
    // Rewind to the top of the temp buffer and step 8 columns right.
    t -= stride_hor * 2 * height_ver / 3;
    t += 16;
    dst -= height_ver * dst_stride;
    dst += 8;
    x -= 8;
  } while (x);
}
698
scale_1_to_2_phase_0_kernel(const __m128i * const s,const __m128i * const f)699 static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s,
700 const __m128i *const f) {
701 __m128i ss[4], temp;
702
703 ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
704 ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
705 ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
706 ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
707 temp = convolve8_8_ssse3(ss, f);
708 return _mm_packus_epi16(temp, temp);
709 }
710
711 // Only calculate odd columns since even columns are just src pixels' copies.
scale_1_to_2_phase_0_row(const uint8_t * src,uint8_t * dst,const int w,const __m128i * const f)712 static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst,
713 const int w, const __m128i *const f) {
714 int x = w;
715
716 do {
717 __m128i s[8], temp;
718 s[0] = _mm_loadl_epi64((const __m128i *)(src + 0));
719 s[1] = _mm_loadl_epi64((const __m128i *)(src + 1));
720 s[2] = _mm_loadl_epi64((const __m128i *)(src + 2));
721 s[3] = _mm_loadl_epi64((const __m128i *)(src + 3));
722 s[4] = _mm_loadl_epi64((const __m128i *)(src + 4));
723 s[5] = _mm_loadl_epi64((const __m128i *)(src + 5));
724 s[6] = _mm_loadl_epi64((const __m128i *)(src + 6));
725 s[7] = _mm_loadl_epi64((const __m128i *)(src + 7));
726 temp = scale_1_to_2_phase_0_kernel(s, f);
727 _mm_storel_epi64((__m128i *)dst, temp);
728 src += 8;
729 dst += 8;
730 x -= 8;
731 } while (x);
732 }
733
// 1:2 upscale with phase 0. Even output rows/columns are exact copies of the
// source pixels; odd rows/columns are 8-tap interpolations. tmp[0..7] point
// into temp_buffer and form a ring buffer of horizontally interpolated
// (odd-column) rows centered on the current source row; tmp[8] is scratch
// used only for the rotation at the end of each iteration.
static void scale_plane_1_to_2_phase_0(const uint8_t *src,
                                       const ptrdiff_t src_stride, uint8_t *dst,
                                       const ptrdiff_t dst_stride,
                                       const int src_w, const int src_h,
                                       const int16_t *const coef,
                                       uint8_t *const temp_buffer) {
  int max_width;
  int y;
  uint8_t *tmp[9];  // 8 ring-buffer row pointers + 1 scratch slot
  __m128i f[4];

  max_width = (src_w + 7) & ~7;
  tmp[0] = temp_buffer + 0 * max_width;
  tmp[1] = temp_buffer + 1 * max_width;
  tmp[2] = temp_buffer + 2 * max_width;
  tmp[3] = temp_buffer + 3 * max_width;
  tmp[4] = temp_buffer + 4 * max_width;
  tmp[5] = temp_buffer + 5 * max_width;
  tmp[6] = temp_buffer + 6 * max_width;
  tmp[7] = temp_buffer + 7 * max_width;

  shuffle_filter_ssse3(coef, f);

  // Prime the ring buffer with the interpolated rows for the 3 rows above,
  // the current row, and the 3 rows below (the - 3 column offset centers the
  // 8-tap horizontal filter).
  scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f);
  scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f);
  scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f);
  scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f);
  scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f);
  scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f);
  scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f);

  y = src_h;
  do {
    int x;
    // Compute the newest interpolated row (4 rows below the current one).
    scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f);
    for (x = 0; x < max_width; x += 8) {
      __m128i s[8], C, D, CD;

      // Even rows
      const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x));
      const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
      const __m128i ab = _mm_unpacklo_epi8(a, b);
      _mm_storeu_si128((__m128i *)(dst + 2 * x), ab);

      // Odd rows
      // Even columns: vertical 8-tap filter over source pixels.
      load_8bit_8x8(src + x - 3 * src_stride, src_stride, s);
      C = scale_1_to_2_phase_0_kernel(s, f);

      // Odd columns: vertical 8-tap filter over the precomputed
      // horizontally interpolated rows in the ring buffer.
      s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x));
      s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x));
      s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x));
      s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
      s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x));
      s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x));
      s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x));
      s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x));
      D = scale_1_to_2_phase_0_kernel(s, f);

      CD = _mm_unpacklo_epi8(C, D);
      _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD);
    }

    src += src_stride;
    dst += 2 * dst_stride;
    // Rotate the ring buffer: the oldest row's storage becomes the slot for
    // the next row to be computed.
    tmp[8] = tmp[0];
    tmp[0] = tmp[1];
    tmp[1] = tmp[2];
    tmp[2] = tmp[3];
    tmp[3] = tmp[4];
    tmp[4] = tmp[5];
    tmp[5] = tmp[6];
    tmp[6] = tmp[7];
    tmp[7] = tmp[8];
  } while (--y);
}
811
av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG * src,YV12_BUFFER_CONFIG * dst,const InterpFilter filter,const int phase,const int num_planes)812 void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
813 YV12_BUFFER_CONFIG *dst,
814 const InterpFilter filter,
815 const int phase, const int num_planes) {
816 // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
817 // the static analysis warnings.
818 int scaled = 0;
819 for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
820 const int is_uv = i > 0;
821 const int src_w = src->crop_widths[is_uv];
822 const int src_h = src->crop_heights[is_uv];
823 const int src_y_w = (src->crop_widths[0] + 1) & ~1;
824 const int dst_w = dst->crop_widths[is_uv];
825 const int dst_h = dst->crop_heights[is_uv];
826 const int dst_y_w = (dst->crop_widths[0] + 1) & ~1;
827 const int dst_y_h = (dst->crop_heights[0] + 1) & ~1;
828
829 if (2 * dst_w == src_w && 2 * dst_h == src_h) {
830 // 2 to 1
831 scaled = 1;
832 if (phase == 0) {
833 scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv],
834 dst->buffers[i], dst->strides[is_uv], dst_w,
835 dst_h);
836 } else if (filter == BILINEAR) {
837 const int16_t c0 = av1_bilinear_filters[phase][3];
838 const int16_t c1 = av1_bilinear_filters[phase][4];
839 const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0
840 scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv],
841 dst->buffers[i], dst->strides[is_uv], dst_w,
842 dst_h, c0c1);
843 } else {
844 const int buffer_stride = (dst_y_w + 3) & ~3;
845 const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
846 uint8_t *const temp_buffer =
847 (uint8_t *)malloc(buffer_stride * buffer_height);
848 if (temp_buffer) {
849 const InterpKernel *interp_kernel =
850 (const InterpKernel *)av1_interp_filter_params_list[filter]
851 .filter_ptr;
852 scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv],
853 dst->buffers[i], dst->strides[is_uv],
854 dst_w, dst_h, interp_kernel[phase],
855 temp_buffer);
856 free(temp_buffer);
857 } else {
858 scaled = 0;
859 }
860 }
861 } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
862 // 4 to 1
863 scaled = 1;
864 if (phase == 0) {
865 scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv],
866 dst->buffers[i], dst->strides[is_uv], dst_w,
867 dst_h);
868 } else if (filter == BILINEAR) {
869 const int16_t c0 = av1_bilinear_filters[phase][3];
870 const int16_t c1 = av1_bilinear_filters[phase][4];
871 const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0
872 scale_plane_4_to_1_bilinear(src->buffers[i], src->strides[is_uv],
873 dst->buffers[i], dst->strides[is_uv], dst_w,
874 dst_h, c0c1);
875 } else {
876 const int buffer_stride = (dst_y_w + 1) & ~1;
877 const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
878 // When dst_w is 1 or 2, we need extra padding to avoid heap read
879 // overflow
880 const int extra_padding = 16;
881 uint8_t *const temp_buffer =
882 (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding);
883 if (temp_buffer) {
884 const InterpKernel *interp_kernel =
885 (const InterpKernel *)av1_interp_filter_params_list[filter]
886 .filter_ptr;
887 scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv],
888 dst->buffers[i], dst->strides[is_uv],
889 dst_w, dst_h, interp_kernel[phase],
890 temp_buffer);
891 free(temp_buffer);
892 } else {
893 scaled = 0;
894 }
895 }
896 } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
897 // 4 to 3
898 const int buffer_stride_hor = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2;
899 const int buffer_stride_ver = (dst_y_w + 7) & ~7;
900 const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
901 // When the vertical filter reads more pixels than the horizontal filter
902 // generated in each row, we need extra padding to avoid heap read
903 // overflow. For example, the horizontal filter generates 18 pixels but
904 // the vertical filter reads 24 pixels in a row. The difference is
905 // multiplied by 2 since two rows are interlaced together in the
906 // optimization.
907 const int extra_padding =
908 (buffer_stride_ver > buffer_stride_hor)
909 ? 2 * (buffer_stride_ver - buffer_stride_hor)
910 : 0;
911 const int buffer_size = buffer_stride_hor * buffer_height + extra_padding;
912 uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size);
913 if (temp_buffer) {
914 scaled = 1;
915 const InterpKernel *interp_kernel =
916 (const InterpKernel *)av1_interp_filter_params_list[filter]
917 .filter_ptr;
918 scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv],
919 dst->buffers[i], dst->strides[is_uv], dst_w,
920 dst_h, interp_kernel, phase, temp_buffer);
921 free(temp_buffer);
922 } else {
923 scaled = 0;
924 }
925 } else if (dst_w == src_w * 2 && dst_h == src_h * 2) {
926 // 1 to 2
927 uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_y_w + 7) & ~7));
928 if (temp_buffer) {
929 scaled = 1;
930 const InterpKernel *interp_kernel =
931 (const InterpKernel *)av1_interp_filter_params_list[filter]
932 .filter_ptr;
933 scale_plane_1_to_2_phase_0(src->buffers[i], src->strides[is_uv],
934 dst->buffers[i], dst->strides[is_uv], src_w,
935 src_h, interp_kernel[8], temp_buffer);
936 free(temp_buffer);
937 } else {
938 scaled = 0;
939 }
940 }
941 }
942 if (!scaled) {
943 av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
944 } else {
945 aom_extend_frame_borders(dst, num_planes);
946 }
947 }
948