/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdlib.h>     // malloc()/free()
#include <tmmintrin.h>  // SSSE3

#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx_dsp/x86/convolve_ssse3.h"
#include "vpx_dsp/x86/mem_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_scale/yv12config.h"

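// A scalar sketch of the kernel below (illustration only): the mask keeps
// the even-indexed source bytes as 16-bit lanes and packus narrows them back
// to 8 bits, i.e. dst[i] = src[2 * i] for i in [0, 16).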
static INLINE __m128i scale_plane_2_to_1_phase_0_kernel(
    const uint8_t *const src, const __m128i *const mask) {
  const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0]));
  const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16]));
  const __m128i a_and = _mm_and_si128(a, *mask);
  const __m128i b_and = _mm_and_si128(b, *mask);
  return _mm_packus_epi16(a_and, b_and);
}

static void scale_plane_2_to_1_phase_0(const uint8_t *src,
                                       const ptrdiff_t src_stride, uint8_t *dst,
                                       const ptrdiff_t dst_stride,
                                       const int dst_w, const int dst_h) {
  const int max_width = (dst_w + 15) & ~15;
  const __m128i mask = _mm_set1_epi16(0x00FF);
  int y = dst_h;

  do {
    int x = max_width;
    do {
      const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask);
      _mm_storeu_si128((__m128i *)dst, d);
      src += 32;
      dst += 16;
      x -= 16;
    } while (x);
    src += 2 * (src_stride - max_width);
    dst += dst_stride - max_width;
  } while (--y);
}

static void scale_plane_4_to_1_phase_0(const uint8_t *src,
                                       const ptrdiff_t src_stride, uint8_t *dst,
                                       const ptrdiff_t dst_stride,
                                       const int dst_w, const int dst_h) {
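  // With the epi32 mask, the kernel above keeps one byte out of every four;
  // after the extra pack below this is, in scalar terms (illustration only),
  // dst[i] = src[4 * i] for i in [0, 16).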
  const int max_width = (dst_w + 15) & ~15;
  const __m128i mask = _mm_set1_epi32(0x000000FF);
  int y = dst_h;

  do {
    int x = max_width;
    do {
      const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask);
      const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask);
      const __m128i d2 = _mm_packus_epi16(d0, d1);
      _mm_storeu_si128((__m128i *)dst, d2);
      src += 64;
      dst += 16;
      x -= 16;
    } while (x);
    src += 4 * (src_stride - max_width);
    dst += dst_stride - max_width;
  } while (--y);
}

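// Scalar sketch of the kernel below (illustration only): s[] holds
// interleaved pixel pairs and c0c1 holds the interleaved coefficient pair,
// so each output byte is
//   out = clip_pixel((c0 * p0 + c1 * p1 + 64) >> 7);
// where clip_pixel() stands in for the unsigned saturation done by packus.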
static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s,
                                                  const __m128i c0c1) {
  const __m128i k_64 = _mm_set1_epi16(1 << 6);
  const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1);
  const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1);
  // round, then shift right by 7 bits for each 16-bit lane
  const __m128i t2 = _mm_adds_epi16(t0, k_64);
  const __m128i t3 = _mm_adds_epi16(t1, k_64);
  const __m128i t4 = _mm_srai_epi16(t2, 7);
  const __m128i t5 = _mm_srai_epi16(t3, 7);
  return _mm_packus_epi16(t4, t5);
}

static void scale_plane_2_to_1_bilinear(const uint8_t *src,
                                        const ptrdiff_t src_stride,
                                        uint8_t *dst,
                                        const ptrdiff_t dst_stride,
                                        const int dst_w, const int dst_h,
                                        const __m128i c0c1) {
  const int max_width = (dst_w + 15) & ~15;
  int y = dst_h;

  do {
    int x = max_width;
    do {
      __m128i s[2], d[2];

      // Horizontal
      // Even rows
      s[0] = _mm_loadu_si128((const __m128i *)(src + 0));
      s[1] = _mm_loadu_si128((const __m128i *)(src + 16));
      d[0] = scale_plane_bilinear_kernel(s, c0c1);

      // Odd rows
      s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
      s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
      d[1] = scale_plane_bilinear_kernel(s, c0c1);

      // Vertical
      s[0] = _mm_unpacklo_epi8(d[0], d[1]);
      s[1] = _mm_unpackhi_epi8(d[0], d[1]);
      d[0] = scale_plane_bilinear_kernel(s, c0c1);

      _mm_storeu_si128((__m128i *)dst, d[0]);
      src += 32;
      dst += 16;
      x -= 16;
    } while (x);
    src += 2 * (src_stride - max_width);
    dst += dst_stride - max_width;
  } while (--y);
}

static void scale_plane_4_to_1_bilinear(const uint8_t *src,
                                        const ptrdiff_t src_stride,
                                        uint8_t *dst,
                                        const ptrdiff_t dst_stride,
                                        const int dst_w, const int dst_h,
                                        const __m128i c0c1) {
  const int max_width = (dst_w + 15) & ~15;
  int y = dst_h;

  do {
    int x = max_width;
    do {
      __m128i s[8], d[8];

      // Note: Using _mm_packus_epi32() in SSE4.1 could be faster.
      //       Here we avoid shuffle instructions, which can be slow on some
      //       x86 CPUs.

      // Horizontal
      // 000 001 xx xx 004 005 xx xx  008 009 xx xx 00C 00D xx xx
      // 010 011 xx xx 014 015 xx xx  018 019 xx xx 01C 01D xx xx
      // 020 021 xx xx 024 025 xx xx  028 029 xx xx 02C 02D xx xx
      // 030 031 xx xx 034 035 xx xx  038 039 xx xx 03C 03D xx xx
      // 100 101 xx xx 104 105 xx xx  108 109 xx xx 10C 10D xx xx
      // 110 111 xx xx 114 115 xx xx  118 119 xx xx 11C 11D xx xx
      // 120 121 xx xx 124 125 xx xx  128 129 xx xx 12C 12D xx xx
      // 130 131 xx xx 134 135 xx xx  138 139 xx xx 13C 13D xx xx
      s[0] = _mm_loadu_si128((const __m128i *)(&src[0]));
      s[1] = _mm_loadu_si128((const __m128i *)(&src[16]));
      s[2] = _mm_loadu_si128((const __m128i *)(&src[32]));
      s[3] = _mm_loadu_si128((const __m128i *)(&src[48]));
      s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
      s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
      s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32));
      s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48));

      // 000 001 100 101 xx xx xx xx  004 005 104 105 xx xx xx xx
      // 008 009 108 109 xx xx xx xx  00C 00D 10C 10D xx xx xx xx
      // 010 011 110 111 xx xx xx xx  014 015 114 115 xx xx xx xx
      // 018 019 118 119 xx xx xx xx  01C 01D 11C 11D xx xx xx xx
      // 020 021 120 121 xx xx xx xx  024 025 124 125 xx xx xx xx
      // 028 029 128 129 xx xx xx xx  02C 02D 12C 12D xx xx xx xx
      // 030 031 130 131 xx xx xx xx  034 035 134 135 xx xx xx xx
      // 038 039 138 139 xx xx xx xx  03C 03D 13C 13D xx xx xx xx
      d[0] = _mm_unpacklo_epi16(s[0], s[4]);
      d[1] = _mm_unpackhi_epi16(s[0], s[4]);
      d[2] = _mm_unpacklo_epi16(s[1], s[5]);
      d[3] = _mm_unpackhi_epi16(s[1], s[5]);
      d[4] = _mm_unpacklo_epi16(s[2], s[6]);
      d[5] = _mm_unpackhi_epi16(s[2], s[6]);
      d[6] = _mm_unpacklo_epi16(s[3], s[7]);
      d[7] = _mm_unpackhi_epi16(s[3], s[7]);

      // 000 001 100 101 008 009 108 109  xx xx xx xx xx xx xx xx
      // 004 005 104 105 00C 00D 10C 10D  xx xx xx xx xx xx xx xx
      // 010 011 110 111 018 019 118 119  xx xx xx xx xx xx xx xx
      // 014 015 114 115 01C 01D 11C 11D  xx xx xx xx xx xx xx xx
      // 020 021 120 121 028 029 128 129  xx xx xx xx xx xx xx xx
      // 024 025 124 125 02C 02D 12C 12D  xx xx xx xx xx xx xx xx
      // 030 031 130 131 038 039 138 139  xx xx xx xx xx xx xx xx
      // 034 035 134 135 03C 03D 13C 13D  xx xx xx xx xx xx xx xx
      s[0] = _mm_unpacklo_epi32(d[0], d[1]);
      s[1] = _mm_unpackhi_epi32(d[0], d[1]);
      s[2] = _mm_unpacklo_epi32(d[2], d[3]);
      s[3] = _mm_unpackhi_epi32(d[2], d[3]);
      s[4] = _mm_unpacklo_epi32(d[4], d[5]);
      s[5] = _mm_unpackhi_epi32(d[4], d[5]);
      s[6] = _mm_unpacklo_epi32(d[6], d[7]);
      s[7] = _mm_unpackhi_epi32(d[6], d[7]);

      // 000 001 100 101 004 005 104 105  008 009 108 109 00C 00D 10C 10D
      // 010 011 110 111 014 015 114 115  018 019 118 119 01C 01D 11C 11D
      // 020 021 120 121 024 025 124 125  028 029 128 129 02C 02D 12C 12D
      // 030 031 130 131 034 035 134 135  038 039 138 139 03C 03D 13C 13D
      d[0] = _mm_unpacklo_epi32(s[0], s[1]);
      d[1] = _mm_unpacklo_epi32(s[2], s[3]);
      d[2] = _mm_unpacklo_epi32(s[4], s[5]);
      d[3] = _mm_unpacklo_epi32(s[6], s[7]);

      d[0] = scale_plane_bilinear_kernel(&d[0], c0c1);
      d[1] = scale_plane_bilinear_kernel(&d[2], c0c1);

      // Vertical
      d[0] = scale_plane_bilinear_kernel(d, c0c1);

      _mm_storeu_si128((__m128i *)dst, d[0]);
      src += 64;
      dst += 16;
      x -= 16;
    } while (x);
    src += 4 * (src_stride - max_width);
    dst += dst_stride - max_width;
  } while (--y);
}

static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
                                       uint8_t *dst, const int dst_stride,
                                       const int w, const int h,
                                       const int16_t *const coef,
                                       uint8_t *const temp_buffer) {
  const int width_hor = (w + 3) & ~3;
  const int width_ver = (w + 7) & ~7;
  const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
  const int height_ver = (h + 3) & ~3;
  int x, y = height_hor;
  uint8_t *t = temp_buffer;
  __m128i s[11], d[4];
  __m128i f[4];

  assert(w && h);

  shuffle_filter_ssse3(coef, f);
  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;

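  // Two-pass scaling: the horizontal pass below filters along each row and
  // stores its output (two rows interleaved, as sketched in the comments) in
  // temp_buffer; the vertical pass then reads temp_buffer back and filters
  // down the columns.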
  // horizontal 4x8
  do {
    load_8bit_8x8(src + 2, src_stride, s);
    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77 (overlapped)
    transpose_16bit_4x8(s, s);
    x = width_hor;

    do {
      src += 8;
      load_8bit_8x8(src, src_stride, &s[3]);
      // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
      // 0C 0D 1C 1D 2C 2D 3C 3D  4C 4D 5C 5D 6C 6D 7C 7D
      transpose_16bit_4x8(&s[3], &s[3]);

      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
      d[1] = convolve8_8_ssse3(&s[1], f);  // 01 11 21 31 41 51 61 71
      d[2] = convolve8_8_ssse3(&s[2], f);  // 02 12 22 32 42 52 62 72
      d[3] = convolve8_8_ssse3(&s[3], f);  // 03 13 23 33 43 53 63 73

      // 00 10 20 30 40 50 60 70  02 12 22 32 42 52 62 72
      // 01 11 21 31 41 51 61 71  03 13 23 33 43 53 63 73
      d[0] = _mm_packus_epi16(d[0], d[2]);
      d[1] = _mm_packus_epi16(d[1], d[3]);
      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
      // 02 12 03 13 22 32 23 33  42 52 43 53 62 72 63 73
      d[2] = _mm_unpacklo_epi16(d[0], d[1]);
      d[3] = _mm_unpackhi_epi16(d[0], d[1]);
      // 00 10 01 11 02 12 03 13  20 30 21 31 22 32 23 33
      // 40 50 41 51 42 52 43 53  60 70 61 71 62 72 63 73
      d[0] = _mm_unpacklo_epi32(d[2], d[3]);
      d[1] = _mm_unpackhi_epi32(d[2], d[3]);
      store_8bit_8x4_from_16x2(d, t, 2 * width_hor);

      s[0] = s[4];
      s[1] = s[5];
      s[2] = s[6];

      t += 8;
      x -= 4;
    } while (x);
    src += 8 * src_stride - 2 * width_hor;
    t += 6 * width_hor;
    y -= 8;
  } while (y);

  // vertical 8x4
  x = width_ver;
  t = temp_buffer;
  do {
    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
    // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
    s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor));
    t += 6 * width_hor;
    y = height_ver;

    do {
      // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 97
      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 B7
      // C0 D0 C1 D1 C2 D2 C3 D3  C4 D4 C5 D5 C6 D6 C7 D7
      loadu_8bit_16x4(t, 2 * width_hor, &s[3]);
      t += 8 * width_hor;

      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
      d[1] = convolve8_8_ssse3(&s[1], f);  // 10 11 12 13 14 15 16 17
      d[2] = convolve8_8_ssse3(&s[2], f);  // 20 21 22 23 24 25 26 27
      d[3] = convolve8_8_ssse3(&s[3], f);  // 30 31 32 33 34 35 36 37

      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
      d[0] = _mm_packus_epi16(d[0], d[1]);
      d[1] = _mm_packus_epi16(d[2], d[3]);
      store_8bit_8x4_from_16x2(d, dst, dst_stride);

      s[0] = s[4];
      s[1] = s[5];
      s[2] = s[6];

      dst += 4 * dst_stride;
      y -= 4;
    } while (y);
    t -= width_hor * (2 * height_ver + 6);
    t += 16;
    dst -= height_ver * dst_stride;
    dst += 8;
    x -= 8;
  } while (x);
}

static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
                                       uint8_t *dst, const int dst_stride,
                                       const int w, const int h,
                                       const int16_t *const coef,
                                       uint8_t *const temp_buffer) {
  const int width_hor = (w + 1) & ~1;
  const int width_ver = (w + 7) & ~7;
  const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
  const int height_ver = (h + 1) & ~1;
  int x, y = height_hor;
  uint8_t *t = temp_buffer;
  __m128i s[11], d[4];
  __m128i f[4];

  assert(w && h);

  shuffle_filter_ssse3(coef, f);
  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;

  // horizontal 2x8
  do {
    load_8bit_8x8(src + 4, src_stride, s);
    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75 (overlapped)
    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77 (overlapped)
    transpose_16bit_4x8(s, s);
    x = width_hor;

    do {
      src += 8;
      load_8bit_8x8(src, src_stride, &s[2]);
      // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
      // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
      transpose_16bit_4x8(&s[2], &s[2]);

      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
      d[1] = convolve8_8_ssse3(&s[2], f);  // 01 11 21 31 41 51 61 71

      // 00 10 20 30 40 50 60 70  xx xx xx xx xx xx xx xx
      // 01 11 21 31 41 51 61 71  xx xx xx xx xx xx xx xx
      d[0] = _mm_packus_epi16(d[0], d[0]);
      d[1] = _mm_packus_epi16(d[1], d[1]);
      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
      d[0] = _mm_unpacklo_epi16(d[0], d[1]);
      store_8bit_4x4_sse2(d[0], t, 2 * width_hor);

      s[0] = s[4];
      s[1] = s[5];

      t += 4;
      x -= 2;
    } while (x);
    src += 8 * src_stride - 4 * width_hor;
    t += 6 * width_hor;
    y -= 8;
  } while (y);

  // vertical 8x2
  x = width_ver;
  t = temp_buffer;
  do {
    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
    t += 4 * width_hor;
    y = height_ver;

    do {
      // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
      // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 97
      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 B7
      loadu_8bit_16x4(t, 2 * width_hor, &s[2]);
      t += 8 * width_hor;

      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
      d[1] = convolve8_8_ssse3(&s[2], f);  // 10 11 12 13 14 15 16 17

      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
      d[0] = _mm_packus_epi16(d[0], d[1]);
      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);

      s[0] = s[4];
      s[1] = s[5];

      dst += 2 * dst_stride;
      y -= 2;
    } while (y);
    t -= width_hor * (4 * height_ver + 4);
    t += 16;
    dst -= height_ver * dst_stride;
    dst += 8;
    x -= 8;
  } while (x);
}

typedef void (*shuffle_filter_funcs)(const int16_t *const filter,
                                     __m128i *const f);

typedef __m128i (*convolve8_funcs)(const __m128i *const s,
                                   const __m128i *const f);

static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
                                       uint8_t *dst, const int dst_stride,
                                       const int w, const int h,
                                       const InterpKernel *const coef,
                                       const int phase_scaler,
                                       uint8_t *const temp_buffer) {
  static const int step_q4 = 16 * 4 / 3;
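  // 16 * 4 / 3 truncates to 21: the source position advances 21/16 of a
  // pixel (in Q4) per output pixel.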
  const int width_hor = (w + 5) - ((w + 5) % 6);
  const int stride_hor = 2 * width_hor + 4;  // store 4 extra pixels
  const int width_ver = (w + 7) & ~7;
  // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
  // above and (SUBPEL_TAPS / 2) extra rows below.
  const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
  const int height_ver = (h + 5) - ((h + 5) % 6);
  int x, y = height_hor;
  uint8_t *t = temp_buffer;
  __m128i s[12], d[6], dd[4];
  __m128i f0[4], f1[5], f2[5];
  // The offset of the first row is always less than 1 pixel.
  const int offset1_q4 = phase_scaler + 1 * step_q4;
  const int offset2_q4 = phase_scaler + 2 * step_q4;
  // offset_idx1 and offset_idx2 indicate whether the pixel offset is even
  // (0) or odd (1). They are used to choose the src offset and the filter
  // coefficient offset.
  const int offset_idx1 = (offset1_q4 >> 4) & 1;
  const int offset_idx2 = (offset2_q4 >> 4) & 1;
  static const shuffle_filter_funcs shuffle_filter_funcs[2] = {
    shuffle_filter_ssse3, shuffle_filter_odd_ssse3
  };
  static const convolve8_funcs convolve8_funcs[2] = {
    convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3
  };

  assert(w && h);

  shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0);
  shuffle_filter_funcs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1);
  shuffle_filter_funcs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2);

  // Subtract 64 to avoid overflow.
  // Coef 128 would be treated as -128 in PMADDUBSW, so subtract 64 here.
  // Coef 128 is in either fx[1] or fx[2] depending on the phase index.
  // When the filter phase index is 1, the two biggest coefficients are
  // shuffled together, and their sum is always no less than 128, so subtract
  // 64 there as well. After the subtraction, as long as the sum of all
  // positive coefficients is no larger than 128 and the sum of all negative
  // coefficients is no less than -128, there is no overflow in the convolve8
  // functions.
  f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64));
  f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64));
  f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64));
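  // Worked example (illustration only): with phase_scaler == 0 the f0 kernel
  // is { 0, 0, 0, 128, 0, 0, 0, 0 } in Q7, and 128 wraps to -128 as a signed
  // byte. After subtracting 64 the tap value is 64, which fits; the convolve8
  // helpers are expected to add the subtracted 64 * pixel terms back before
  // rounding and shifting.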

  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1;

  // horizontal 6x8
  do {
    load_8bit_8x8(src, src_stride, s);
    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
    transpose_16bit_4x8(s, s);
    x = width_hor;

    do {
      src += 8;
      load_8bit_8x8(src, src_stride, &s[4]);
      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
      // 0C 0D 1C 1D 2C 2D 3C 3D  4C 4D 5C 5D 6C 6D 7C 7D
      // 0E 0F 1E 1F 2E 2F 3E 3F  4E 4F 5E 5F 6E 6F 7E 7F
      transpose_16bit_4x8(&s[4], &s[4]);

      // 00 10 20 30 40 50 60 70
      // 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72
      // 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74
      // 05 15 25 35 45 55 65 75
      d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
      d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
      d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
      d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
      d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
      d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);

      // 00 10 20 30 40 50 60 70  02 12 22 32 42 52 62 72
      // 01 11 21 31 41 51 61 71  03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74  xx xx xx xx xx xx xx xx
      // 05 15 25 35 45 55 65 75  xx xx xx xx xx xx xx xx
      dd[0] = _mm_packus_epi16(d[0], d[2]);
      dd[1] = _mm_packus_epi16(d[1], d[3]);
      dd[2] = _mm_packus_epi16(d[4], d[4]);
      dd[3] = _mm_packus_epi16(d[5], d[5]);

      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
      // 02 12 03 13 22 32 23 33  42 52 43 53 62 72 63 73
      // 04 14 05 15 24 34 25 35  44 54 45 55 64 74 65 75
      d[0] = _mm_unpacklo_epi16(dd[0], dd[1]);
      d[1] = _mm_unpackhi_epi16(dd[0], dd[1]);
      d[2] = _mm_unpacklo_epi16(dd[2], dd[3]);

      // 00 10 01 11 02 12 03 13  20 30 21 31 22 32 23 33
      // 40 50 41 51 42 52 43 53  60 70 61 71 62 72 63 73
      // 04 14 05 15 xx xx xx xx  24 34 25 35 xx xx xx xx
      // 44 54 45 55 xx xx xx xx  64 74 65 75 xx xx xx xx
      dd[0] = _mm_unpacklo_epi32(d[0], d[1]);
      dd[1] = _mm_unpackhi_epi32(d[0], d[1]);
      dd[2] = _mm_unpacklo_epi32(d[2], d[2]);
      dd[3] = _mm_unpackhi_epi32(d[2], d[2]);

      // 00 10 01 11 02 12 03 13  04 14 05 15 xx xx xx xx
      // 20 30 21 31 22 32 23 33  24 34 25 35 xx xx xx xx
      // 40 50 41 51 42 52 43 53  44 54 45 55 xx xx xx xx
      // 60 70 61 71 62 72 63 73  64 74 65 75 xx xx xx xx
      d[0] = _mm_unpacklo_epi64(dd[0], dd[2]);
      d[1] = _mm_unpackhi_epi64(dd[0], dd[2]);
      d[2] = _mm_unpacklo_epi64(dd[1], dd[3]);
      d[3] = _mm_unpackhi_epi64(dd[1], dd[3]);

      // store 4 extra pixels
      storeu_8bit_16x4(d, t, stride_hor);

      s[0] = s[4];
      s[1] = s[5];
      s[2] = s[6];
      s[3] = s[7];

      t += 12;
      x -= 6;
    } while (x);
    src += 8 * src_stride - 4 * width_hor / 3;
    t += 3 * stride_hor + 4;
    y -= 8;
  } while (y);

  // vertical 8x6
  x = width_ver;
  t = temp_buffer;
  do {
    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
    // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
    // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
    loadu_8bit_16x4(t, stride_hor, s);
    y = height_ver;

    do {
      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 97
      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 B7
      // C0 D0 C1 D1 C2 D2 C3 D3  C4 D4 C5 D5 C6 D6 C7 D7
      // E0 F0 E1 F1 E2 F2 E3 F3  E4 F4 E5 F5 E6 F6 E7 F7
      t += 4 * stride_hor;
      loadu_8bit_16x4(t, stride_hor, &s[4]);

      d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
      d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
      d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
      d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
      d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
      d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);

      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
      // 40 41 42 43 44 45 46 47  50 51 52 53 54 55 56 57
      d[0] = _mm_packus_epi16(d[0], d[1]);
      d[2] = _mm_packus_epi16(d[2], d[3]);
      d[4] = _mm_packus_epi16(d[4], d[5]);

      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]);
      _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]);
      _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]);
      _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]);

      s[0] = s[4];
      s[1] = s[5];
      s[2] = s[6];
      s[3] = s[7];

      dst += 6 * dst_stride;
      y -= 6;
    } while (y);
    t -= stride_hor * 2 * height_ver / 3;
    t += 16;
    dst -= height_ver * dst_stride;
    dst += 8;
    x -= 8;
  } while (x);
}

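// Scalar sketch of the kernel below (illustration only): each output lane is
// an 8-tap filter applied at the half-pel position, i.e.
//   out[j] = clip_pixel((f[0] * src[j] + f[1] * src[j + 1] + ... +
//                        f[7] * src[j + 7] + 64) >> 7);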
static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s,
                                                  const __m128i *const f) {
  __m128i ss[4], temp;

  ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
  ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
  ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
  ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
  temp = convolve8_8_ssse3(ss, f);
  return _mm_packus_epi16(temp, temp);
}

// Only calculate odd columns since even columns are just src pixels' copies.
static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst,
                                     const int w, const __m128i *const f) {
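  // The eight staggered loads below each start one byte further into src;
  // after the pairwise unpacks in the kernel they provide the eight filter
  // taps for eight consecutive half-pel outputs.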
  int x = w;

  do {
    __m128i s[8], temp;
    s[0] = _mm_loadl_epi64((const __m128i *)(src + 0));
    s[1] = _mm_loadl_epi64((const __m128i *)(src + 1));
    s[2] = _mm_loadl_epi64((const __m128i *)(src + 2));
    s[3] = _mm_loadl_epi64((const __m128i *)(src + 3));
    s[4] = _mm_loadl_epi64((const __m128i *)(src + 4));
    s[5] = _mm_loadl_epi64((const __m128i *)(src + 5));
    s[6] = _mm_loadl_epi64((const __m128i *)(src + 6));
    s[7] = _mm_loadl_epi64((const __m128i *)(src + 7));
    temp = scale_1_to_2_phase_0_kernel(s, f);
    _mm_storel_epi64((__m128i *)dst, temp);
    src += 8;
    dst += 8;
    x -= 8;
  } while (x);
}

static void scale_plane_1_to_2_phase_0(const uint8_t *src,
                                       const ptrdiff_t src_stride, uint8_t *dst,
                                       const ptrdiff_t dst_stride,
                                       const int src_w, const int src_h,
                                       const int16_t *const coef,
                                       uint8_t *const temp_buffer) {
  int max_width;
  int y;
  uint8_t *tmp[9];
  __m128i f[4];

  max_width = (src_w + 7) & ~7;
  tmp[0] = temp_buffer + 0 * max_width;
  tmp[1] = temp_buffer + 1 * max_width;
  tmp[2] = temp_buffer + 2 * max_width;
  tmp[3] = temp_buffer + 3 * max_width;
  tmp[4] = temp_buffer + 4 * max_width;
  tmp[5] = temp_buffer + 5 * max_width;
  tmp[6] = temp_buffer + 6 * max_width;
  tmp[7] = temp_buffer + 7 * max_width;
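  // tmp[0..7] act as a ring of filtered rows: each iteration of the y loop
  // below filters one new source row into tmp[7] and then rotates the eight
  // pointers (tmp[8] is scratch for the rotation).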

  shuffle_filter_ssse3(coef, f);

  scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f);
  scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f);
  scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f);
  scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f);
  scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f);
  scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f);
  scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f);

  y = src_h;
  do {
    int x;
    scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f);
    for (x = 0; x < max_width; x += 8) {
      __m128i s[8], C, D, CD;

      // Even rows
      const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x));
      const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
      const __m128i ab = _mm_unpacklo_epi8(a, b);
      _mm_storeu_si128((__m128i *)(dst + 2 * x), ab);

      // Odd rows
      // Even columns
      load_8bit_8x8(src + x - 3 * src_stride, src_stride, s);
      C = scale_1_to_2_phase_0_kernel(s, f);

      // Odd columns
      s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x));
      s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x));
      s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x));
      s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
      s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x));
      s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x));
      s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x));
      s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x));
      D = scale_1_to_2_phase_0_kernel(s, f);

      CD = _mm_unpacklo_epi8(C, D);
      _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD);
    }

    src += src_stride;
    dst += 2 * dst_stride;
    tmp[8] = tmp[0];
    tmp[0] = tmp[1];
    tmp[1] = tmp[2];
    tmp[2] = tmp[3];
    tmp[3] = tmp[4];
    tmp[4] = tmp[5];
    tmp[5] = tmp[6];
    tmp[6] = tmp[7];
    tmp[7] = tmp[8];
  } while (--y);
}

void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
                                      YV12_BUFFER_CONFIG *dst,
                                      uint8_t filter_type, int phase_scaler) {
  const int src_w = src->y_crop_width;
  const int src_h = src->y_crop_height;
  const int dst_w = dst->y_crop_width;
  const int dst_h = dst->y_crop_height;
  const int dst_uv_w = dst_w / 2;
  const int dst_uv_h = dst_h / 2;
  int scaled = 0;

  // phase_scaler is usually 0 or 8.
  assert(phase_scaler >= 0 && phase_scaler < 16);

  if (dst_w * 2 == src_w && dst_h * 2 == src_h) {
    // 2 to 1
    scaled = 1;

    if (phase_scaler == 0) {
      scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
                                 dst->y_stride, dst_w, dst_h);
      scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
                                 dst->uv_stride, dst_uv_w, dst_uv_h);
      scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
                                 dst->uv_stride, dst_uv_w, dst_uv_h);
    } else if (filter_type == BILINEAR) {
      const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
      const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
      const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8));  // c0 and c1 >= 0
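      // Each 16-bit lane of c0c1 is (c1 << 8) | c0, so _mm_maddubs_epi16()
      // pairs c0 with the even source byte and c1 with the odd one.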
      scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
                                  dst->y_stride, dst_w, dst_h, c0c1);
      scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
      scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
    } else {
      const int buffer_stride = (dst_w + 3) & ~3;
      const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
      uint8_t *const temp_buffer =
          (uint8_t *)malloc(buffer_stride * buffer_height);
      if (temp_buffer) {
        scale_plane_2_to_1_general(
            src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
            dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
        scale_plane_2_to_1_general(
            src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
            temp_buffer);
        scale_plane_2_to_1_general(
            src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
            temp_buffer);
        free(temp_buffer);
      } else {
        scaled = 0;
      }
    }
  } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
    // 4 to 1
    scaled = 1;
    if (phase_scaler == 0) {
      scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
                                 dst->y_stride, dst_w, dst_h);
      scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
                                 dst->uv_stride, dst_uv_w, dst_uv_h);
      scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
                                 dst->uv_stride, dst_uv_w, dst_uv_h);
    } else if (filter_type == BILINEAR) {
      const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
      const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
      const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8));  // c0 and c1 >= 0
      scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
                                  dst->y_stride, dst_w, dst_h, c0c1);
      scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
      scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
    } else {
      const int buffer_stride = (dst_w + 1) & ~1;
      const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
      // When dst_w is 1 or 2, we need extra padding to avoid heap read
      // overflow.
      const int extra_padding = 16;
      uint8_t *const temp_buffer =
          (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding);
      if (temp_buffer) {
        scale_plane_4_to_1_general(
            src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
            dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
        scale_plane_4_to_1_general(
            src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
            temp_buffer);
        scale_plane_4_to_1_general(
            src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
            temp_buffer);
        free(temp_buffer);
      } else {
        scaled = 0;
      }
    }
  } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
    // 4 to 3
    const int buffer_stride_hor = (dst_w + 5) - ((dst_w + 5) % 6) + 2;
    const int buffer_stride_ver = (dst_w + 7) & ~7;
    const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
    // When the vertical filter reads more pixels than the horizontal filter
    // generated in each row, we need extra padding to avoid heap read
    // overflow. For example, the horizontal filter generates 18 pixels but
    // the vertical filter reads 24 pixels in a row. The difference is
    // multiplied by 2 since two rows are interlaced together in the
    // optimization.
    const int extra_padding = (buffer_stride_ver > buffer_stride_hor)
                                  ? 2 * (buffer_stride_ver - buffer_stride_hor)
                                  : 0;
    const int buffer_size = buffer_stride_hor * buffer_height + extra_padding;
    uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size);
    if (temp_buffer) {
      scaled = 1;
      scale_plane_4_to_3_general(
          src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
          dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer);
      scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer,
                                 dst->uv_stride, dst_uv_w, dst_uv_h,
                                 vp9_filter_kernels[filter_type], phase_scaler,
                                 temp_buffer);
      scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer,
                                 dst->uv_stride, dst_uv_w, dst_uv_h,
                                 vp9_filter_kernels[filter_type], phase_scaler,
                                 temp_buffer);
      free(temp_buffer);
    }
  } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) {
    // 1 to 2
    uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_w + 7) & ~7));
    if (temp_buffer) {
      scaled = 1;
      scale_plane_1_to_2_phase_0(
          src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src_w,
          src_h, vp9_filter_kernels[filter_type][8], temp_buffer);
      scale_plane_1_to_2_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
                                 dst->uv_stride, src_w / 2, src_h / 2,
                                 vp9_filter_kernels[filter_type][8],
                                 temp_buffer);
      scale_plane_1_to_2_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
                                 dst->uv_stride, src_w / 2, src_h / 2,
                                 vp9_filter_kernels[filter_type][8],
                                 temp_buffer);
      free(temp_buffer);
    }
  }

  if (scaled) {
    vpx_extend_frame_borders(dst);
  } else {
    // Call the C version for all other scaling ratios.
    vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler);
  }
}