1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <tmmintrin.h>  // SSSE3
12 
13 #include <string.h>
14 
15 #include "./vpx_config.h"
16 #include "./vpx_dsp_rtcd.h"
17 #include "vpx_dsp/vpx_filter.h"
18 #include "vpx_dsp/x86/convolve.h"
19 #include "vpx_dsp/x86/convolve_sse2.h"
20 #include "vpx_dsp/x86/convolve_ssse3.h"
21 #include "vpx_dsp/x86/mem_sse2.h"
22 #include "vpx_dsp/x86/transpose_sse2.h"
23 #include "vpx_mem/vpx_mem.h"
24 #include "vpx_ports/mem.h"
25 
shuffle_filter_convolve8_8_ssse3(const __m128i * const s,const int16_t * const filter)26 static INLINE __m128i shuffle_filter_convolve8_8_ssse3(
27     const __m128i *const s, const int16_t *const filter) {
28   __m128i f[4];
29   shuffle_filter_ssse3(filter, f);
30   return convolve8_8_ssse3(s, f);
31 }
32 
33 // Used by the avx2 implementation.
34 #if VPX_ARCH_X86_64
35 // Use the intrinsics below
36 filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
37 filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
38 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
39 #define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3
40 #define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3
41 #define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3
42 #else  // VPX_ARCH_X86
43 // Use the assembly in vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm.
44 filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
45 filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
46 filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
47 #endif
48 
49 #if VPX_ARCH_X86_64
// Horizontal 8-tap convolution producing 4 output pixels per row:
//   out[x] = (sum_{k=0..7} filter[k] * src[x + k - 3] + 64) >> 7,
// saturated to [0, 255].  The rounding constant is added before the final
// saturating add so only one saturating add is needed.
void vpx_filter_block1d4_h8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
  __m128i srcRegFilt1, srcRegFilt2;
  __m128i addFilterReg64, filtersReg, srcReg;
  unsigned int i;

  // rounding constant: 64 in each of the eight 16-bit lanes
  addFilterReg64 = _mm_set1_epi16(64);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter into the first lane
  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
  // duplicate only the third 16 bits in the filter into the first lane
  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
  // duplicate only the second 16 bits in the filter into the second lane
  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
  // duplicate only the fourth 16 bits in the filter into the second lane
  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);

  // shuffle masks gathering the source byte pairs that line up with the
  // (k0,k1)/(k2,k3) and (k4,k5)/(k6,k7) tap pairs above
  shuffle1 = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6);
  shuffle2 = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // sum the results together, saturating only on the final step
    // the specific order of the additions prevents outranges
    srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2);

    // extract the higher half of the register
    srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);

    // add the rounding offset early to avoid another saturated add
    srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);

    // shift by 7 bit each 16 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
    src_ptr += src_pitch;

    // save only 4 bytes; memcpy avoids an unaligned, type-punned store
    // through an int lvalue, which is undefined behavior under the C
    // strict-aliasing and alignment rules
    {
      const int res = _mm_cvtsi128_si32(srcRegFilt1);
      memcpy(output_ptr, &res, 4);
    }

    output_ptr += output_pitch;
  }
}
115 
vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t * src_ptr,ptrdiff_t src_pitch,uint8_t * output_ptr,ptrdiff_t output_pitch,uint32_t output_height,const int16_t * filter)116 void vpx_filter_block1d8_h8_intrin_ssse3(
117     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
118     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
119   unsigned int i;
120   __m128i f[4], filt[4], s[4];
121 
122   shuffle_filter_ssse3(filter, f);
123   filt[0] = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
124   filt[1] = _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
125   filt[2] = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12);
126   filt[3] =
127       _mm_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14);
128 
129   for (i = 0; i < output_height; i++) {
130     const __m128i srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
131 
132     // filter the source buffer
133     s[0] = _mm_shuffle_epi8(srcReg, filt[0]);
134     s[1] = _mm_shuffle_epi8(srcReg, filt[1]);
135     s[2] = _mm_shuffle_epi8(srcReg, filt[2]);
136     s[3] = _mm_shuffle_epi8(srcReg, filt[3]);
137     s[0] = convolve8_8_ssse3(s, f);
138 
139     // shrink to 8 bit each 16 bits
140     s[0] = _mm_packus_epi16(s[0], s[0]);
141 
142     src_ptr += src_pitch;
143 
144     // save only 8 bytes
145     _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]);
146 
147     output_ptr += output_pitch;
148   }
149 }
150 
vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t * src_ptr,ptrdiff_t src_pitch,uint8_t * output_ptr,ptrdiff_t out_pitch,uint32_t output_height,const int16_t * filter)151 void vpx_filter_block1d8_v8_intrin_ssse3(
152     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
153     ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
154   unsigned int i;
155   __m128i f[4], s[8], ss[4];
156 
157   shuffle_filter_ssse3(filter, f);
158 
159   // load the first 7 rows of 8 bytes
160   s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch));
161   s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch));
162   s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
163   s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
164   s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
165   s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
166   s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
167 
168   for (i = 0; i < output_height; i++) {
169     // load the last 8 bytes
170     s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
171 
172     // merge the result together
173     ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
174     ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
175 
176     // merge the result together
177     ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
178     ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
179 
180     ss[0] = convolve8_8_ssse3(ss, f);
181     // shrink to 8 bit each 16 bits
182     ss[0] = _mm_packus_epi16(ss[0], ss[0]);
183 
184     src_ptr += src_pitch;
185 
186     // shift down a row
187     s[0] = s[1];
188     s[1] = s[2];
189     s[2] = s[3];
190     s[3] = s[4];
191     s[4] = s[5];
192     s[5] = s[6];
193     s[6] = s[7];
194 
195     // save only 8 bytes convolve result
196     _mm_storel_epi64((__m128i *)&output_ptr[0], ss[0]);
197 
198     output_ptr += out_pitch;
199   }
200 }
201 #endif  // VPX_ARCH_X86_64
202 
vpx_filter_block1d16_h4_ssse3(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,ptrdiff_t dst_stride,uint32_t height,const int16_t * kernel)203 static void vpx_filter_block1d16_h4_ssse3(const uint8_t *src_ptr,
204                                           ptrdiff_t src_stride,
205                                           uint8_t *dst_ptr,
206                                           ptrdiff_t dst_stride, uint32_t height,
207                                           const int16_t *kernel) {
208   // We will cast the kernel from 16-bit words to 8-bit words, and then extract
209   // the middle four elements of the kernel into two registers in the form
210   // ... k[3] k[2] k[3] k[2]
211   // ... k[5] k[4] k[5] k[4]
212   // Then we shuffle the source into
213   // ... s[1] s[0] s[0] s[-1]
214   // ... s[3] s[2] s[2] s[1]
215   // Calling multiply and add gives us half of the sum. Calling add gives us
216   // first half of the output. Repeat again to get the second half of the
217   // output. Finally we shuffle again to combine the two outputs.
218 
219   __m128i kernel_reg;                         // Kernel
220   __m128i kernel_reg_23, kernel_reg_45;       // Segments of the kernel used
221   const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
222   int h;
223 
224   __m128i src_reg, src_reg_shift_0, src_reg_shift_2;
225   __m128i dst_first, dst_second;
226   __m128i tmp_0, tmp_1;
227   __m128i idx_shift_0 =
228       _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
229   __m128i idx_shift_2 =
230       _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
231 
232   // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
233   src_ptr -= 1;
234 
235   // Load Kernel
236   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
237   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
238   kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
239   kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
240   kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
241 
242   for (h = height; h > 0; --h) {
243     // Load the source
244     src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
245     src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0);
246     src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2);
247 
248     // Partial result for first half
249     tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
250     tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
251     dst_first = _mm_adds_epi16(tmp_0, tmp_1);
252 
253     // Do again to get the second half of dst
254     // Load the source
255     src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
256     src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0);
257     src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2);
258 
259     // Partial result for first half
260     tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
261     tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
262     dst_second = _mm_adds_epi16(tmp_0, tmp_1);
263 
264     // Round each result
265     dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
266     dst_second = mm_round_epi16_sse2(&dst_second, &reg_32, 6);
267 
268     // Finally combine to get the final dst
269     dst_first = _mm_packus_epi16(dst_first, dst_second);
270     _mm_store_si128((__m128i *)dst_ptr, dst_first);
271 
272     src_ptr += src_stride;
273     dst_ptr += dst_stride;
274   }
275 }
276 
// Vertical 4-tap convolution for 16-pixel-wide blocks, producing two output
// rows per loop iteration.  Only the middle four taps (k[2]..k[5]) of
// |kernel| are used; the taps are halved before packing to bytes, which is
// compensated by rounding with 32 and shifting by 6 (instead of 64 and 7).
// NOTE(review): the loop runs while h > 1 in steps of 2, so it assumes
// |height| is even — an odd trailing row would be skipped. TODO confirm
// callers guarantee even heights for the 4-tap path.
static void vpx_filter_block1d16_v4_ssse3(const uint8_t *src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t *dst_ptr,
                                          ptrdiff_t dst_stride, uint32_t height,
                                          const int16_t *kernel) {
  // We will load two rows of pixels as 8-bit words, rearrange them into the
  // form
  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
  // so that we can call multiply and add with the kernel to get 16-bit words of
  // the form
  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
  // Finally, we can add multiple rows together to get the desired output.

  // Register for source s[-1:3, :]
  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi;
  __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  // Result after multiply and add
  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
  __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
  __m128i res_reg_m1012, res_reg_0123;
  __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi;

  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load kernel: halve the taps so they survive the signed pack to bytes,
  // then broadcast the (k2,k3) and (k4,k5) byte pairs across every lane.
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
  kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
  kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));

  // First shuffle the data: interleave rows -1 and 0 byte-wise so each
  // 16-bit lane holds a vertical pair ready for _mm_maddubs_epi16.
  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
  src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
  src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0);

  // More shuffling: pair rows 0 and 1 the same way.
  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
  src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
  src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1);

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));

    src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
    src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2);

    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));

    src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
    src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3);

    // Partial output from first half
    res_reg_m10_lo = _mm_maddubs_epi16(src_reg_m10_lo, kernel_reg_23);
    res_reg_01_lo = _mm_maddubs_epi16(src_reg_01_lo, kernel_reg_23);

    res_reg_12_lo = _mm_maddubs_epi16(src_reg_12_lo, kernel_reg_45);
    res_reg_23_lo = _mm_maddubs_epi16(src_reg_23_lo, kernel_reg_45);

    // Add to get first half of the results
    res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
    res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);

    // Partial output for second half
    res_reg_m10_hi = _mm_maddubs_epi16(src_reg_m10_hi, kernel_reg_23);
    res_reg_01_hi = _mm_maddubs_epi16(src_reg_01_hi, kernel_reg_23);

    res_reg_12_hi = _mm_maddubs_epi16(src_reg_12_hi, kernel_reg_45);
    res_reg_23_hi = _mm_maddubs_epi16(src_reg_23_hi, kernel_reg_45);

    // Second half of the results
    res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi);
    res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi);

    // Round the words
    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
    res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6);
    res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6);

    // Combine to get the result
    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi);
    res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi);

    _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
    _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    // Carry the already-interleaved row pairs into the next iteration:
    // rows (1,2) become the new (-1,0) and rows (2,3) the new (0,1), so
    // only two fresh row loads are needed per pass.
    src_reg_m10_lo = src_reg_12_lo;
    src_reg_m10_hi = src_reg_12_hi;
    src_reg_01_lo = src_reg_23_lo;
    src_reg_01_hi = src_reg_23_hi;
    src_reg_1 = src_reg_3;
  }
}
388 
vpx_filter_block1d8_h4_ssse3(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,ptrdiff_t dst_stride,uint32_t height,const int16_t * kernel)389 static void vpx_filter_block1d8_h4_ssse3(const uint8_t *src_ptr,
390                                          ptrdiff_t src_stride, uint8_t *dst_ptr,
391                                          ptrdiff_t dst_stride, uint32_t height,
392                                          const int16_t *kernel) {
393   // We will cast the kernel from 16-bit words to 8-bit words, and then extract
394   // the middle four elements of the kernel into two registers in the form
395   // ... k[3] k[2] k[3] k[2]
396   // ... k[5] k[4] k[5] k[4]
397   // Then we shuffle the source into
398   // ... s[1] s[0] s[0] s[-1]
399   // ... s[3] s[2] s[2] s[1]
400   // Calling multiply and add gives us half of the sum. Calling add gives us
401   // first half of the output. Repeat again to get the second half of the
402   // output. Finally we shuffle again to combine the two outputs.
403 
404   __m128i kernel_reg;                         // Kernel
405   __m128i kernel_reg_23, kernel_reg_45;       // Segments of the kernel used
406   const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
407   int h;
408 
409   __m128i src_reg, src_reg_shift_0, src_reg_shift_2;
410   __m128i dst_first;
411   __m128i tmp_0, tmp_1;
412   __m128i idx_shift_0 =
413       _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
414   __m128i idx_shift_2 =
415       _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
416 
417   // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
418   src_ptr -= 1;
419 
420   // Load Kernel
421   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
422   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
423   kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
424   kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
425   kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
426 
427   for (h = height; h > 0; --h) {
428     // Load the source
429     src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
430     src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0);
431     src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2);
432 
433     // Get the result
434     tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
435     tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
436     dst_first = _mm_adds_epi16(tmp_0, tmp_1);
437 
438     // Round round result
439     dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
440 
441     // Pack to 8-bits
442     dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
443     _mm_storel_epi64((__m128i *)dst_ptr, dst_first);
444 
445     src_ptr += src_stride;
446     dst_ptr += dst_stride;
447   }
448 }
449 
// Vertical 4-tap convolution for 8-pixel-wide blocks, producing two output
// rows per loop iteration.  Only the middle four taps (k[2]..k[5]) of
// |kernel| are used; the taps are halved before packing to bytes, which is
// compensated by rounding with 32 and shifting by 6 (instead of 64 and 7).
// NOTE(review): the loop runs while h > 1 in steps of 2, so it assumes
// |height| is even — an odd trailing row would be skipped. TODO confirm
// callers guarantee even heights for the 4-tap path.
static void vpx_filter_block1d8_v4_ssse3(const uint8_t *src_ptr,
                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
                                         ptrdiff_t dst_stride, uint32_t height,
                                         const int16_t *kernel) {
  // We will load two rows of pixels as 8-bit words, rearrange them into the
  // form
  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
  // so that we can call multiply and add with the kernel to get 16-bit words of
  // the form
  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
  // Finally, we can add multiple rows together to get the desired output.

  // Register for source s[-1:3, :]
  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
  // Interleaved rows of the source. lo is first half, hi second
  __m128i src_reg_m10, src_reg_01;
  __m128i src_reg_12, src_reg_23;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  // Result after multiply and add
  __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23;
  __m128i res_reg_m1012, res_reg_0123;

  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding

  // We will compute the result two rows at a time
  const ptrdiff_t src_stride_unrolled = src_stride << 1;
  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
  int h;

  // Load kernel: halve the taps so they survive the signed pack to bytes,
  // then broadcast the (k2,k3) and (k4,k5) byte pairs across every lane.
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
  kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
  kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));

  // First shuffle the data: interleave rows -1 and 0 byte-wise so each
  // 16-bit lane holds a vertical pair ready for _mm_maddubs_epi16.
  src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
  src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
  src_reg_m10 = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);

  // More shuffling: pair rows 0 and 1 the same way.
  src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
  src_reg_01 = _mm_unpacklo_epi8(src_reg_0, src_reg_1);

  for (h = height; h > 1; h -= 2) {
    src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));

    src_reg_12 = _mm_unpacklo_epi8(src_reg_1, src_reg_2);

    src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));

    src_reg_23 = _mm_unpacklo_epi8(src_reg_2, src_reg_3);

    // Partial output
    res_reg_m10 = _mm_maddubs_epi16(src_reg_m10, kernel_reg_23);
    res_reg_01 = _mm_maddubs_epi16(src_reg_01, kernel_reg_23);

    res_reg_12 = _mm_maddubs_epi16(src_reg_12, kernel_reg_45);
    res_reg_23 = _mm_maddubs_epi16(src_reg_23, kernel_reg_45);

    // Add to get entire output
    res_reg_m1012 = _mm_adds_epi16(res_reg_m10, res_reg_12);
    res_reg_0123 = _mm_adds_epi16(res_reg_01, res_reg_23);

    // Round the words
    res_reg_m1012 = mm_round_epi16_sse2(&res_reg_m1012, &reg_32, 6);
    res_reg_0123 = mm_round_epi16_sse2(&res_reg_0123, &reg_32, 6);

    // Pack from 16-bit to 8-bit
    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012, _mm_setzero_si128());
    res_reg_0123 = _mm_packus_epi16(res_reg_0123, _mm_setzero_si128());

    _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
    _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);

    // Update the source by two rows
    src_ptr += src_stride_unrolled;
    dst_ptr += dst_stride_unrolled;

    // Carry the interleaved pairs into the next iteration: rows (1,2)
    // become the new (-1,0) and rows (2,3) the new (0,1), so only two
    // fresh row loads are needed per pass.
    src_reg_m10 = src_reg_12;
    src_reg_01 = src_reg_23;
    src_reg_1 = src_reg_3;
  }
}
538 
vpx_filter_block1d4_h4_ssse3(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,ptrdiff_t dst_stride,uint32_t height,const int16_t * kernel)539 static void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr,
540                                          ptrdiff_t src_stride, uint8_t *dst_ptr,
541                                          ptrdiff_t dst_stride, uint32_t height,
542                                          const int16_t *kernel) {
543   // We will cast the kernel from 16-bit words to 8-bit words, and then extract
544   // the middle four elements of the kernel into a single register in the form
545   // k[5:2] k[5:2] k[5:2] k[5:2]
546   // Then we shuffle the source into
547   // s[5:2] s[4:1] s[3:0] s[2:-1]
548   // Calling multiply and add gives us half of the sum next to each other.
549   // Calling horizontal add then gives us the output.
550 
551   __m128i kernel_reg;                         // Kernel
552   const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
553   int h;
554 
555   __m128i src_reg, src_reg_shuf;
556   __m128i dst_first;
557   __m128i shuf_idx =
558       _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
559 
560   // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
561   src_ptr -= 1;
562 
563   // Load Kernel
564   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
565   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
566   kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
567   kernel_reg = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi32(0x05040302u));
568 
569   for (h = height; h > 0; --h) {
570     // Load the source
571     src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
572     src_reg_shuf = _mm_shuffle_epi8(src_reg, shuf_idx);
573 
574     // Get the result
575     dst_first = _mm_maddubs_epi16(src_reg_shuf, kernel_reg);
576     dst_first = _mm_hadds_epi16(dst_first, _mm_setzero_si128());
577 
578     // Round result
579     dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
580 
581     // Pack to 8-bits
582     dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
583     *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
584 
585     src_ptr += src_stride;
586     dst_ptr += dst_stride;
587   }
588 }
589 
vpx_filter_block1d4_v4_ssse3(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,ptrdiff_t dst_stride,uint32_t height,const int16_t * kernel)590 static void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr,
591                                          ptrdiff_t src_stride, uint8_t *dst_ptr,
592                                          ptrdiff_t dst_stride, uint32_t height,
593                                          const int16_t *kernel) {
594   // We will load two rows of pixels as 8-bit words, rearrange them into the
595   // form
596   // ... s[2,0] s[1,0] s[0,0] s[-1,0]
597   // so that we can call multiply and add with the kernel partial output. Then
598   // we can call horizontal add to get the output.
599   // Finally, we can add multiple rows together to get the desired output.
600   // This is done two rows at a time
601 
602   // Register for source s[-1:3, :]
603   __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
604   // Interleaved rows of the source.
605   __m128i src_reg_m10, src_reg_01;
606   __m128i src_reg_12, src_reg_23;
607   __m128i src_reg_m1001, src_reg_1223;
608   __m128i src_reg_m1012_1023_lo, src_reg_m1012_1023_hi;
609 
610   __m128i kernel_reg;  // Kernel
611 
612   // Result after multiply and add
613   __m128i reg_0, reg_1;
614 
615   const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
616 
617   // We will compute the result two rows at a time
618   const ptrdiff_t src_stride_unrolled = src_stride << 1;
619   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
620   int h;
621 
622   // Load Kernel
623   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
624   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
625   kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
626   kernel_reg = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi32(0x05040302u));
627 
628   // First shuffle the data
629   src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
630   src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
631   src_reg_m10 = _mm_unpacklo_epi32(src_reg_m1, src_reg_0);
632 
633   // More shuffling
634   src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
635   src_reg_01 = _mm_unpacklo_epi32(src_reg_0, src_reg_1);
636 
637   // Put three rows next to each other
638   src_reg_m1001 = _mm_unpacklo_epi8(src_reg_m10, src_reg_01);
639 
640   for (h = height; h > 1; h -= 2) {
641     src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));
642     src_reg_12 = _mm_unpacklo_epi32(src_reg_1, src_reg_2);
643 
644     src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));
645     src_reg_23 = _mm_unpacklo_epi32(src_reg_2, src_reg_3);
646 
647     // Put three rows next to each other
648     src_reg_1223 = _mm_unpacklo_epi8(src_reg_12, src_reg_23);
649 
650     // Put all four rows next to each other
651     src_reg_m1012_1023_lo = _mm_unpacklo_epi16(src_reg_m1001, src_reg_1223);
652     src_reg_m1012_1023_hi = _mm_unpackhi_epi16(src_reg_m1001, src_reg_1223);
653 
654     // Get the results
655     reg_0 = _mm_maddubs_epi16(src_reg_m1012_1023_lo, kernel_reg);
656     reg_1 = _mm_maddubs_epi16(src_reg_m1012_1023_hi, kernel_reg);
657     reg_0 = _mm_hadds_epi16(reg_0, _mm_setzero_si128());
658     reg_1 = _mm_hadds_epi16(reg_1, _mm_setzero_si128());
659 
660     // Round the words
661     reg_0 = mm_round_epi16_sse2(&reg_0, &reg_32, 6);
662     reg_1 = mm_round_epi16_sse2(&reg_1, &reg_32, 6);
663 
664     // Pack from 16-bit to 8-bit and put them in the right order
665     reg_0 = _mm_packus_epi16(reg_0, reg_0);
666     reg_1 = _mm_packus_epi16(reg_1, reg_1);
667 
668     // Save the result
669     *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0);
670     *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1);
671 
672     // Update the source by two rows
673     src_ptr += src_stride_unrolled;
674     dst_ptr += dst_stride_unrolled;
675 
676     src_reg_m1001 = src_reg_1223;
677     src_reg_1 = src_reg_3;
678   }
679 }
680 
// Assembly implementations, see vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm.
filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;

// Use the [vh]8 version because there is no [vh]4 implementation.
#define vpx_filter_block1d16_v4_avg_ssse3 vpx_filter_block1d16_v8_avg_ssse3
#define vpx_filter_block1d16_h4_avg_ssse3 vpx_filter_block1d16_h8_avg_ssse3
#define vpx_filter_block1d8_v4_avg_ssse3 vpx_filter_block1d8_v8_avg_ssse3
#define vpx_filter_block1d8_h4_avg_ssse3 vpx_filter_block1d8_h8_avg_ssse3
#define vpx_filter_block1d4_v4_avg_ssse3 vpx_filter_block1d4_v8_avg_ssse3
#define vpx_filter_block1d4_h4_avg_ssse3 vpx_filter_block1d4_h8_avg_ssse3

// Bilinear (2-tap) assembly implementations, see
// vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm.
filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;

// The FUN_CONV_1D invocations below expand to the public entry points with
// the following prototypes:
// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                uint8_t *dst, ptrdiff_t dst_stride,
//                                const InterpKernel *filter, int x0_q4,
//                                int32_t x_step_q4, int y0_q4, int y_step_q4,
//                                int w, int h);
// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const InterpKernel *filter, int x0_q4,
//                               int32_t x_step_q4, int y0_q4, int y_step_q4,
//                               int w, int h);
// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                    uint8_t *dst, ptrdiff_t dst_stride,
//                                    const InterpKernel *filter, int x0_q4,
//                                    int32_t x_step_q4, int y0_q4,
//                                    int y_step_q4, int w, int h);
// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                   uint8_t *dst, ptrdiff_t dst_stride,
//                                   const InterpKernel *filter, int x0_q4,
//                                   int32_t x_step_q4, int y0_q4,
//                                   int y_step_q4, int w, int h);
FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0);
FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
            ssse3, 0);
FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1);
FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
            src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1);
740 
filter_horiz_w8_ssse3(const uint8_t * const src,const ptrdiff_t src_stride,uint8_t * const dst,const int16_t * const x_filter)741 static void filter_horiz_w8_ssse3(const uint8_t *const src,
742                                   const ptrdiff_t src_stride,
743                                   uint8_t *const dst,
744                                   const int16_t *const x_filter) {
745   __m128i s[8], ss[4], temp;
746 
747   load_8bit_8x8(src, src_stride, s);
748   // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
749   // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
750   // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
751   // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
752   transpose_16bit_4x8(s, ss);
753   temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter);
754   // shrink to 8 bit each 16 bits
755   temp = _mm_packus_epi16(temp, temp);
756   // save only 8 bytes convolve result
757   _mm_storel_epi64((__m128i *)dst, temp);
758 }
759 
// Copy an 8x8 byte tile from src to dst, transposing it in registers.
static void transpose8x8_to_dst(const uint8_t *const src,
                                const ptrdiff_t src_stride, uint8_t *const dst,
                                const ptrdiff_t dst_stride) {
  __m128i tile[8];

  load_8bit_8x8(src, src_stride, tile);
  transpose_8bit_8x8(tile, tile);
  store_8bit_8x8(tile, dst, dst_stride);
}
769 
// Horizontal pass of the scaled convolution for widths >= 8. Walks the output
// in 8x8 tiles: each tile is filtered column-by-column into a small scratch
// buffer and then transposed out to dst. x_q4 is the 1/16th-pel source x
// position advanced by x_step_q4 per output column.
static void scaledconvolve_horiz_w8(const uint8_t *src,
                                    const ptrdiff_t src_stride, uint8_t *dst,
                                    const ptrdiff_t dst_stride,
                                    const InterpKernel *const x_filters,
                                    const int x0_q4, const int x_step_q4,
                                    const int w, const int h) {
  // Scratch tile holding one 8x8 block of filtered pixels before transpose.
  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
  int x, y, z;
  // Back up to the first tap of the 8-tap filter.
  src -= SUBPEL_TAPS / 2 - 1;

  // This function processes 8x8 areas. The intermediate height is not always
  // a multiple of 8, so force it to be a multiple of 8 here.
  // NOTE(review): when h is already a multiple of 8 this rounds up to h + 8,
  // processing one extra 8-row strip; the caller's buffer sizing appears to
  // account for this ("+ 8 rows for the horiz_w8 transpose tail") — confirm.
  y = h + (8 - (h & 0x7));

  do {
    int x_q4 = x0_q4;
    for (x = 0; x < w; x += 8) {
      // process 8 src_x steps
      for (z = 0; z < 8; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
        } else {
          // Full-pel position: no filtering needed; copy the source pixel.
          // Offset 3 (== SUBPEL_TAPS / 2 - 1) undoes the earlier src shift.
          int i;
          for (i = 0; i < 8; ++i) {
            temp[z * 8 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 8x8 filter values back to dst
      transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
    }

    src += src_stride * 8;
    dst += dst_stride * 8;
  } while (y -= 8);
}
810 
filter_horiz_w4_ssse3(const uint8_t * const src,const ptrdiff_t src_stride,uint8_t * const dst,const int16_t * const filter)811 static void filter_horiz_w4_ssse3(const uint8_t *const src,
812                                   const ptrdiff_t src_stride,
813                                   uint8_t *const dst,
814                                   const int16_t *const filter) {
815   __m128i s[4], ss[2];
816   __m128i temp;
817 
818   load_8bit_8x4(src, src_stride, s);
819   transpose_16bit_4x4(s, ss);
820   // 00 01 10 11 20 21 30 31
821   s[0] = ss[0];
822   // 02 03 12 13 22 23 32 33
823   s[1] = _mm_srli_si128(ss[0], 8);
824   // 04 05 14 15 24 25 34 35
825   s[2] = ss[1];
826   // 06 07 16 17 26 27 36 37
827   s[3] = _mm_srli_si128(ss[1], 8);
828 
829   temp = shuffle_filter_convolve8_8_ssse3(s, filter);
830   // shrink to 8 bit each 16 bits
831   temp = _mm_packus_epi16(temp, temp);
832   // save only 4 bytes
833   *(int *)dst = _mm_cvtsi128_si32(temp);
834 }
835 
transpose4x4_to_dst(const uint8_t * const src,const ptrdiff_t src_stride,uint8_t * const dst,const ptrdiff_t dst_stride)836 static void transpose4x4_to_dst(const uint8_t *const src,
837                                 const ptrdiff_t src_stride, uint8_t *const dst,
838                                 const ptrdiff_t dst_stride) {
839   __m128i s[4];
840 
841   load_8bit_4x4(src, src_stride, s);
842   s[0] = transpose_8bit_4x4(s);
843   s[1] = _mm_srli_si128(s[0], 4);
844   s[2] = _mm_srli_si128(s[0], 8);
845   s[3] = _mm_srli_si128(s[0], 12);
846   store_8bit_4x4(s, dst, dst_stride);
847 }
848 
// Horizontal pass of the scaled convolution for widths < 8. Walks the output
// in 4x4 tiles: each tile is filtered column-by-column into a small scratch
// buffer and then transposed out to dst. x_q4 is the 1/16th-pel source x
// position advanced by x_step_q4 per output column.
static void scaledconvolve_horiz_w4(const uint8_t *src,
                                    const ptrdiff_t src_stride, uint8_t *dst,
                                    const ptrdiff_t dst_stride,
                                    const InterpKernel *const x_filters,
                                    const int x0_q4, const int x_step_q4,
                                    const int w, const int h) {
  // Scratch tile holding one 4x4 block of filtered pixels before transpose.
  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
  int x, y, z;
  // Back up to the first tap of the 8-tap filter.
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; y += 4) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; x += 4) {
      // process 4 src_x steps
      for (z = 0; z < 4; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
        } else {
          // Full-pel position: no filtering needed; copy the source pixel.
          // Offset 3 (== SUBPEL_TAPS / 2 - 1) undoes the earlier src shift.
          int i;
          for (i = 0; i < 4; ++i) {
            temp[z * 4 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 4x4 filter values back to dst
      transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
    }

    src += src_stride * 4;
    dst += dst_stride * 4;
  }
}
885 
filter_vert_kernel(const __m128i * const s,const int16_t * const filter)886 static __m128i filter_vert_kernel(const __m128i *const s,
887                                   const int16_t *const filter) {
888   __m128i ss[4];
889   __m128i temp;
890 
891   // 00 10 01 11 02 12 03 13
892   ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
893   // 20 30 21 31 22 32 23 33
894   ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
895   // 40 50 41 51 42 52 43 53
896   ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
897   // 60 70 61 71 62 72 63 73
898   ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
899 
900   temp = shuffle_filter_convolve8_8_ssse3(ss, filter);
901   // shrink to 8 bit each 16 bits
902   return _mm_packus_epi16(temp, temp);
903 }
904 
filter_vert_w4_ssse3(const uint8_t * const src,const ptrdiff_t src_stride,uint8_t * const dst,const int16_t * const filter)905 static void filter_vert_w4_ssse3(const uint8_t *const src,
906                                  const ptrdiff_t src_stride, uint8_t *const dst,
907                                  const int16_t *const filter) {
908   __m128i s[8];
909   __m128i temp;
910 
911   load_8bit_4x8(src, src_stride, s);
912   temp = filter_vert_kernel(s, filter);
913   // save only 4 bytes
914   *(int *)dst = _mm_cvtsi128_si32(temp);
915 }
916 
scaledconvolve_vert_w4(const uint8_t * src,const ptrdiff_t src_stride,uint8_t * const dst,const ptrdiff_t dst_stride,const InterpKernel * const y_filters,const int y0_q4,const int y_step_q4,const int w,const int h)917 static void scaledconvolve_vert_w4(
918     const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
919     const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
920     const int y0_q4, const int y_step_q4, const int w, const int h) {
921   int y;
922   int y_q4 = y0_q4;
923 
924   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
925   for (y = 0; y < h; ++y) {
926     const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
927     const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
928 
929     if (y_q4 & SUBPEL_MASK) {
930       filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
931     } else {
932       memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
933     }
934 
935     y_q4 += y_step_q4;
936   }
937 }
938 
filter_vert_w8_ssse3(const uint8_t * const src,const ptrdiff_t src_stride,uint8_t * const dst,const int16_t * const filter)939 static void filter_vert_w8_ssse3(const uint8_t *const src,
940                                  const ptrdiff_t src_stride, uint8_t *const dst,
941                                  const int16_t *const filter) {
942   __m128i s[8], temp;
943 
944   load_8bit_8x8(src, src_stride, s);
945   temp = filter_vert_kernel(s, filter);
946   // save only 8 bytes convolve result
947   _mm_storel_epi64((__m128i *)dst, temp);
948 }
949 
scaledconvolve_vert_w8(const uint8_t * src,const ptrdiff_t src_stride,uint8_t * const dst,const ptrdiff_t dst_stride,const InterpKernel * const y_filters,const int y0_q4,const int y_step_q4,const int w,const int h)950 static void scaledconvolve_vert_w8(
951     const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
952     const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
953     const int y0_q4, const int y_step_q4, const int w, const int h) {
954   int y;
955   int y_q4 = y0_q4;
956 
957   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
958   for (y = 0; y < h; ++y) {
959     const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
960     const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
961     if (y_q4 & SUBPEL_MASK) {
962       filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
963     } else {
964       memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
965     }
966     y_q4 += y_step_q4;
967   }
968 }
969 
// Vertically filter one row of w output pixels (w a multiple of 16), reading
// 8 source rows per 16-wide chunk. The filter taps are shuffled once up front
// and reused for every chunk.
static void filter_vert_w16_ssse3(const uint8_t *src,
                                  const ptrdiff_t src_stride,
                                  uint8_t *const dst,
                                  const int16_t *const filter, const int w) {
  int i;
  __m128i f[4];
  shuffle_filter_ssse3(filter, f);

  for (i = 0; i < w; i += 16) {
    __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi;

    // Unaligned loads: 8 rows of 16 bytes each.
    loadu_8bit_16x8(src, src_stride, s);

    // Interleave adjacent rows so each 16-bit lane holds a vertical tap
    // pair; low and high byte halves are convolved separately.
    s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]);
    s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]);
    s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]);
    s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]);
    s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]);
    s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]);
    s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]);
    s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]);
    temp_lo = convolve8_8_ssse3(s_lo, f);
    temp_hi = convolve8_8_ssse3(s_hi, f);

    // shrink to 8 bit each 16 bits, the first lane contain the first convolve
    // result and the second lane contain the second convolve result
    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
    // Advance the source to the next 16-wide chunk.
    src += 16;
    // Save 16 bytes of convolve result. Note this is an aligned store, so
    // dst must be 16-byte aligned (the temp buffer in the caller is).
    _mm_store_si128((__m128i *)&dst[i], temp_hi);
  }
}
1003 
scaledconvolve_vert_w16(const uint8_t * src,const ptrdiff_t src_stride,uint8_t * const dst,const ptrdiff_t dst_stride,const InterpKernel * const y_filters,const int y0_q4,const int y_step_q4,const int w,const int h)1004 static void scaledconvolve_vert_w16(
1005     const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
1006     const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
1007     const int y0_q4, const int y_step_q4, const int w, const int h) {
1008   int y;
1009   int y_q4 = y0_q4;
1010 
1011   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1012   for (y = 0; y < h; ++y) {
1013     const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1014     const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1015     if (y_q4 & SUBPEL_MASK) {
1016       filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
1017                             w);
1018     } else {
1019       memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
1020     }
1021     y_q4 += y_step_q4;
1022   }
1023 }
1024 
vpx_scaled_2d_ssse3(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * filter,int x0_q4,int x_step_q4,int y0_q4,int y_step_q4,int w,int h)1025 void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1026                          ptrdiff_t dst_stride, const InterpKernel *filter,
1027                          int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
1028                          int w, int h) {
1029   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
1030   // 2d filtering proceeds in 2 steps:
1031   //   (1) Interpolate horizontally into an intermediate buffer, temp.
1032   //   (2) Interpolate temp vertically to derive the sub-pixel result.
1033   // Deriving the maximum number of rows in the temp buffer (135):
1034   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1035   // --Largest block size is 64x64 pixels.
1036   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
1037   //   original frame (in 1/16th pixel units).
1038   // --Must round-up because block may be located at sub-pixel position.
1039   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1040   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
1041   // --Require an additional 8 rows for the horiz_w8 transpose tail.
1042   // When calling in frame scaling function, the smallest scaling factor is x1/4
1043   // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
1044   // big enough.
1045   DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
1046   const int intermediate_height =
1047       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1048 
1049   assert(w <= 64);
1050   assert(h <= 64);
1051   assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
1052   assert(x_step_q4 <= 64);
1053 
1054   if (w >= 8) {
1055     scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1056                             src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
1057                             intermediate_height);
1058   } else {
1059     scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1060                             src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
1061                             intermediate_height);
1062   }
1063 
1064   if (w >= 16) {
1065     scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
1066                             dst_stride, filter, y0_q4, y_step_q4, w, h);
1067   } else if (w == 8) {
1068     scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
1069                            dst_stride, filter, y0_q4, y_step_q4, w, h);
1070   } else {
1071     scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
1072                            dst_stride, filter, y0_q4, y_step_q4, w, h);
1073   }
1074 }
1075 
// The FUN_CONV_2D invocations below expand to the public 2-D entry points
// with the following prototypes:
// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                          uint8_t *dst, ptrdiff_t dst_stride,
//                          const InterpKernel *filter, int x0_q4,
//                          int32_t x_step_q4, int y0_q4, int y_step_q4,
//                          int w, int h);
// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const InterpKernel *filter, int x0_q4,
//                              int32_t x_step_q4, int y0_q4, int y_step_q4,
//                              int w, int h);
FUN_CONV_2D(, ssse3, 0);
FUN_CONV_2D(avg_, ssse3, 1);
1088