1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 // Due to a header conflict between math.h and intrinsics includes with ceil()
12 // in certain configurations under vs9 this include needs to precede
13 // immintrin.h.
14 #include "./vp9_rtcd.h"
15
16 #include <immintrin.h>
17
18 #include "vp9/common/x86/convolve.h"
19 #include "vpx_ports/mem.h"
20
// Shuffle-control masks for the 16-wide horizontal 8-tap filter (and shared
// by the 16_v8 path's helpers). Each mask makes _mm256_shuffle_epi8 gather
// overlapping pairs of adjacent source bytes so that _mm256_maddubs_epi16
// can apply two filter taps per 16-bit lane. filt1 covers taps 0/1,
// filt2 taps 2/3, filt3 taps 4/5, filt4 taps 6/7. The 16-byte pattern is
// duplicated so both 128-bit lanes of the 256-bit register behave alike.
DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};

DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};

DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};

DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
41
// Portability shim for broadcasting a 128-bit register into both lanes of a
// 256-bit register. Early clang (<= 3.3, and Apple clang 5.0) and gcc (<= 4.7)
// shipped the AVX2 intrinsic under the wrong name `_mm_broadcastsi128_si256`;
// the oldest variants additionally took a pointer argument instead of a value.
// Newer compilers provide the standard `_mm256_broadcastsi128_si256`.
#if defined(__clang__)
# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
(defined(__APPLE__) && __clang_major__ == 5 && __clang_minor__ == 0)
#  define MM256_BROADCASTSI128_SI256(x) \
       _mm_broadcastsi128_si256((__m128i const *)&(x))
# else  // clang > 3.3, and not 5.0 on macosx.
#  define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
# endif  // clang <= 3.3
#elif defined(__GNUC__)
# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#  define MM256_BROADCASTSI128_SI256(x) \
       _mm_broadcastsi128_si256((__m128i const *)&(x))
# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#  define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
# else  // gcc > 4.7
#  define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
# endif  // gcc <= 4.6
#else  // !(gcc || clang)
# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // __clang__
62
// Horizontal 8-tap convolution of a 16-pixel-wide column of rows.
//
// For each output pixel x, computes
//   clip8((sum_{k=0..7} src[x - 3 + k] * filter[k] + 64) >> 7)
// using AVX2. Two rows are processed per loop iteration, one row per 128-bit
// lane of each 256-bit register; an odd output_height is finished by a
// single-row SSE-width tail.
//
//   src_ptr             - points at the first output pixel of the first row;
//                         the filter reads bytes [x-3, x+4], so the caller
//                         must provide 3 bytes of left and 4 of right margin.
//   src_pixels_per_line - source row stride in bytes.
//   output_ptr          - destination; 16-byte stores, so it must be
//                         16-byte aligned (uses _mm_store_si128).
//   output_pitch        - destination row stride in bytes.
//   output_height       - number of rows to produce (may be odd).
//   filter              - 8 signed 16-bit taps; must fit int8 after packing.
static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_pixels_per_line,
                                         uint8_t *output_ptr,
                                         ptrdiff_t output_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (the rounding term added before the >> 7 at the end)
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and forth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32,
                  _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x504u));
  // duplicate only the forth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x706u));

  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  // multiple the size of the source and destination stride by two
  // (two rows are produced per iteration)
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i-=2) {
    // load the 2 strides of source: row 0 in the low lane, row 1 in the
    // high lane, starting 3 bytes left of the first output pixel
    srcReg32b1 = _mm256_castsi128_si256(
                 _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
    srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
                 _mm_loadu_si128((const __m128i *)
                 (src_ptr+src_pixels_per_line-3)), 1);

    // filter the source buffer
    srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together; the two middle partial sums
    // are folded in as min first, then max, so the saturating adds see the
    // smaller magnitude first — this ordering bounds the clipping error
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // reading 2 strides of the next 16 bytes
    // (part of it was being read by earlier read)
    // interleaved here to hide the load latency before the max-add below
    srcReg32b2 = _mm256_castsi128_si256(
                 _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
    srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
                 _mm_loadu_si128((const __m128i *)
                 (src_ptr+src_pixels_per_line+5)), 1);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // same pipeline for output pixels 8..15 of each row:
    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together (min first, then max — see above)
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // add the rounding term before the shift
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);

    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
                                           srcRegFilt32b2_1);

    src_ptr+=src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr,
                    _mm256_castsi256_si128(srcRegFilt32b1_1));

    // save the next 16 bytes (second row)
    _mm_store_si128((__m128i*)(output_ptr+output_pitch),
                    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
    output_ptr+=dst_stride;
  }

  // if the number of strides is odd.
  // process only 16 bytes (one row) with 128-bit ops, reusing the low
  // lanes of the precomputed filter/shuffle registers
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
                    _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3= _mm_shuffle_epi8(srcReg1,
                 _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2= _mm_shuffle_epi8(srcReg1,
                 _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together (min first, then max)
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // reading the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
                    _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together (min first, then max)
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // add the rounding term before the shift
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm256_castsi256_si128(addFilterReg64));

    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm256_castsi256_si128(addFilterReg64));

    // shift by 7 bit each 16 bit
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
  }
}
306
// Vertical 8-tap convolution of a 16-pixel-wide block.
//
// For each output pixel, combines the 8 vertically adjacent source pixels
// src[(y-0..y+7) * src_pitch] with the taps, rounds (+64) and shifts by 7,
// then clips to 8 bits. Two output rows are produced per loop iteration:
// rows y and y+1 are paired in the two 128-bit lanes, and consecutive rows
// are byte-interleaved (unpacklo/unpackhi) so _mm256_maddubs_epi16 can apply
// two taps at once. The interleaved registers are rotated at the bottom of
// the loop so each new iteration only loads two fresh rows.
//
//   src_ptr       - points at the first (top) source row used by the filter;
//                   the caller pre-offsets by -3 * src_pitch (see FUN_CONV_1D
//                   instantiation below).
//   src_pitch     - source row stride in bytes.
//   output_ptr    - destination; must be 16-byte aligned (_mm_store_si128).
//   out_pitch     - destination row stride in bytes.
//   output_height - number of rows to produce (may be odd; single-row tail).
//   filter        - 8 signed 16-bit taps; must fit int8 after packing.
static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_pitch,
                                         uint8_t *output_ptr,
                                         ptrdiff_t out_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (the rounding term added before the >> 7 at the end)
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the
  // same data in both lanes of 128 bit register.
  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and forth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32,
                  _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x504u));
  // duplicate only the forth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x706u));

  // multiple the size of the source and destination stride by two
  // (two rows are produced per iteration)
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 16 bytes 7 times in stride of src_pitch
  srcReg32b1 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr)));
  srcReg32b2 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
  srcReg32b3 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
  srcReg32b4 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
  srcReg32b5 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
  srcReg32b6 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
  srcReg32b7 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));

  // have each consecutive loads on the same 256 register
  // (row k in the low lane, row k+1 in the high lane)
  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
               _mm256_castsi256_si128(srcReg32b2), 1);
  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
               _mm256_castsi256_si128(srcReg32b3), 1);
  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
               _mm256_castsi256_si128(srcReg32b4), 1);
  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
               _mm256_castsi256_si128(srcReg32b5), 1);
  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
               _mm256_castsi256_si128(srcReg32b6), 1);
  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
               _mm256_castsi256_si128(srcReg32b7), 1);

  // merge every two consecutive registers except the last one:
  // byte-interleave rows (k, k+1) so maddubs can apply tap pairs.
  // lo = left 8 pixels, hi = right 8 pixels of the 16-wide block.
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);

  // save
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);

  // save
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);

  // save
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);

  // save
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);


  for (i = output_height; i > 1; i-=2) {
    // load the last 2 loads of 16 bytes and have every two
    // consecutive loads in the same 256 bit register
    srcReg32b8 = _mm256_castsi128_si256(
                 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
                 _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
                 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
                 _mm256_castsi256_si128(srcReg32b9), 1);

    // merge every two consecutive registers
    // save
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

    // left 8 pixels:
    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

    // add and saturate the results together; folding in min first, then
    // max, bounds the error from the saturating 16-bit adds
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                  _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                  _mm256_max_epi16(srcReg32b8, srcReg32b12));

    // right 8 pixels:
    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

    // add and saturate the results together (min first, then max)
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                 _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                 _mm256_max_epi16(srcReg32b8, srcReg32b12));

    // add the rounding term before the shift
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

    src_ptr+=src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr,
                    _mm256_castsi256_si128(srcReg32b1));

    // save the next 16 bytes (second row)
    _mm_store_si128((__m128i*)(output_ptr+out_pitch),
                    _mm256_extractf128_si256(srcReg32b1, 1));

    output_ptr+=dst_stride;

    // save part of the registers for next strides: rotate the interleaved
    // row pairs down by two rows so only two fresh loads are needed
    srcReg32b10 = srcReg32b11;
    srcReg32b1 = srcReg32b3;
    srcReg32b11 = srcReg32b2;
    srcReg32b3 = srcReg32b5;
    srcReg32b2 = srcReg32b4;
    srcReg32b5 = srcReg32b7;
    srcReg32b7 = srcReg32b9;
  }
  // odd output_height: one remaining row, computed with 128-bit ops on the
  // low lanes of the rotated registers
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    // load the last 16 bytes
    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the last 2 results together
    srcRegFilt4 = _mm_unpacklo_epi8(
                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    srcRegFilt7 = _mm_unpackhi_epi8(
                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                  _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
                  _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                  _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);


    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                  _mm256_castsi256_si128(secondFilters));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                  _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together (min first, then max)
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm_min_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm_min_epi16(srcRegFilt5, srcRegFilt7));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm_max_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm_max_epi16(srcRegFilt5, srcRegFilt7));

    // add the rounding term before the shift
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(addFilterReg64));

    // shift by 7 bit each 16 bit
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
  }
}
552
#if HAVE_AVX2 && HAVE_SSSE3
// Only the 16-wide 8-tap kernels have true AVX2 implementations (above).
// The narrower (4/8-wide) and 2-tap bilinear cases fall back to the SSSE3
// versions via the #defines below, so the FUN_CONV_* tables are complete.
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
#if ARCH_X86_64
filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
#else  // ARCH_X86
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
#endif  // ARCH_X86_64
filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3
#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3
#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3
#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3
// FUN_CONV_1D / FUN_CONV_2D (vp9/common/x86/convolve.h) expand to the public
// entry points below, dispatching on block width and filter type:
// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const int16_t *filter_x, int x_step_q4,
//                              const int16_t *filter_y, int y_step_q4,
//                              int w, int h);
// The vertical filter reads 3 rows above the output position, hence the
// src - src_stride * 3 offset.
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);

// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                         uint8_t *dst, ptrdiff_t dst_stride,
//                         const int16_t *filter_x, int x_step_q4,
//                         const int16_t *filter_y, int y_step_q4,
//                         int w, int h);
FUN_CONV_2D(, avx2);
#endif  // HAVE_AVX2 && HAVE_SSSE3
603