/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_
#define VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_

#include <assert.h>
#include <tmmintrin.h>  // SSSE3

#include "./vpx_config.h"

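// Packs an 8-tap, 16-bit filter into four registers of duplicated signed
// byte pairs, the layout _mm_maddubs_epi16() expects in the convolve helpers
// below: f[i] holds taps 2*i and 2*i+1 repeated across all eight lanes.
// filter must be 16-byte aligned and each tap must fit in a signed byte.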
static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
                                        __m128i *const f) {
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
}

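// As above, but for convolutions starting at an odd source offset: the eight
// taps are spread across five registers of byte pairs, with a zero byte
// padding the front of each pair in f[0] and the back of each pair in f[4].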
static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
                                            __m128i *const f) {
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  // It relies on the fact that the high byte of filter[3] is always 0 to
  // zero out one byte of each pair in f[0] and f[4].
  assert(filter[3] >= 0 && filter[3] < 256);
  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));
  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));
  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));
  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));
  f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));
}

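// Computes eight 8-tap convolution sums from pre-shuffled inputs: s[i] holds
// the source byte pairs matching the duplicated tap pairs in f[i] (as
// produced by shuffle_filter_ssse3). Returns ((sum + 64) >> 7) per 16-bit
// lane, i.e. the result rounded to FILTER_BITS == 7 precision.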
static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
                                        const __m128i *const f) {
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i k_64 = _mm_set1_epi16(1 << 6);
  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
  __m128i sum1, sum2;

  // sum the results together, saturating only on the final step
  // adding x0 with x2 and x1 with x3 is the only order that avoids overflow
  // for all filters
  sum1 = _mm_add_epi16(x0, x2);
  sum2 = _mm_add_epi16(x1, x3);
  // add the rounding offset early to avoid another saturated add
  sum1 = _mm_add_epi16(sum1, k_64);
  sum1 = _mm_adds_epi16(sum1, sum2);
  // arithmetic shift each 16-bit value right by 7 bits
  sum1 = _mm_srai_epi16(sum1, 7);
  return sum1;
}

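// Variant for filters whose largest tap does not fit in a signed byte: the
// caller is expected to have subtracted 64 from the tap packed into f[1]
// before shuffling. The extra multiply of s[1] by a constant 64 adds the
// subtracted weight back before rounding.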
static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
                                                    const __m128i *const f) {
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i k_64 = _mm_set1_epi16(1 << 6);
  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
  // compensate for the 64 subtracted from f[1]; x4 is always non-negative.
  const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64));
  // add and saturate the results together
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, x1);
  temp = _mm_adds_epi16(temp, x2);
  temp = _mm_adds_epi16(temp, x4);
  // round and shift each 16-bit value right by 7 bits
  temp = _mm_adds_epi16(temp, k_64);
  temp = _mm_srai_epi16(temp, 7);
  return temp;
}

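// Odd-offset counterpart: the source/filter pairs are spread across five
// registers (as produced by shuffle_filter_odd_ssse3), and the 64 subtracted
// by the caller is compensated in f[2] via the extra multiply of s[2].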
static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s,
                                                   const __m128i *const f) {
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i k_64 = _mm_set1_epi16(1 << 6);
  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
  const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
  // compensate for the 64 subtracted from f[2]; x5 is always non-negative.
  const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64));
  __m128i temp;

  // add and saturate the results together
  temp = _mm_adds_epi16(x0, x1);
  temp = _mm_adds_epi16(temp, x2);
  temp = _mm_adds_epi16(temp, x3);
  temp = _mm_adds_epi16(temp, x4);
  temp = _mm_adds_epi16(temp, x5);
  // round and shift each 16-bit value right by 7 bits
  temp = _mm_adds_epi16(temp, k_64);
  temp = _mm_srai_epi16(temp, 7);
  return temp;
}

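// Illustrative usage sketch (not part of the original header): an 8-tap
// vertical convolution producing a single 8-pixel-wide output row, showing
// how the helpers above compose. The function name and signature are
// hypothetical; real callers in the SSSE3 convolve code arrange their
// sources similarly.
static INLINE void convolve8_vert_w8_example_ssse3(
    const uint8_t *src, const int src_stride, uint8_t *dst,
    const int16_t *const filter) {
  __m128i f[4], s[4], res;
  // load 8 consecutive rows of 8 pixels each
  const __m128i r0 = _mm_loadl_epi64((const __m128i *)(src + 0 * src_stride));
  const __m128i r1 = _mm_loadl_epi64((const __m128i *)(src + 1 * src_stride));
  const __m128i r2 = _mm_loadl_epi64((const __m128i *)(src + 2 * src_stride));
  const __m128i r3 = _mm_loadl_epi64((const __m128i *)(src + 3 * src_stride));
  const __m128i r4 = _mm_loadl_epi64((const __m128i *)(src + 4 * src_stride));
  const __m128i r5 = _mm_loadl_epi64((const __m128i *)(src + 5 * src_stride));
  const __m128i r6 = _mm_loadl_epi64((const __m128i *)(src + 6 * src_stride));
  const __m128i r7 = _mm_loadl_epi64((const __m128i *)(src + 7 * src_stride));
  // interleave adjacent rows so each 16-bit lane of s[i] holds the byte pair
  // multiplied by the duplicated tap pair in f[i]
  s[0] = _mm_unpacklo_epi8(r0, r1);
  s[1] = _mm_unpacklo_epi8(r2, r3);
  s[2] = _mm_unpacklo_epi8(r4, r5);
  s[3] = _mm_unpacklo_epi8(r6, r7);
  shuffle_filter_ssse3(filter, f);
  res = convolve8_8_ssse3(s, f);
  // saturate the rounded 16-bit results to 8 bits and store 8 output pixels
  res = _mm_packus_epi16(res, res);
  _mm_storel_epi64((__m128i *)dst, res);
}
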
#endif  // VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_