/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #ifndef VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_
12 #define VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_
13 
14 #include <emmintrin.h>
15  //#include "vpx/vpx_integer.h"
16 
// Builds a vector in which every 32-bit lane holds the 16-bit pair (a, b):
// `a` occupies the low 16 bits and `b` the high 16 bits, i.e. the eight
// 16-bit lanes read a, b, a, b, a, b, a, b.
// `a` is truncated to uint16_t so its sign bits cannot leak into the high
// half, and `b` is converted to uint32_t before the shift so a negative value
// is never left-shifted as a signed integer.
#define pair_set_epi16(a, b) \
  _mm_set1_epi32((int)((uint16_t)(a) | ((uint32_t)(b) << 16)))
19 
// Builds a vector of four 32-bit lanes that read, from lane 0 upward,
// a, b, a, b. Both arguments are converted to int.
// Each argument appears twice in the expansion, so do not pass expressions
// with side effects.
#define pair_set_epi32(a, b) \
  _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
22 
// Builds a vector of eight 16-bit lanes in which lanes 0..3 hold `a` and
// lanes 4..7 hold `b`. Both arguments are converted to int16_t.
// Each argument appears four times in the expansion, so do not pass
// expressions with side effects.
#define dual_set_epi16(a, b)                                            \
  _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
                (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
26 
// Builds a vector of eight 16-bit lanes that read, from lane 0 upward,
// a, b, c, d, e, f, g, h. Every argument is converted to int16_t.
#define octa_set_epi16(a, b, c, d, e, f, g, h)                           \
  _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
                 (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
30 
dct_const_round_shift_sse2(const __m128i in)31 static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) {
32     const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING));
33     return _mm_srai_epi32(t, DCT_CONST_BITS);
34 }
35 
idct_madd_round_shift_sse2(const __m128i in,const __m128i cospi)36 static INLINE __m128i idct_madd_round_shift_sse2(const __m128i in,
37     const __m128i cospi) {
38     const __m128i t = _mm_madd_epi16(in, cospi);
39     return dct_const_round_shift_sse2(t);
40 }
41 
42 // Calculate the dot product between in0/1 and x and wrap to short.
idct_calc_wraplow_sse2(const __m128i in0,const __m128i in1,const __m128i x)43 static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0,
44     const __m128i in1,
45     const __m128i x) {
46     const __m128i t0 = idct_madd_round_shift_sse2(in0, x);
47     const __m128i t1 = idct_madd_round_shift_sse2(in1, x);
48     return _mm_packs_epi32(t0, t1);
49 }
50 
51 // Multiply elements by constants and add them together.
butterfly(const __m128i in0,const __m128i in1,const int c0,const int c1,__m128i * const out0,__m128i * const out1)52 static INLINE void butterfly(const __m128i in0, const __m128i in1, const int c0,
53     const int c1, __m128i *const out0,
54     __m128i *const out1) {
55     const __m128i cst0 = pair_set_epi16(c0, -c1);
56     const __m128i cst1 = pair_set_epi16(c1, c0);
57     const __m128i lo = _mm_unpacklo_epi16(in0, in1);
58     const __m128i hi = _mm_unpackhi_epi16(in0, in1);
59     *out0 = idct_calc_wraplow_sse2(lo, hi, cst0);
60     *out1 = idct_calc_wraplow_sse2(lo, hi, cst1);
61 }
62 
recon_and_store_16(const __m128i in0,const __m128i in1,uint8_t * const dest)63 static INLINE void recon_and_store_16(const __m128i in0, const __m128i in1,
64     uint8_t *const dest) {
65     const __m128i zero = _mm_setzero_si128();
66     const __m128i d = _mm_loadu_si128((__m128i *)dest);
67     const __m128i d0 = _mm_unpacklo_epi8(d, zero);
68     const __m128i d1 = _mm_unpackhi_epi8(d, zero);
69     const __m128i d2 = _mm_add_epi16(in0, d0);
70     const __m128i d3 = _mm_add_epi16(in1, d1);
71     const __m128i dd = _mm_packus_epi16(d2, d3);
72     _mm_storeu_si128((__m128i *)dest, dd);
73 }
74 
75 #endif  // VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_
76