1 /*
2  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_
13 #define AOM_AOM_DSP_X86_MEM_SSE2_H_
14 
15 #include <emmintrin.h>  // SSE2
16 #include <string.h>
17 
18 #include "config/aom_config.h"
19 
20 #include "aom/aom_integer.h"
21 
// Loads a 16-bit value from a possibly unaligned address.
// memcpy sidesteps the undefined behavior of dereferencing a misaligned
// uint16_t pointer; compilers lower it to a single 16-bit load.
static INLINE uint16_t loadu_uint16(const void *src) {
  uint16_t value;
  memcpy(&value, src, sizeof(value));
  return value;
}
27 
// Loads a 32-bit value from a possibly unaligned address.
// memcpy sidesteps the undefined behavior of dereferencing a misaligned
// uint32_t pointer; compilers lower it to a single 32-bit load.
static INLINE uint32_t loadu_uint32(const void *src) {
  uint32_t value;
  memcpy(&value, src, sizeof(value));
  return value;
}
33 
// Loads a 64-bit value from a possibly unaligned address.
// memcpy sidesteps the undefined behavior of dereferencing a misaligned
// uint64_t pointer; compilers lower it to a single 64-bit load.
static INLINE uint64_t loadu_uint64(const void *src) {
  uint64_t value;
  memcpy(&value, src, sizeof(value));
  return value;
}
39 
// Stores the upper 64 bits of s to d; d need not be 16-byte aligned.
// Uses _mm_storeh_pd instead of _mm_storeh_pi so the MMX __m64 type is
// never involved; both compile to the same 8-byte store (movhpd/movhps).
// NOTE(review): identifiers starting with "_mm_" are reserved for the
// compiler's intrinsics; the name is kept because renaming would break
// existing callers throughout the codebase.
static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
  _mm_storeh_pd((double *)d, _mm_castsi128_pd(s));
}
43 
// Returns s with its upper 64 bits replaced by the 8 bytes at src (which
// may be unaligned); the lower 64 bits of s are preserved.
// Uses _mm_loadh_pd instead of _mm_loadh_pi so the MMX __m64 type is never
// involved; both compile to the same 8-byte load (movhpd/movhps).
static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
  return _mm_castpd_si128(
      _mm_loadh_pd(_mm_castsi128_pd(s), (const double *)src));
}
48 
// Gathers four 4-byte rows, byte_stride apart, into one 128-bit register;
// row 0 lands in the lowest 32-bit lane. Rows may be unaligned.
static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
                                                  const int byte_stride) {
  const int8_t *const base = (const int8_t *)src;
  return _mm_setr_epi32(loadu_uint32(base + 0 * byte_stride),
                        loadu_uint32(base + 1 * byte_stride),
                        loadu_uint32(base + 2 * byte_stride),
                        loadu_uint32(base + 3 * byte_stride));
}
56 
load_8bit_8x2_to_1_reg_sse2(const void * const src,const int byte_stride)57 static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
58                                                   const int byte_stride) {
59   __m128i dst;
60   dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
61   dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
62   return dst;
63 }
64 
// Scatters two 16-byte registers into four 8-byte rows, stride apart:
// the low/high halves of s[0] go to rows 0/1, those of s[1] to rows 2/3.
static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s,
                                            uint8_t *const d,
                                            const ptrdiff_t stride) {
  uint8_t *row = d;
  _mm_storel_epi64((__m128i *)row, s[0]);
  row += stride;
  _mm_storeh_epi64((__m128i *)row, s[0]);
  row += stride;
  _mm_storel_epi64((__m128i *)row, s[1]);
  row += stride;
  _mm_storeh_epi64((__m128i *)row, s[1]);
}
73 
// Stores the low 4 bytes of each of s[0..3] to four rows of d, stride apart.
// memcpy replaces the previous *(int *)(d + i * stride) stores: d plus an
// arbitrary stride has no guaranteed 4-byte alignment, and writing a uint8_t
// buffer through a casted int pointer is undefined behavior (misaligned
// access / strict aliasing). This matches the loadu_uint32() idiom used
// elsewhere in this file; compilers lower each fixed-size memcpy to a single
// 32-bit store.
static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
                                  const ptrdiff_t stride) {
  const int row0 = _mm_cvtsi128_si32(s[0]);
  const int row1 = _mm_cvtsi128_si32(s[1]);
  const int row2 = _mm_cvtsi128_si32(s[2]);
  const int row3 = _mm_cvtsi128_si32(s[3]);
  memcpy(d + 0 * stride, &row0, sizeof(row0));
  memcpy(d + 1 * stride, &row1, sizeof(row1));
  memcpy(d + 2 * stride, &row2, sizeof(row2));
  memcpy(d + 3 * stride, &row3, sizeof(row3));
}
81 
// Splits the four 32-bit lanes of s into separate registers (lane i moved
// to the low lane of lanes[i]) and stores each as a 4-byte row.
static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
                                       const ptrdiff_t stride) {
  const __m128i lanes[4] = { s, _mm_srli_si128(s, 4), _mm_srli_si128(s, 8),
                             _mm_srli_si128(s, 12) };
  store_8bit_4x4(lanes, d, stride);
}
92 
// Loads 4 bytes from each of four rows of s, stride apart, into the low 32
// bits of d[0..3] (upper bits zeroed by _mm_cvtsi32_si128).
// memcpy replaces the previous *(const int *)(s + i * stride) loads: s plus
// an arbitrary stride has no guaranteed 4-byte alignment, and reading a
// uint8_t buffer through a casted int pointer is undefined behavior
// (misaligned access / strict aliasing). This matches the loadu_uint32()
// idiom used elsewhere in this file; compilers lower each fixed-size memcpy
// to a single 32-bit load.
static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  int row0, row1, row2, row3;
  memcpy(&row0, s + 0 * stride, sizeof(row0));
  memcpy(&row1, s + 1 * stride, sizeof(row1));
  memcpy(&row2, s + 2 * stride, sizeof(row2));
  memcpy(&row3, s + 3 * stride, sizeof(row3));
  d[0] = _mm_cvtsi32_si128(row0);
  d[1] = _mm_cvtsi32_si128(row1);
  d[2] = _mm_cvtsi32_si128(row2);
  d[3] = _mm_cvtsi32_si128(row3);
}
100 
// Loads eight 4-byte rows, stride apart, into d[0..7] — the top four rows
// first, then the bottom four.
static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  load_8bit_4x4(s, stride, d);
  load_8bit_4x4(s + 4 * stride, stride, d + 4);
}
106 
// Loads 8 bytes from each of four rows of s, stride apart, into the low 64
// bits of d[0..3] (upper bits zeroed by _mm_loadl_epi64).
static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  for (int i = 0; i < 4; ++i) {
    d[i] = _mm_loadl_epi64((const __m128i *)(s + i * stride));
  }
}
114 
// Loads four full 16-byte rows of s, stride apart, into d[0..3] using
// unaligned loads (rows need not be 16-byte aligned).
static INLINE void loadu_8bit_16x4(const uint8_t *const s,
                                   const ptrdiff_t stride, __m128i *const d) {
  for (int i = 0; i < 4; ++i) {
    d[i] = _mm_loadu_si128((const __m128i *)(s + i * stride));
  }
}
122 
// Loads eight 8-byte rows, stride apart, into the low halves of d[0..7] —
// the top four rows first, then the bottom four.
static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  load_8bit_8x4(s, stride, d);
  load_8bit_8x4(s + 4 * stride, stride, d + 4);
}
128 
// Loads eight full 16-byte rows of s, stride apart, into d[0..7].
// Uses aligned loads: every row address (s + i * stride) must be 16-byte
// aligned, as with the original _mm_load_si128 calls.
static INLINE void load_8bit_16x8(const uint8_t *const s,
                                  const ptrdiff_t stride, __m128i *const d) {
  for (int i = 0; i < 8; ++i) {
    d[i] = _mm_load_si128((const __m128i *)(s + i * stride));
  }
}
140 
// Loads eight full 16-byte rows, stride apart, into d[0..7] using unaligned
// loads — the top four rows first, then the bottom four.
static INLINE void loadu_8bit_16x8(const uint8_t *const s,
                                   const ptrdiff_t stride, __m128i *const d) {
  loadu_8bit_16x4(s, stride, d);
  loadu_8bit_16x4(s + 4 * stride, stride, d + 4);
}
146 
// Stores the low 8 bytes of each of s[0..7] to eight rows of d, stride
// apart. Rows need not be aligned.
static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
                                  const ptrdiff_t stride) {
  for (int i = 0; i < 8; ++i) {
    _mm_storel_epi64((__m128i *)(d + i * stride), s[i]);
  }
}
158 
// Stores four full 16-byte registers s[0..3] to four rows of d, stride
// apart, using unaligned stores (rows need not be 16-byte aligned).
static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
                                    const ptrdiff_t stride) {
  for (int i = 0; i < 4; ++i) {
    _mm_storeu_si128((__m128i *)(d + i * stride), s[i]);
  }
}
166 
167 #endif  // AOM_AOM_DSP_X86_MEM_SSE2_H_
168