/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_
#define AOM_AOM_DSP_X86_MEM_SSE2_H_

#include <emmintrin.h>  // SSE2
#include <string.h>

#include "config/aom_config.h"

#include "aom/aom_integer.h"

loadu_uint16(const void * src)22 static INLINE uint16_t loadu_uint16(const void *src) {
23 uint16_t v;
24 memcpy(&v, src, sizeof(v));
25 return v;
26 }
27
loadu_uint32(const void * src)28 static INLINE uint32_t loadu_uint32(const void *src) {
29 uint32_t v;
30 memcpy(&v, src, sizeof(v));
31 return v;
32 }
33
loadu_uint64(const void * src)34 static INLINE uint64_t loadu_uint64(const void *src) {
35 uint64_t v;
36 memcpy(&v, src, sizeof(v));
37 return v;
38 }
39
_mm_storeh_epi64(__m128i * const d,const __m128i s)40 static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
41 _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
42 }
43
loadh_epi64(const void * const src,const __m128i s)44 static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
45 return _mm_castps_si128(
46 _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
47 }
48
load_8bit_4x4_to_1_reg_sse2(const void * const src,const int byte_stride)49 static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
50 const int byte_stride) {
51 return _mm_setr_epi32(loadu_uint32((int8_t *)src + 0 * byte_stride),
52 loadu_uint32((int8_t *)src + 1 * byte_stride),
53 loadu_uint32((int8_t *)src + 2 * byte_stride),
54 loadu_uint32((int8_t *)src + 3 * byte_stride));
55 }
56
load_8bit_8x2_to_1_reg_sse2(const void * const src,const int byte_stride)57 static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
58 const int byte_stride) {
59 __m128i dst;
60 dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
61 dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
62 return dst;
63 }
64
store_8bit_8x4_from_16x2(const __m128i * const s,uint8_t * const d,const ptrdiff_t stride)65 static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s,
66 uint8_t *const d,
67 const ptrdiff_t stride) {
68 _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
69 _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]);
70 _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]);
71 _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]);
72 }
73
store_8bit_4x4(const __m128i * const s,uint8_t * const d,const ptrdiff_t stride)74 static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
75 const ptrdiff_t stride) {
76 *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]);
77 *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]);
78 *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]);
79 *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]);
80 }
81
store_8bit_4x4_sse2(const __m128i s,uint8_t * const d,const ptrdiff_t stride)82 static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
83 const ptrdiff_t stride) {
84 __m128i ss[4];
85
86 ss[0] = s;
87 ss[1] = _mm_srli_si128(s, 4);
88 ss[2] = _mm_srli_si128(s, 8);
89 ss[3] = _mm_srli_si128(s, 12);
90 store_8bit_4x4(ss, d, stride);
91 }
92
load_8bit_4x4(const uint8_t * const s,const ptrdiff_t stride,__m128i * const d)93 static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
94 __m128i *const d) {
95 d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride));
96 d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride));
97 d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride));
98 d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride));
99 }
100
load_8bit_4x8(const uint8_t * const s,const ptrdiff_t stride,__m128i * const d)101 static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride,
102 __m128i *const d) {
103 load_8bit_4x4(s + 0 * stride, stride, &d[0]);
104 load_8bit_4x4(s + 4 * stride, stride, &d[4]);
105 }
106
load_8bit_8x4(const uint8_t * const s,const ptrdiff_t stride,__m128i * const d)107 static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride,
108 __m128i *const d) {
109 d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride));
110 d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride));
111 d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride));
112 d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride));
113 }
114
loadu_8bit_16x4(const uint8_t * const s,const ptrdiff_t stride,__m128i * const d)115 static INLINE void loadu_8bit_16x4(const uint8_t *const s,
116 const ptrdiff_t stride, __m128i *const d) {
117 d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride));
118 d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride));
119 d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride));
120 d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride));
121 }
122
load_8bit_8x8(const uint8_t * const s,const ptrdiff_t stride,__m128i * const d)123 static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride,
124 __m128i *const d) {
125 load_8bit_8x4(s + 0 * stride, stride, &d[0]);
126 load_8bit_8x4(s + 4 * stride, stride, &d[4]);
127 }
128
load_8bit_16x8(const uint8_t * const s,const ptrdiff_t stride,__m128i * const d)129 static INLINE void load_8bit_16x8(const uint8_t *const s,
130 const ptrdiff_t stride, __m128i *const d) {
131 d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride));
132 d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride));
133 d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride));
134 d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride));
135 d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride));
136 d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride));
137 d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride));
138 d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride));
139 }
140
loadu_8bit_16x8(const uint8_t * const s,const ptrdiff_t stride,__m128i * const d)141 static INLINE void loadu_8bit_16x8(const uint8_t *const s,
142 const ptrdiff_t stride, __m128i *const d) {
143 loadu_8bit_16x4(s + 0 * stride, stride, &d[0]);
144 loadu_8bit_16x4(s + 4 * stride, stride, &d[4]);
145 }
146
store_8bit_8x8(const __m128i * const s,uint8_t * const d,const ptrdiff_t stride)147 static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
148 const ptrdiff_t stride) {
149 _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
150 _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]);
151 _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]);
152 _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]);
153 _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]);
154 _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]);
155 _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]);
156 _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
157 }
158
storeu_8bit_16x4(const __m128i * const s,uint8_t * const d,const ptrdiff_t stride)159 static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
160 const ptrdiff_t stride) {
161 _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
162 _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
163 _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
164 _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
165 }
166
#endif  // AOM_AOM_DSP_X86_MEM_SSE2_H_