1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10 #include <immintrin.h> // AVX2
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx/vpx_integer.h"
13
calc_final_4(const __m256i * const sums,uint32_t * sad_array)14 static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
15 uint32_t *sad_array) {
16 const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
17 const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
18 const __m256i t2 = _mm256_hadd_epi32(t0, t1);
19 const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
20 _mm256_extractf128_si256(t2, 1));
21 _mm_storeu_si128((__m128i *)sad_array, sum);
22 }
23
// Sum of absolute differences of one 32x32 source block against four 32x32
// reference blocks.
//
// src_ptr / src_stride:  source block. Each row is read with an aligned
//                        256-bit load, so every row address must be 32-byte
//                        aligned.
// ref_array / ref_stride: four reference block pointers sharing one stride;
//                        rows are read with unaligned loads.
// sad_array:             receives the four totals, sad_array[k] for
//                        ref_array[k].
void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]) {
  const uint8_t *ref_row[4];
  __m256i acc[4];
  int row, k;

  for (k = 0; k < 4; ++k) {
    ref_row[k] = ref_array[k];
    acc[k] = _mm256_setzero_si256();
  }

  for (row = 0; row < 32; ++row) {
    // One full 32-byte source row, shared by all four references.
    const __m256i src = _mm256_load_si256((const __m256i *)src_ptr);

    for (k = 0; k < 4; ++k) {
      const __m256i ref = _mm256_loadu_si256((const __m256i *)ref_row[k]);
      // The SAD result has one small count per 64-bit lane (at most 8 * 255),
      // so accumulating with 32-bit adds never carries across elements.
      acc[k] = _mm256_add_epi32(acc[k], _mm256_sad_epu8(ref, src));
      ref_row[k] += ref_stride;
    }

    src_ptr += src_stride;
  }

  calc_final_4(acc, sad_array);
}
71
// Sum of absolute differences of one 32x32 source block against eight 32x32
// reference blocks located at consecutive byte offsets ref_ptr + 0 through
// ref_ptr + 7.
//
// src_ptr / src_stride: source block. Each row is read with an aligned
//                       256-bit load, so every row address must be 32-byte
//                       aligned.
// ref_ptr / ref_stride: first reference position; rows are read with
//                       unaligned loads.
// sad_array:            receives eight totals, sad_array[k] for offset k.
void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *ref_ptr, int ref_stride,
                         uint32_t *sad_array) {
  __m256i acc[8];
  int row, k;

  for (k = 0; k < 8; ++k) {
    acc[k] = _mm256_setzero_si256();
  }

  for (row = 0; row < 32; ++row) {
    // One full 32-byte source row, shared by all eight candidate offsets.
    const __m256i src = _mm256_load_si256((const __m256i *)src_ptr);

    for (k = 0; k < 8; ++k) {
      const __m256i ref = _mm256_loadu_si256((const __m256i *)(ref_ptr + k));
      // The SAD result has one small count per 64-bit lane (at most 8 * 255),
      // so accumulating with 32-bit adds never carries across elements.
      acc[k] = _mm256_add_epi32(acc[k], _mm256_sad_epu8(ref, src));
    }

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }

  // The reduction helper handles four accumulators per call.
  calc_final_4(acc, sad_array);
  calc_final_4(acc + 4, sad_array + 4);
}
128
// Sum of absolute differences of one 64x64 source block against four 64x64
// reference blocks.
//
// src_ptr / src_stride:  source block. Each row is read as two aligned
//                        256-bit loads (bytes 0..31 and 32..63), so every row
//                        address must be 32-byte aligned.
// ref_array / ref_stride: four reference block pointers sharing one stride;
//                        rows are read with unaligned loads.
// sad_array:             receives the four totals, sad_array[k] for
//                        ref_array[k].
void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]) {
  const uint8_t *ref_row[4];
  __m256i acc[4];
  int row, k;

  for (k = 0; k < 4; ++k) {
    ref_row[k] = ref_array[k];
    acc[k] = _mm256_setzero_si256();
  }

  for (row = 0; row < 64; ++row) {
    // A 64-byte source row is split into a low and a high 32-byte half.
    const __m256i src_lo = _mm256_load_si256((const __m256i *)src_ptr);
    const __m256i src_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32));

    for (k = 0; k < 4; ++k) {
      const __m256i ref_lo = _mm256_loadu_si256((const __m256i *)ref_row[k]);
      const __m256i ref_hi =
          _mm256_loadu_si256((const __m256i *)(ref_row[k] + 32));
      // Both halves feed the same accumulator. Each SAD result has one small
      // count per 64-bit lane (at most 8 * 255), so 32-bit adds never carry
      // across elements over the 64 rows.
      acc[k] = _mm256_add_epi32(acc[k], _mm256_sad_epu8(ref_lo, src_lo));
      acc[k] = _mm256_add_epi32(acc[k], _mm256_sad_epu8(ref_hi, src_hi));
      ref_row[k] += ref_stride;
    }

    src_ptr += src_stride;
  }

  calc_final_4(acc, sad_array);
}
188