/*****************************************************************************
 * This file is part of Kvazaar HEVC encoder.
 *
 * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice, this
 *   list of conditions and the following disclaimer in the documentation and/or
 *   other materials provided with the distribution.
 *
 * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ****************************************************************************/

#ifndef REG_SAD_POW2_WIDTHS_SSE41_H_
#define REG_SAD_POW2_WIDTHS_SSE41_H_

#include "kvazaar.h"

#if KVZ_BIT_DEPTH == 8

#include "strategies/missing-intel-intrinsics.h"
#include <immintrin.h>

static INLINE uint32_t reg_sad_w0(const uint8_t * const data1, const uint8_t * const data2,
                                  const int32_t height, const uint32_t stride1,
                                  const uint32_t stride2)
{
  return 0;
}

static INLINE uint32_t reg_sad_w4(const uint8_t * const data1, const uint8_t * const data2,
                                  const int32_t height, const uint32_t stride1,
                                  const uint32_t stride2)
{
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(data1 + y * stride1));
    __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(data2 + y * stride2));

    a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 1) * stride1), 1);
    b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 1) * stride2), 1);
    a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 2) * stride1), 2);
    b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 2) * stride2), 2);
    a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 3) * stride1), 3);
    b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 3) * stride2), 3);

    __m128i curr_sads = _mm_sad_epu8(a, b);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(data1 + y * stride1));
      __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(data2 + y * stride2));

      __m128i curr_sads = _mm_sad_epu8(a, b);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

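/*
 * Illustrative only: a minimal scalar sketch of the quantity every reg_sad_*
 * kernel in this file computes, i.e. the sum of absolute differences (SAD)
 * over a width x height block. The name reg_sad_scalar_sketch is hypothetical
 * and not part of the Kvazaar strategy API; it merely assumes the same
 * pointer and stride conventions as the vectorized routines around it.
 */
static INLINE uint32_t reg_sad_scalar_sketch(const uint8_t * const data1, const uint8_t * const data2,
                                             const int32_t width, const int32_t height,
                                             const uint32_t stride1, const uint32_t stride2)
{
  uint32_t sad = 0;
  int32_t x, y;
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) {
      const int32_t diff = (int32_t)data1[y * stride1 + x] - (int32_t)data2[y * stride2 + x];
      sad += (uint32_t)(diff < 0 ? -diff : diff);
    }
  }
  return sad;
}
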
static INLINE uint32_t reg_sad_w8(const uint8_t * const data1, const uint8_t * const data2,
                                  const int32_t height, const uint32_t stride1,
                                  const uint32_t stride2)
{
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128d a_d = _mm_setzero_pd();
    __m128d b_d = _mm_setzero_pd();
    __m128d c_d = _mm_setzero_pd();
    __m128d d_d = _mm_setzero_pd();

    a_d = _mm_loadl_pd(a_d, (const double *)(data1 + (y + 0) * stride1));
    b_d = _mm_loadl_pd(b_d, (const double *)(data2 + (y + 0) * stride2));
    a_d = _mm_loadh_pd(a_d, (const double *)(data1 + (y + 1) * stride1));
    b_d = _mm_loadh_pd(b_d, (const double *)(data2 + (y + 1) * stride2));

    c_d = _mm_loadl_pd(c_d, (const double *)(data1 + (y + 2) * stride1));
    d_d = _mm_loadl_pd(d_d, (const double *)(data2 + (y + 2) * stride2));
    c_d = _mm_loadh_pd(c_d, (const double *)(data1 + (y + 3) * stride1));
    d_d = _mm_loadh_pd(d_d, (const double *)(data2 + (y + 3) * stride2));

    __m128i a = _mm_castpd_si128(a_d);
    __m128i b = _mm_castpd_si128(b_d);
    __m128i c = _mm_castpd_si128(c_d);
    __m128i d = _mm_castpd_si128(d_d);

    __m128i curr_sads_ab = _mm_sad_epu8(a, b);
    __m128i curr_sads_cd = _mm_sad_epu8(c, d);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i a = _mm_loadl_epi64((__m128i *)(data1 + y * stride1));
      __m128i b = _mm_loadl_epi64((__m128i *)(data2 + y * stride2));

      __m128i curr_sads_ab = _mm_sad_epu8(a, b);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

static INLINE uint32_t reg_sad_w12(const uint8_t * const data1, const uint8_t * const data2,
                                   const int32_t height, const uint32_t stride1,
                                   const uint32_t stride2)
{
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;
  for (y = 0; y < height; y++) {
    __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1));
    __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2));

    // Blend mask 0x3f takes the low six 16-bit lanes (12 bytes) from b and
    // the high 4 bytes from a, so the bytes beyond the 12-pixel row are
    // identical in both SAD operands and contribute nothing.
    __m128i b_masked  = _mm_blend_epi16(a, b, 0x3f);
    __m128i curr_sads = _mm_sad_epu8   (a, b_masked);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

static INLINE uint32_t reg_sad_w16(const uint8_t * const data1, const uint8_t * const data2,
                                   const int32_t height, const uint32_t stride1,
                                   const uint32_t stride2)
{
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1));
    __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2));
    __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1));
    __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2));
    __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1));
    __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2));
    __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1));
    __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2));

    __m128i curr_sads_ab = _mm_sad_epu8(a, b);
    __m128i curr_sads_cd = _mm_sad_epu8(c, d);
    __m128i curr_sads_ef = _mm_sad_epu8(e, f);
    __m128i curr_sads_gh = _mm_sad_epu8(g, h);

    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1));
      __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2));

      __m128i curr_sads = _mm_sad_epu8(a, b);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
    }
  }

  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

static INLINE uint32_t reg_sad_w24(const uint8_t * const data1, const uint8_t * const data2,
                                   const int32_t height, const uint32_t stride1,
                                   const uint32_t stride2)
{
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_doublelines = height & ~1;
  const int32_t height_parity      = height &  1;

  for (y = 0; y < height_doublelines; y += 2) {
    __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1));
    __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2));
    __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1));
    __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2));

    __m128d e_d = _mm_setzero_pd();
    __m128d f_d = _mm_setzero_pd();

    e_d = _mm_loadl_pd(e_d, (const double *)(data1 + (y + 0) * stride1 + 16));
    f_d = _mm_loadl_pd(f_d, (const double *)(data2 + (y + 0) * stride2 + 16));
    e_d = _mm_loadh_pd(e_d, (const double *)(data1 + (y + 1) * stride1 + 16));
    f_d = _mm_loadh_pd(f_d, (const double *)(data2 + (y + 1) * stride2 + 16));

    __m128i e = _mm_castpd_si128(e_d);
    __m128i f = _mm_castpd_si128(f_d);

    __m128i curr_sads_1 = _mm_sad_epu8(a, b);
    __m128i curr_sads_2 = _mm_sad_epu8(c, d);
    __m128i curr_sads_3 = _mm_sad_epu8(e, f);

    sse_inc = _mm_add_epi64(sse_inc, curr_sads_1);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_2);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_3);
  }
  if (height_parity) {
    __m128i a = _mm_loadu_si128 ((const __m128i *)(data1 + y * stride1));
    __m128i b = _mm_loadu_si128 ((const __m128i *)(data2 + y * stride2));
    __m128i c = _mm_loadl_epi64 ((const __m128i *)(data1 + y * stride1 + 16));
    __m128i d = _mm_loadl_epi64 ((const __m128i *)(data2 + y * stride2 + 16));

    __m128i curr_sads_1 = _mm_sad_epu8(a, b);
    __m128i curr_sads_2 = _mm_sad_epu8(c, d);

    sse_inc = _mm_add_epi64(sse_inc, curr_sads_1);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_2);
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

static INLINE uint32_t reg_sad_arbitrary(const uint8_t * const data1, const uint8_t * const data2,
                                         const int32_t width, const int32_t height, const uint32_t stride1,
                                         const uint32_t stride2)
{
  int32_t y, x;
  __m128i sse_inc = _mm_setzero_si128();

  // Bytes of each scanline covered by whole 128-bit vectors, and the remaining pixels
  const int32_t width_xmms            = width & ~15;
  const int32_t width_residual_pixels = width &  15;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  const __m128i rds    = _mm_set1_epi8 (width_residual_pixels);
  const __m128i ns     = _mm_setr_epi8 (0, 1, 2,  3,  4,  5,  6,  7,
                                        8, 9, 10, 11, 12, 13, 14, 15);
  const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);

  for (x = 0; x < width_xmms; x += 16) {
    for (y = 0; y < height_fourline_groups; y += 4) {
      __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1 + x));
      __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2 + x));
      __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1 + x));
      __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2 + x));
      __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1 + x));
      __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2 + x));
      __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1 + x));
      __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2 + x));

      __m128i curr_sads_ab = _mm_sad_epu8(a, b);
      __m128i curr_sads_cd = _mm_sad_epu8(c, d);
      __m128i curr_sads_ef = _mm_sad_epu8(e, f);
      __m128i curr_sads_gh = _mm_sad_epu8(g, h);

      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
    }
    if (height_residual_lines) {
      for (; y < height; y++) {
        __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1 + x));
        __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2 + x));

        __m128i curr_sads = _mm_sad_epu8(a, b);

        sse_inc = _mm_add_epi64(sse_inc, curr_sads);
      }
    }
  }

  if (width_residual_pixels) {
    for (y = 0; y < height_fourline_groups; y += 4) {
      __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1 + x));
      __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2 + x));
      __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1 + x));
      __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2 + x));
      __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1 + x));
      __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2 + x));
      __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1 + x));
      __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2 + x));

      __m128i b_masked = _mm_blendv_epi8(a, b, rdmask);
      __m128i d_masked = _mm_blendv_epi8(c, d, rdmask);
      __m128i f_masked = _mm_blendv_epi8(e, f, rdmask);
      __m128i h_masked = _mm_blendv_epi8(g, h, rdmask);

      __m128i curr_sads_ab = _mm_sad_epu8 (a, b_masked);
      __m128i curr_sads_cd = _mm_sad_epu8 (c, d_masked);
      __m128i curr_sads_ef = _mm_sad_epu8 (e, f_masked);
      __m128i curr_sads_gh = _mm_sad_epu8 (g, h_masked);

      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
    }
    if (height_residual_lines) {
      for (; y < height; y++) {
        __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1 + x));
        __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2 + x));

        __m128i b_masked  = _mm_blendv_epi8(a, b, rdmask);
        __m128i curr_sads = _mm_sad_epu8   (a, b_masked);

        sse_inc = _mm_add_epi64(sse_inc, curr_sads);
      }
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

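/*
 * Hypothetical usage sketch, not part of this header's API: callers normally
 * pick one of the fixed-width kernels above by block width and fall back to
 * reg_sad_arbitrary otherwise. The name reg_sad_sse41_sketch is illustrative
 * only; the real selection logic lives in the strategy sources that include
 * this header.
 */
static INLINE uint32_t reg_sad_sse41_sketch(const uint8_t * const data1, const uint8_t * const data2,
                                            const int32_t width, const int32_t height,
                                            const uint32_t stride1, const uint32_t stride2)
{
  if (width == 0)
    return reg_sad_w0 (data1, data2, height, stride1, stride2);
  if (width == 4)
    return reg_sad_w4 (data1, data2, height, stride1, stride2);
  if (width == 8)
    return reg_sad_w8 (data1, data2, height, stride1, stride2);
  if (width == 12)
    return reg_sad_w12(data1, data2, height, stride1, stride2);
  if (width == 16)
    return reg_sad_w16(data1, data2, height, stride1, stride2);
  if (width == 24)
    return reg_sad_w24(data1, data2, height, stride1, stride2);
  return reg_sad_arbitrary(data1, data2, width, height, stride1, stride2);
}
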
// The ver_sad_* routines compare every row of pic_data against the single
// reference row pointed to by ref_data, ie. a reference block that consists
// of one row repeated vertically (as happens when a reference block is
// extrapolated across the top or bottom frame border).
static uint32_t ver_sad_w4(const uint8_t *pic_data, const uint8_t *ref_data,
                           int32_t height, uint32_t stride)
{
  __m128i ref_row = _mm_set1_epi32(*(const uint32_t *)ref_data);
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * stride));

    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 1) * stride), 1);
    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 2) * stride), 2);
    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 3) * stride), 3);

    __m128i curr_sads = _mm_sad_epu8(a, ref_row);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
  }
  if (height_residual_lines) {
    // Only pick the last dword, because we're comparing single dwords (lines)
    ref_row = _mm_bsrli_si128(ref_row, 12);

    for (; y < height; y++) {
      __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * stride));

      __m128i curr_sads = _mm_sad_epu8(a, ref_row);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

static uint32_t ver_sad_w8(const uint8_t *pic_data, const uint8_t *ref_data,
                           int32_t height, uint32_t stride)
{
  const __m128i ref_row = _mm_set1_epi64x(*(const uint64_t *)ref_data);
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128d a_d = _mm_setzero_pd();
    __m128d c_d = _mm_setzero_pd();

    a_d = _mm_loadl_pd(a_d, (const double *)(pic_data + (y + 0) * stride));
    a_d = _mm_loadh_pd(a_d, (const double *)(pic_data + (y + 1) * stride));

    c_d = _mm_loadl_pd(c_d, (const double *)(pic_data + (y + 2) * stride));
    c_d = _mm_loadh_pd(c_d, (const double *)(pic_data + (y + 3) * stride));

    __m128i a = _mm_castpd_si128(a_d);
    __m128i c = _mm_castpd_si128(c_d);

    __m128i curr_sads_ab = _mm_sad_epu8(a, ref_row);
    __m128i curr_sads_cd = _mm_sad_epu8(c, ref_row);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
  }
  if (height_residual_lines) {
    __m128i b = _mm_move_epi64(ref_row);

    for (; y < height; y++) {
      __m128i a = _mm_loadl_epi64((__m128i *)(pic_data + y * stride));

      __m128i curr_sads_ab = _mm_sad_epu8(a, b);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

static uint32_t ver_sad_w12(const uint8_t *pic_data, const uint8_t *ref_data,
                            int32_t height, uint32_t stride)
{
  const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data);
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  for (y = 0; y < height; y++) {
    __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride));

    // Same 12-byte masking trick as in reg_sad_w12
    __m128i a_masked  = _mm_blend_epi16(ref_row, a, 0x3f);
    __m128i curr_sads = _mm_sad_epu8   (ref_row, a_masked);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

static uint32_t ver_sad_w16(const uint8_t *pic_data, const uint8_t *ref_data,
                            int32_t height, uint32_t stride)
{
  const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data);
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i pic_row_1 = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride));
    __m128i pic_row_2 = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * stride));
    __m128i pic_row_3 = _mm_loadu_si128((__m128i *)(pic_data + (y + 2) * stride));
    __m128i pic_row_4 = _mm_loadu_si128((__m128i *)(pic_data + (y + 3) * stride));

    __m128i curr_sads_1 = _mm_sad_epu8 (pic_row_1, ref_row);
    __m128i curr_sads_2 = _mm_sad_epu8 (pic_row_2, ref_row);
    __m128i curr_sads_3 = _mm_sad_epu8 (pic_row_3, ref_row);
    __m128i curr_sads_4 = _mm_sad_epu8 (pic_row_4, ref_row);

    sse_inc = _mm_add_epi64(sse_inc, curr_sads_1);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_2);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_3);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_4);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i pic_row   = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride));
      __m128i curr_sads = _mm_sad_epu8   (pic_row, ref_row);

      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

static uint32_t ver_sad_arbitrary(const uint8_t *pic_data, const uint8_t *ref_data,
                                  int32_t width, int32_t height, uint32_t stride)
{
  int32_t y, x;
  __m128i sse_inc = _mm_setzero_si128();

  // Bytes of each scanline covered by whole 128-bit vectors, and the remaining pixels
  const int32_t width_xmms            = width & ~15;
  const int32_t width_residual_pixels = width &  15;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  const __m128i rds    = _mm_set1_epi8 (width_residual_pixels);
  const __m128i ns     = _mm_setr_epi8 (0, 1, 2,  3,  4,  5,  6,  7,
                                        8, 9, 10, 11, 12, 13, 14, 15);
  const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);

  for (x = 0; x < width_xmms; x += 16) {
    const __m128i ref_row = _mm_loadu_si128((__m128i *)(ref_data + x));
    for (y = 0; y < height_fourline_groups; y += 4) {
      __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x));
      __m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x));
      __m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x));
      __m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x));

      __m128i curr_sads_ab = _mm_sad_epu8(ref_row, a);
      __m128i curr_sads_cd = _mm_sad_epu8(ref_row, c);
      __m128i curr_sads_ef = _mm_sad_epu8(ref_row, e);
      __m128i curr_sads_gh = _mm_sad_epu8(ref_row, g);

      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
    }
    if (height_residual_lines) {
      for (; y < height; y++) {
        __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x));

        __m128i curr_sads = _mm_sad_epu8(a, ref_row);

        sse_inc = _mm_add_epi64(sse_inc, curr_sads);
      }
    }
  }

  if (width_residual_pixels) {
    const __m128i ref_row = _mm_loadu_si128((__m128i *)(ref_data + x));
    for (y = 0; y < height_fourline_groups; y += 4) {
      __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x));
      __m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x));
      __m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x));
      __m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x));

      __m128i a_masked = _mm_blendv_epi8(ref_row, a, rdmask);
      __m128i c_masked = _mm_blendv_epi8(ref_row, c, rdmask);
      __m128i e_masked = _mm_blendv_epi8(ref_row, e, rdmask);
      __m128i g_masked = _mm_blendv_epi8(ref_row, g, rdmask);

      __m128i curr_sads_ab = _mm_sad_epu8 (ref_row, a_masked);
      __m128i curr_sads_cd = _mm_sad_epu8 (ref_row, c_masked);
      __m128i curr_sads_ef = _mm_sad_epu8 (ref_row, e_masked);
      __m128i curr_sads_gh = _mm_sad_epu8 (ref_row, g_masked);

      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
    }
    if (height_residual_lines) {
      for (; y < height; y++) {
        __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x));

        __m128i a_masked  = _mm_blendv_epi8(ref_row, a, rdmask);
        __m128i curr_sads = _mm_sad_epu8   (ref_row, a_masked);

        sse_inc = _mm_add_epi64(sse_inc, curr_sads);
      }
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

static uint32_t hor_sad_sse41_w4(const uint8_t *pic_data, const uint8_t *ref_data,
                                 int32_t height, uint32_t pic_stride, uint32_t ref_stride,
                                 uint32_t left, uint32_t right)
{
  const int32_t right_border_idx = 3 - right;
  const int32_t border_idx       = left ? left : right_border_idx;

  const __m128i ns = _mm_setr_epi8(0, 1, 2,  3,  4,  5,  6,  7,
                                   8, 9, 10, 11, 12, 13, 14, 15);

  const int32_t border_idx_negative = border_idx >> 31;
  const int32_t leftoff             = border_idx_negative | left;

  // Dualword (ie. line) base indexes, ie. the edges the lines read will be
  // clamped towards
  const __m128i dwbaseids = _mm_setr_epi8(0, 0, 0, 0, 4,  4,  4,  4,
                                          8, 8, 8, 8, 12, 12, 12, 12);

  __m128i right_border_idxs = _mm_set1_epi8((int8_t)right_border_idx);
  __m128i left_128          = _mm_set1_epi8((int8_t)left);

  right_border_idxs = _mm_add_epi8(right_border_idxs, dwbaseids);

  __m128i mask_right = _mm_min_epi8(ns, right_border_idxs);
  __m128i mask1      = _mm_sub_epi8(mask_right, left_128);

  const __m128i epol_mask = _mm_max_epi8(mask1, dwbaseids);

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;
  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * pic_stride));
    __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(ref_data + y * ref_stride + leftoff));

    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 1) * pic_stride), 1);
    b = _mm_insert_epi32(b, *(const uint32_t *)(ref_data + (y + 1) * ref_stride + leftoff), 1);
    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 2) * pic_stride), 2);
    b = _mm_insert_epi32(b, *(const uint32_t *)(ref_data + (y + 2) * ref_stride + leftoff), 2);
    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 3) * pic_stride), 3);
    b = _mm_insert_epi32(b, *(const uint32_t *)(ref_data + (y + 3) * ref_stride + leftoff), 3);

    __m128i b_epol    = _mm_shuffle_epi8(b, epol_mask);
    __m128i curr_sads = _mm_sad_epu8    (a, b_epol);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * pic_stride));
      __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(ref_data + y * ref_stride + leftoff));

      __m128i b_epol    = _mm_shuffle_epi8(b, epol_mask);
      __m128i curr_sads = _mm_sad_epu8    (a, b_epol);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

static uint32_t hor_sad_sse41_w8(const uint8_t *pic_data, const uint8_t *ref_data,
                                 int32_t height, uint32_t pic_stride, uint32_t ref_stride,
                                 uint32_t left, uint32_t right)
{
  // right is the number of overhanging pixels in the vector, so it has to be
  // handled this way to produce the index of last valid (border) pixel
  const int32_t right_border_idx = 7 - right;
  const int32_t border_idx       = left ? left : right_border_idx;

  const __m128i ns = _mm_setr_epi8(0, 1, 2,  3,  4,  5,  6,  7,
                                   8, 9, 10, 11, 12, 13, 14, 15);

  // Quadword (ie. line) base indexes, ie. the edges the lines read will be
  // clamped towards; higher qword (lower line) bytes tend towards 8 and lower
  // qword (higher line) bytes towards 0
  const __m128i qwbaseids = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                                          8, 8, 8, 8, 8, 8, 8, 8);

  // Dirty hack alert! If right == block_width (ie. the entire vector is
  // outside the frame), move the block offset one pixel to the left (so
  // that the leftmost pixel in vector is actually the valid border pixel
  // from which we want to extrapolate), and use an epol mask that will
  // simply stretch the pixel all over the vector.
  //
  // To avoid a branch here:
  // The mask will be -1 (0xffffffff) for border_idx -1 and 0 for >= 0
  const int32_t border_idx_negative = border_idx >> 31;
  const int32_t leftoff             = border_idx_negative | left;

  __m128i right_border_idxs = _mm_set1_epi8((int8_t)right_border_idx);
  __m128i left_128          = _mm_set1_epi8((int8_t)left);

  right_border_idxs = _mm_add_epi8(right_border_idxs, qwbaseids);

  // If we're straddling the left border, right_border_idx is 7 and the first
  // operation does nothing. If right border, left is 0 and the second
  // operation does nothing.
  __m128i mask_right = _mm_min_epi8(ns, right_border_idxs);
  __m128i mask1      = _mm_sub_epi8(mask_right, left_128);

  // If right == 8 (we're completely outside the frame), right_border_idx is
  // -1 and so is mask1. Clamp negative values to qwbaseid and as discussed
  // earlier, adjust the load offset instead to load the "-1'st" pixels and
  // using qwbaseids as the shuffle mask, broadcast it all over the rows.
  const __m128i epol_mask = _mm_max_epi8(mask1, qwbaseids);

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;
  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128d a_d = _mm_setzero_pd();
    __m128d b_d = _mm_setzero_pd();
    __m128d c_d = _mm_setzero_pd();
    __m128d d_d = _mm_setzero_pd();

    a_d = _mm_loadl_pd(a_d, (const double *)(pic_data + (y + 0) * pic_stride));
    b_d = _mm_loadl_pd(b_d, (const double *)(ref_data + (y + 0) * ref_stride + leftoff));
    a_d = _mm_loadh_pd(a_d, (const double *)(pic_data + (y + 1) * pic_stride));
    b_d = _mm_loadh_pd(b_d, (const double *)(ref_data + (y + 1) * ref_stride + leftoff));

    c_d = _mm_loadl_pd(c_d, (const double *)(pic_data + (y + 2) * pic_stride));
    d_d = _mm_loadl_pd(d_d, (const double *)(ref_data + (y + 2) * ref_stride + leftoff));
    c_d = _mm_loadh_pd(c_d, (const double *)(pic_data + (y + 3) * pic_stride));
    d_d = _mm_loadh_pd(d_d, (const double *)(ref_data + (y + 3) * ref_stride + leftoff));

    __m128i a = _mm_castpd_si128(a_d);
    __m128i b = _mm_castpd_si128(b_d);
    __m128i c = _mm_castpd_si128(c_d);
    __m128i d = _mm_castpd_si128(d_d);

    __m128i b_epol = _mm_shuffle_epi8(b, epol_mask);
    __m128i d_epol = _mm_shuffle_epi8(d, epol_mask);

    __m128i curr_sads_ab = _mm_sad_epu8(a, b_epol);
    __m128i curr_sads_cd = _mm_sad_epu8(c, d_epol);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i a = _mm_loadl_epi64((__m128i *)(pic_data + y * pic_stride));
      __m128i b = _mm_loadl_epi64((__m128i *)(ref_data + y * ref_stride + leftoff));

      __m128i b_epol = _mm_shuffle_epi8(b, epol_mask);

      __m128i curr_sads_ab = _mm_sad_epu8(a, b_epol);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

/*
 * left and right measure how many pixels of one horizontal scanline will be
 * outside either the left or the right screen border. For blocks straddling
 * the left border, read the scanlines starting from the left border instead,
 * and use the extrapolation mask to essentially move the pixels right while
 * copying the left border pixel to the vector positions that logically point
 * outside of the buffer.
 *
 * For blocks straddling the right border, just read over the right border,
 * and extrapolate all pixels beyond the border idx to copy the value of the
 * border pixel. An exception is right == width (the leftmost reference pixel
 * is one place right of the right border, which is awkward because the pixel
 * to extrapolate from is located at relative X offset -1); in that case,
 * abuse the left-border aligning functionality instead to read starting from
 * the valid border pixel, and use a suitable mask to fill all the other
 * pixels with that value.
 */
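/*
 * Worked example (illustrative, for the 16-wide case below): with left == 2
 * and right == 0, border_idx == 2, leftoff == 2 and
 *   mask1     == { -2, -1, 0, 1, ..., 13 }
 *   epol_mask == {  0,  0, 0, 1, ..., 13 }
 * so the row is read starting at the border pixel ref_data[2], and the two
 * leftmost vector bytes both receive that border pixel while the remaining
 * bytes get the in-frame pixels unchanged. With left == 0 and right == 2,
 * epol_mask ends in { ..., 13, 13, 13 }, duplicating the last valid pixel
 * over the two overhanging byte positions.
 */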
static uint32_t hor_sad_sse41_w16(const uint8_t *pic_data, const uint8_t *ref_data,
                                  int32_t height, uint32_t pic_stride, uint32_t ref_stride,
                                  const uint32_t left, const uint32_t right)
{
  // right is the number of overhanging pixels in the vector, so it has to be
  // handled this way to produce the index of last valid (border) pixel
  const int32_t right_border_idx = 15 - right;
  const int32_t border_idx       = left ? left : right_border_idx;

  const __m128i ns = _mm_setr_epi8(0, 1, 2,  3,  4,  5,  6,  7,
                                   8, 9, 10, 11, 12, 13, 14, 15);
  const __m128i zero = _mm_setzero_si128();

  // Dirty hack alert! If right == block_width (ie. the entire vector is
  // outside the frame), move the block offset one pixel to the left (so
  // that the leftmost pixel in vector is actually the valid border pixel
  // from which we want to extrapolate), and use an epol mask that will
  // simply stretch the pixel all over the vector.
  //
  // To avoid a branch here:
  // The mask will be -1 (0xffffffff) for border_idx -1 and 0 for >= 0
  const int32_t border_idx_negative = border_idx >> 31;
  const int32_t leftoff             = border_idx_negative | left;

  __m128i right_border_idxs = _mm_set1_epi8((int8_t)right_border_idx);
  __m128i left_128          = _mm_set1_epi8((int8_t)left);

  // If we're straddling the left border, right_border_idx is 15 and the first
  // operation does nothing. If right border, left is 0 and the second
  // operation does nothing.
  __m128i mask_right = _mm_min_epi8(ns, right_border_idxs);
  __m128i mask1      = _mm_sub_epi8(mask_right, left_128);

  // If right == 16 (we're completely outside the frame), right_border_idx is
  // -1 and so is mask1. Clamp negative values to zero and as discussed
  // earlier, adjust the load offset instead to load the "-1'st" pixel and
  // using an all-zero shuffle mask, broadcast it all over the vector.
  const __m128i epol_mask = _mm_max_epi8(mask1, zero);

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;
  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride));
    __m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + leftoff));
    __m128i c = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * pic_stride));
    __m128i d = _mm_loadu_si128((__m128i *)(ref_data + (y + 1) * ref_stride + leftoff));
    __m128i e = _mm_loadu_si128((__m128i *)(pic_data + (y + 2) * pic_stride));
    __m128i f = _mm_loadu_si128((__m128i *)(ref_data + (y + 2) * ref_stride + leftoff));
    __m128i g = _mm_loadu_si128((__m128i *)(pic_data + (y + 3) * pic_stride));
    __m128i h = _mm_loadu_si128((__m128i *)(ref_data + (y + 3) * ref_stride + leftoff));

    __m128i b_epol = _mm_shuffle_epi8(b, epol_mask);
    __m128i d_epol = _mm_shuffle_epi8(d, epol_mask);
    __m128i f_epol = _mm_shuffle_epi8(f, epol_mask);
    __m128i h_epol = _mm_shuffle_epi8(h, epol_mask);

    __m128i curr_sads_ab = _mm_sad_epu8(a, b_epol);
    __m128i curr_sads_cd = _mm_sad_epu8(c, d_epol);
    __m128i curr_sads_ef = _mm_sad_epu8(e, f_epol);
    __m128i curr_sads_gh = _mm_sad_epu8(g, h_epol);

    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride));
      __m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + leftoff));
      __m128i b_epol    = _mm_shuffle_epi8(b, epol_mask);
      __m128i curr_sads = _mm_sad_epu8(a, b_epol);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

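/*
 * Rough outline of the arbitrary-width version below (descriptive comment
 * added for clarity): the row is processed 16 bytes at a time. Vectors that
 * lie entirely outside the frame ("outside_vecs") are compared against a
 * vector filled with the border pixel of the current row, while vectors that
 * overlap valid data ("inside_vecs") are shifted by left_offset with a byte
 * shuffle, patching in bytes carried over from the previously processed
 * vector and clamping any bytes beyond the block width so they contribute
 * nothing to the SAD. Four rows are handled per iteration, with a tail loop
 * for the remaining rows.
 */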
static INLINE uint32_t hor_sad_sse41_arbitrary(const uint8_t *pic_data, const uint8_t *ref_data,
                                               int32_t width, int32_t height, uint32_t pic_stride,
                                               uint32_t ref_stride, uint32_t left, uint32_t right)
{
  __m128i sse_inc = _mm_setzero_si128();

  const size_t vec_width      = 16;
  const size_t vecwid_bitmask = 15;
  const size_t vec_width_log2 = 4;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  const __m128i rights     = _mm_set1_epi8((uint8_t)right);
  const __m128i blk_widths = _mm_set1_epi8((uint8_t)width);
  const __m128i vec_widths = _mm_set1_epi8((uint8_t)vec_width);
  const __m128i nslo       = _mm_setr_epi8(0, 1, 2,  3,  4,  5,  6,  7,
                                           8, 9, 10, 11, 12, 13, 14, 15);

  uint32_t outside_vecs, inside_vecs, left_offset, is_left_bm;
  int32_t  outside_width, inside_width, border_off, invec_lstart,
           invec_lend, invec_linc;
  if (left) {
    outside_vecs  = left >> vec_width_log2;
    inside_vecs   = (( width + vecwid_bitmask) >> vec_width_log2) - outside_vecs;
    outside_width = outside_vecs * vec_width;
    inside_width  = inside_vecs  * vec_width;
    left_offset   = left;
    border_off    = left;
    invec_lstart  = 0;
    invec_lend    = inside_vecs;
    invec_linc    = 1;
    is_left_bm    = -1;
  } else {
    inside_vecs   = ((width - right) + vecwid_bitmask) >> vec_width_log2;
    outside_vecs  = (( width + vecwid_bitmask) >> vec_width_log2) - inside_vecs;
    outside_width = outside_vecs * vec_width;
    inside_width  = inside_vecs  * vec_width;
    left_offset   = right - width;
    border_off    = width - 1 - right;
    invec_lstart  = inside_vecs - 1;
    invec_lend    = -1;
    invec_linc    = -1;
    is_left_bm    = 0;
  }
  left_offset &= vecwid_bitmask;

  const __m128i left_offsets = _mm_set1_epi8 ((uint8_t)left_offset);
  const __m128i is_left      = _mm_cmpeq_epi8(rights, _mm_setzero_si128());
  const __m128i vw_for_left  = _mm_and_si128 (is_left, vec_widths);

  // -x == (x ^ 0xff) + 1 == (x ^ 0xff) - 0xff. Also x == (x ^ 0x00) - 0x00.
  // In other words, calculate the negation of left_offsets if is_left is true.
  const __m128i offs_neg     = _mm_xor_si128 (left_offsets, is_left);
  const __m128i offs_for_sm1 = _mm_sub_epi8  (offs_neg, is_left);

  const __m128i ns_for_sm1   = _mm_or_si128  (vw_for_left, nslo);
  const __m128i shufmask1    = _mm_add_epi8  (ns_for_sm1, offs_for_sm1);

  const __m128i mo2bmask_l   = _mm_cmpgt_epi8(left_offsets, nslo);
  const __m128i mo2bimask_l  = _mm_cmpeq_epi8(mo2bmask_l, _mm_setzero_si128());
  const __m128i mo2bimask_r  = _mm_cmpgt_epi8(vec_widths, shufmask1);
  const __m128i move_old_to_b_imask = _mm_blendv_epi8(mo2bimask_r, mo2bimask_l, is_left);

  const int32_t outvec_offset = (~is_left_bm) & inside_width;
  int32_t x, y;
  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i borderpx_vec_b = _mm_set1_epi8(ref_data[(int32_t)((y + 0) * ref_stride + border_off)]);
    __m128i borderpx_vec_d = _mm_set1_epi8(ref_data[(int32_t)((y + 1) * ref_stride + border_off)]);
    __m128i borderpx_vec_f = _mm_set1_epi8(ref_data[(int32_t)((y + 2) * ref_stride + border_off)]);
    __m128i borderpx_vec_h = _mm_set1_epi8(ref_data[(int32_t)((y + 3) * ref_stride + border_off)]);

    for (x = 0; x < outside_vecs; x++) {
      __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + outvec_offset));
      __m128i c = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 1) * pic_stride + outvec_offset));
      __m128i e = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 2) * pic_stride + outvec_offset));
      __m128i g = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 3) * pic_stride + outvec_offset));

      __m128i startoffs = _mm_set1_epi8((x + inside_vecs) << vec_width_log2);
      __m128i ns        = _mm_add_epi8 (startoffs, nslo);

      // Unread imask is (is_left NOR unrd_imask_for_right), do the maths etc
      __m128i unrd_imask = _mm_cmpgt_epi8(blk_widths, ns);
      unrd_imask         = _mm_or_si128  (unrd_imask, is_left);
      __m128i unrd_mask  = _mm_cmpeq_epi8(unrd_imask, _mm_setzero_si128());

      __m128i b_unread = _mm_blendv_epi8(borderpx_vec_b, a, unrd_mask);
      __m128i d_unread = _mm_blendv_epi8(borderpx_vec_d, c, unrd_mask);
      __m128i f_unread = _mm_blendv_epi8(borderpx_vec_f, e, unrd_mask);
      __m128i h_unread = _mm_blendv_epi8(borderpx_vec_h, g, unrd_mask);

      __m128i sad_ab = _mm_sad_epu8(a, b_unread);
      __m128i sad_cd = _mm_sad_epu8(c, d_unread);
      __m128i sad_ef = _mm_sad_epu8(e, f_unread);
      __m128i sad_gh = _mm_sad_epu8(g, h_unread);

      sse_inc = _mm_add_epi64(sse_inc, sad_ab);
      sse_inc = _mm_add_epi64(sse_inc, sad_cd);
      sse_inc = _mm_add_epi64(sse_inc, sad_ef);
      sse_inc = _mm_add_epi64(sse_inc, sad_gh);
    }
    int32_t a_off = outside_width & is_left_bm;
    int32_t leftoff_with_sign_neg = (left_offset ^ is_left_bm) - is_left_bm;

    __m128i old_b = borderpx_vec_b;
    __m128i old_d = borderpx_vec_d;
    __m128i old_f = borderpx_vec_f;
    __m128i old_h = borderpx_vec_h;

    for (x = invec_lstart; x != invec_lend; x += invec_linc) {
      __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + a_off));
      __m128i c = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 1) * pic_stride + a_off));
      __m128i e = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 2) * pic_stride + a_off));
      __m128i g = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 3) * pic_stride + a_off));
      __m128i b = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 0) * ref_stride + a_off - leftoff_with_sign_neg));
      __m128i d = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 1) * ref_stride + a_off - leftoff_with_sign_neg));
      __m128i f = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 2) * ref_stride + a_off - leftoff_with_sign_neg));
      __m128i h = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 3) * ref_stride + a_off - leftoff_with_sign_neg));

      __m128i b_shifted = _mm_shuffle_epi8(b, shufmask1);
      __m128i d_shifted = _mm_shuffle_epi8(d, shufmask1);
      __m128i f_shifted = _mm_shuffle_epi8(f, shufmask1);
      __m128i h_shifted = _mm_shuffle_epi8(h, shufmask1);

      __m128i b_with_old = _mm_blendv_epi8(old_b, b_shifted, move_old_to_b_imask);
      __m128i d_with_old = _mm_blendv_epi8(old_d, d_shifted, move_old_to_b_imask);
      __m128i f_with_old = _mm_blendv_epi8(old_f, f_shifted, move_old_to_b_imask);
      __m128i h_with_old = _mm_blendv_epi8(old_h, h_shifted, move_old_to_b_imask);

      uint8_t startoff   = (x << vec_width_log2) + a_off;
      __m128i startoffs  = _mm_set1_epi8 (startoff);
      __m128i curr_ns    = _mm_add_epi8  (startoffs, nslo);
      __m128i unrd_imask = _mm_cmpgt_epi8(blk_widths, curr_ns);
      __m128i unrd_mask  = _mm_cmpeq_epi8(unrd_imask, _mm_setzero_si128());

      __m128i b_unread = _mm_blendv_epi8(b_with_old, a, unrd_mask);
      __m128i d_unread = _mm_blendv_epi8(d_with_old, c, unrd_mask);
      __m128i f_unread = _mm_blendv_epi8(f_with_old, e, unrd_mask);
      __m128i h_unread = _mm_blendv_epi8(h_with_old, g, unrd_mask);

      old_b = b_shifted;
      old_d = d_shifted;
      old_f = f_shifted;
      old_h = h_shifted;

      __m128i sad_ab = _mm_sad_epu8(a, b_unread);
      __m128i sad_cd = _mm_sad_epu8(c, d_unread);
      __m128i sad_ef = _mm_sad_epu8(e, f_unread);
      __m128i sad_gh = _mm_sad_epu8(g, h_unread);

      sse_inc = _mm_add_epi64(sse_inc, sad_ab);
      sse_inc = _mm_add_epi64(sse_inc, sad_cd);
      sse_inc = _mm_add_epi64(sse_inc, sad_ef);
      sse_inc = _mm_add_epi64(sse_inc, sad_gh);
    }
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i borderpx_vec = _mm_set1_epi8(ref_data[(int32_t)((y + 0) * ref_stride + border_off)]);
      for (x = 0; x < outside_vecs; x++) {
        __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + outvec_offset));

        __m128i startoffs = _mm_set1_epi8((x + inside_vecs) << vec_width_log2);
        __m128i ns        = _mm_add_epi8 (startoffs, nslo);

        // Unread imask is (is_left NOR unrd_imask_for_right), do the maths etc
        __m128i unrd_imask = _mm_cmpgt_epi8(blk_widths, ns);
        unrd_imask         = _mm_or_si128  (unrd_imask, is_left);
        __m128i unrd_mask  = _mm_cmpeq_epi8(unrd_imask, _mm_setzero_si128());
        __m128i b_unread   = _mm_blendv_epi8(borderpx_vec, a, unrd_mask);

        __m128i sad_ab = _mm_sad_epu8(a, b_unread);
        sse_inc = _mm_add_epi64(sse_inc, sad_ab);
      }
      int32_t a_off = outside_width & is_left_bm;
      int32_t leftoff_with_sign_neg = (left_offset ^ is_left_bm) - is_left_bm;

      __m128i old_b = borderpx_vec;
      for (x = invec_lstart; x != invec_lend; x += invec_linc) {
        __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + a_off));
        __m128i b = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 0) * ref_stride + a_off - leftoff_with_sign_neg));

        __m128i b_shifted  = _mm_shuffle_epi8(b, shufmask1);
        __m128i b_with_old = _mm_blendv_epi8 (old_b, b_shifted, move_old_to_b_imask);

        uint8_t startoff   = (x << vec_width_log2) + a_off;
        __m128i startoffs  = _mm_set1_epi8 (startoff);
        __m128i curr_ns    = _mm_add_epi8  (startoffs, nslo);
        __m128i unrd_imask = _mm_cmpgt_epi8(blk_widths, curr_ns);
        __m128i unrd_mask  = _mm_cmpeq_epi8(unrd_imask, _mm_setzero_si128());
        __m128i b_unread   = _mm_blendv_epi8(b_with_old, a, unrd_mask);

        old_b = b_shifted;

        __m128i sad_ab = _mm_sad_epu8(a, b_unread);
        sse_inc = _mm_add_epi64(sse_inc, sad_ab);
      }
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

#endif // KVZ_BIT_DEPTH == 8

#endif