/*****************************************************************************
 * This file is part of Kvazaar HEVC encoder.
 *
 * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice, this
 *   list of conditions and the following disclaimer in the documentation and/or
 *   other materials provided with the distribution.
 *
 * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ****************************************************************************/

#ifndef REG_SAD_POW2_WIDTHS_SSE41_H_
#define REG_SAD_POW2_WIDTHS_SSE41_H_

#include "kvazaar.h"

#if KVZ_BIT_DEPTH == 8

#include "strategies/missing-intel-intrinsics.h"
#include <immintrin.h>

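/*
 * SSE4.1 SAD kernels for 8-bit pixels, one specialization per block width.
 * Every kernel follows the same pattern: _mm_sad_epu8 yields two 64-bit
 * partial sums per 16-byte register, which are accumulated in sse_inc; at
 * the end the upper half is folded onto the lower one and the 32-bit total
 * is extracted with _mm_cvtsi128_si32.
 */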
static INLINE uint32_t reg_sad_w0(const uint8_t * const data1, const uint8_t * const data2,
                                  const int32_t height, const uint32_t stride1,
                                  const uint32_t stride2)
{
  return 0;
}

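// Width 4: gather four 4-pixel rows of each block into one XMM register with
// _mm_insert_epi32, so a single _mm_sad_epu8 covers four rows. Leftover rows
// (height % 4) are handled one dword at a time.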
static INLINE uint32_t reg_sad_w4(const uint8_t * const data1, const uint8_t * const data2,
                                  const int32_t height, const uint32_t stride1,
                                  const uint32_t stride2)
{
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i a = _mm_cvtsi32_si128(*(uint32_t *)(data1 + y * stride1));
    __m128i b = _mm_cvtsi32_si128(*(uint32_t *)(data2 + y * stride2));

    a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 1) * stride1), 1);
    b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 1) * stride2), 1);
    a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 2) * stride1), 2);
    b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 2) * stride2), 2);
    a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 3) * stride1), 3);
    b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 3) * stride2), 3);

    __m128i curr_sads = _mm_sad_epu8(a, b);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(data1 + y * stride1));
      __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(data2 + y * stride2));

      __m128i curr_sads = _mm_sad_epu8(a, b);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

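// Width 8: pack two 8-pixel rows per register using _mm_loadl_pd/_mm_loadh_pd
// (the double loads are used purely to fill the low and high halves), giving
// four rows of SAD per iteration.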
static INLINE uint32_t reg_sad_w8(const uint8_t * const data1, const uint8_t * const data2,
                                  const int32_t height, const uint32_t stride1,
                                  const uint32_t stride2)
{
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128d a_d = _mm_setzero_pd();
    __m128d b_d = _mm_setzero_pd();
    __m128d c_d = _mm_setzero_pd();
    __m128d d_d = _mm_setzero_pd();

    a_d = _mm_loadl_pd(a_d, (const double *)(data1 + (y + 0) * stride1));
    b_d = _mm_loadl_pd(b_d, (const double *)(data2 + (y + 0) * stride2));
    a_d = _mm_loadh_pd(a_d, (const double *)(data1 + (y + 1) * stride1));
    b_d = _mm_loadh_pd(b_d, (const double *)(data2 + (y + 1) * stride2));

    c_d = _mm_loadl_pd(c_d, (const double *)(data1 + (y + 2) * stride1));
    d_d = _mm_loadl_pd(d_d, (const double *)(data2 + (y + 2) * stride2));
    c_d = _mm_loadh_pd(c_d, (const double *)(data1 + (y + 3) * stride1));
    d_d = _mm_loadh_pd(d_d, (const double *)(data2 + (y + 3) * stride2));

    __m128i a = _mm_castpd_si128(a_d);
    __m128i b = _mm_castpd_si128(b_d);
    __m128i c = _mm_castpd_si128(c_d);
    __m128i d = _mm_castpd_si128(d_d);

    __m128i curr_sads_ab = _mm_sad_epu8(a, b);
    __m128i curr_sads_cd = _mm_sad_epu8(c, d);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i a = _mm_loadl_epi64((__m128i *)(data1 + y * stride1));
      __m128i b = _mm_loadl_epi64((__m128i *)(data2 + y * stride2));

      __m128i curr_sads_ab = _mm_sad_epu8(a, b);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

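// Width 12: load 16 bytes per row and overwrite the last 4 bytes of the
// second operand with those of the first via _mm_blend_epi16(a, b, 0x3f), so
// the lanes beyond the 12-pixel width contribute zero to the SAD.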
static INLINE uint32_t reg_sad_w12(const uint8_t * const data1, const uint8_t * const data2,
                                   const int32_t height, const uint32_t stride1,
                                   const uint32_t stride2)
{
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;
  for (y = 0; y < height; y++) {
    __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1));
    __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2));

    __m128i b_masked  = _mm_blend_epi16(a, b, 0x3f);
    __m128i curr_sads = _mm_sad_epu8   (a, b_masked);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

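// Width 16: one full-register load per row, four rows per iteration.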
static INLINE uint32_t reg_sad_w16(const uint8_t * const data1, const uint8_t * const data2,
                                   const int32_t height, const uint32_t stride1,
                                   const uint32_t stride2)
{
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1));
    __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2));
    __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1));
    __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2));
    __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1));
    __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2));
    __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1));
    __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2));

    __m128i curr_sads_ab = _mm_sad_epu8(a, b);
    __m128i curr_sads_cd = _mm_sad_epu8(c, d);
    __m128i curr_sads_ef = _mm_sad_epu8(e, f);
    __m128i curr_sads_gh = _mm_sad_epu8(g, h);

    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1));
      __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2));

      __m128i curr_sads = _mm_sad_epu8(a, b);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
    }
  }

  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

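// Width 24: per pair of rows, one 16-byte load per row plus the two remaining
// 8-byte tails packed into a single register with _mm_loadl_pd/_mm_loadh_pd.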
static INLINE uint32_t reg_sad_w24(const uint8_t * const data1, const uint8_t * const data2,
                                   const int32_t height, const uint32_t stride1,
                                   const uint32_t stride2)
{
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_doublelines = height & ~1;
  const int32_t height_parity      = height &  1;

  for (y = 0; y < height_doublelines; y += 2) {
    __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1));
    __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2));
    __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1));
    __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2));

    __m128d e_d = _mm_setzero_pd();
    __m128d f_d = _mm_setzero_pd();

    e_d = _mm_loadl_pd(e_d, (const double *)(data1 + (y + 0) * stride1 + 16));
    f_d = _mm_loadl_pd(f_d, (const double *)(data2 + (y + 0) * stride2 + 16));
    e_d = _mm_loadh_pd(e_d, (const double *)(data1 + (y + 1) * stride1 + 16));
    f_d = _mm_loadh_pd(f_d, (const double *)(data2 + (y + 1) * stride2 + 16));

    __m128i e = _mm_castpd_si128(e_d);
    __m128i f = _mm_castpd_si128(f_d);

    __m128i curr_sads_1 = _mm_sad_epu8(a, b);
    __m128i curr_sads_2 = _mm_sad_epu8(c, d);
    __m128i curr_sads_3 = _mm_sad_epu8(e, f);

    sse_inc = _mm_add_epi64(sse_inc, curr_sads_1);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_2);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_3);
  }
  if (height_parity) {
    __m128i a = _mm_loadu_si128   ((const __m128i *)(data1 + y * stride1));
    __m128i b = _mm_loadu_si128   ((const __m128i *)(data2 + y * stride2));
    __m128i c = _mm_loadl_epi64   ((const __m128i *)(data1 + y * stride1 + 16));
    __m128i d = _mm_loadl_epi64   ((const __m128i *)(data2 + y * stride2 + 16));

    __m128i curr_sads_1 = _mm_sad_epu8(a, b);
    __m128i curr_sads_2 = _mm_sad_epu8(c, d);

    sse_inc = _mm_add_epi64(sse_inc, curr_sads_1);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_2);
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

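// Any width: process the width in whole 16-byte vectors; the residual
// width & 15 pixels are handled by blending the first operand's bytes over
// the unused lanes of the second (rdmask), so those lanes cancel out in the
// SAD.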
static INLINE uint32_t reg_sad_arbitrary(const uint8_t * const data1, const uint8_t * const data2,
                                         const int32_t width, const int32_t height, const uint32_t stride1,
                                         const uint32_t stride2)
{
  int32_t y, x;
  __m128i sse_inc = _mm_setzero_si128();

  // Number of bytes per scanline that fit in whole 128-bit vectors, and the
  // remaining pixels
  const int32_t width_xmms             = width  & ~15;
  const int32_t width_residual_pixels  = width  &  15;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  const __m128i rds    = _mm_set1_epi8 (width_residual_pixels);
  const __m128i ns     = _mm_setr_epi8 (0,  1,  2,  3,  4,  5,  6,  7,
                                        8,  9,  10, 11, 12, 13, 14, 15);
  const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);

  for (x = 0; x < width_xmms; x += 16) {
    for (y = 0; y < height_fourline_groups; y += 4) {
      __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1 + x));
      __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2 + x));
      __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1 + x));
      __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2 + x));
      __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1 + x));
      __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2 + x));
      __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1 + x));
      __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2 + x));

      __m128i curr_sads_ab = _mm_sad_epu8(a, b);
      __m128i curr_sads_cd = _mm_sad_epu8(c, d);
      __m128i curr_sads_ef = _mm_sad_epu8(e, f);
      __m128i curr_sads_gh = _mm_sad_epu8(g, h);

      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
    }
    if (height_residual_lines) {
      for (; y < height; y++) {
        __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1 + x));
        __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2 + x));

        __m128i curr_sads = _mm_sad_epu8(a, b);

        sse_inc = _mm_add_epi64(sse_inc, curr_sads);
      }
    }
  }

  if (width_residual_pixels) {
    for (y = 0; y < height_fourline_groups; y += 4) {
      __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1 + x));
      __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2 + x));
      __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1 + x));
      __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2 + x));
      __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1 + x));
      __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2 + x));
      __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1 + x));
      __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2 + x));

      __m128i b_masked     = _mm_blendv_epi8(a, b, rdmask);
      __m128i d_masked     = _mm_blendv_epi8(c, d, rdmask);
      __m128i f_masked     = _mm_blendv_epi8(e, f, rdmask);
      __m128i h_masked     = _mm_blendv_epi8(g, h, rdmask);

      __m128i curr_sads_ab = _mm_sad_epu8   (a, b_masked);
      __m128i curr_sads_cd = _mm_sad_epu8   (c, d_masked);
      __m128i curr_sads_ef = _mm_sad_epu8   (e, f_masked);
      __m128i curr_sads_gh = _mm_sad_epu8   (g, h_masked);

      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
    }
    if (height_residual_lines) {
      for (; y < height; y++) {
        __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1 + x));
        __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2 + x));

        __m128i b_masked  = _mm_blendv_epi8(a, b, rdmask);
        __m128i curr_sads = _mm_sad_epu8   (a, b_masked);

        sse_inc = _mm_add_epi64(sse_inc, curr_sads);
      }
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

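// ver_sad_*: SAD between the block and a single reference scanline that is
// repeated for every row of the block. Width 4 variant.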
static uint32_t ver_sad_w4(const uint8_t *pic_data, const uint8_t *ref_data,
                           int32_t height, uint32_t stride)
{
  __m128i ref_row = _mm_set1_epi32(*(const uint32_t *)ref_data);
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i a = _mm_cvtsi32_si128(*(uint32_t *)(pic_data + y * stride));

    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 1) * stride), 1);
    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 2) * stride), 2);
    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 3) * stride), 3);

    __m128i curr_sads = _mm_sad_epu8(a, ref_row);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
  }
  if (height_residual_lines) {
    // Only pick the last dword, because we're comparing single dwords (lines)
    ref_row = _mm_bsrli_si128(ref_row, 12);

    for (; y < height; y++) {
      __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * stride));

      __m128i curr_sads = _mm_sad_epu8(a, ref_row);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

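// Width 8 variant: two picture rows per register as in reg_sad_w8, compared
// against the reference row replicated into both halves.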
static uint32_t ver_sad_w8(const uint8_t *pic_data, const uint8_t *ref_data,
                           int32_t height, uint32_t stride)
{
  const __m128i ref_row = _mm_set1_epi64x(*(const uint64_t *)ref_data);
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128d a_d = _mm_setzero_pd();
    __m128d c_d = _mm_setzero_pd();

    a_d = _mm_loadl_pd(a_d, (const double *)(pic_data + (y + 0) * stride));
    a_d = _mm_loadh_pd(a_d, (const double *)(pic_data + (y + 1) * stride));

    c_d = _mm_loadl_pd(c_d, (const double *)(pic_data + (y + 2) * stride));
    c_d = _mm_loadh_pd(c_d, (const double *)(pic_data + (y + 3) * stride));

    __m128i a = _mm_castpd_si128(a_d);
    __m128i c = _mm_castpd_si128(c_d);

    __m128i curr_sads_ab = _mm_sad_epu8(a, ref_row);
    __m128i curr_sads_cd = _mm_sad_epu8(c, ref_row);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
  }
  if (height_residual_lines) {
    __m128i b = _mm_move_epi64(ref_row);

    for (; y < height; y++) {
      __m128i a = _mm_loadl_epi64((__m128i *)(pic_data + y * stride));

      __m128i curr_sads_ab = _mm_sad_epu8(a, b);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

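// Width 12 variant: same _mm_blend_epi16 masking trick as reg_sad_w12, with
// the reference row's last 4 bytes copied over the picture row so those
// lanes cancel out.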
static uint32_t ver_sad_w12(const uint8_t *pic_data, const uint8_t *ref_data,
                            int32_t height, uint32_t stride)
{
  const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data);
  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;

  for (y = 0; y < height; y++) {
    __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride));

    __m128i a_masked  = _mm_blend_epi16(ref_row, a, 0x3f);
    __m128i curr_sads = _mm_sad_epu8   (ref_row, a_masked);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads);
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

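// Width 16 variant: straight full-register compares against the reference row.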
static uint32_t ver_sad_w16(const uint8_t *pic_data, const uint8_t *ref_data,
                            int32_t height, uint32_t stride)
{
  const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data);
  __m128i sse_inc       = _mm_setzero_si128();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i pic_row_1   = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride));
    __m128i pic_row_2   = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * stride));
    __m128i pic_row_3   = _mm_loadu_si128((__m128i *)(pic_data + (y + 2) * stride));
    __m128i pic_row_4   = _mm_loadu_si128((__m128i *)(pic_data + (y + 3) * stride));

    __m128i curr_sads_1 = _mm_sad_epu8   (pic_row_1, ref_row);
    __m128i curr_sads_2 = _mm_sad_epu8   (pic_row_2, ref_row);
    __m128i curr_sads_3 = _mm_sad_epu8   (pic_row_3, ref_row);
    __m128i curr_sads_4 = _mm_sad_epu8   (pic_row_4, ref_row);

    sse_inc = _mm_add_epi64(sse_inc, curr_sads_1);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_2);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_3);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_4);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i pic_row   = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride));
      __m128i curr_sads = _mm_sad_epu8   (pic_row, ref_row);

      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

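// Arbitrary width variant: whole 16-byte vectors plus a masked residual,
// mirroring reg_sad_arbitrary but compared against a single reference row.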
static uint32_t ver_sad_arbitrary(const uint8_t *pic_data, const uint8_t *ref_data,
                                  int32_t width, int32_t height, uint32_t stride)
{
  int32_t y, x;
  __m128i sse_inc = _mm_setzero_si128();

  // Number of bytes per scanline that fit in whole 128-bit vectors, and the
  // remaining pixels
  const int32_t width_xmms             = width  & ~15;
  const int32_t width_residual_pixels  = width  &  15;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  const __m128i rds    = _mm_set1_epi8 (width_residual_pixels);
  const __m128i ns     = _mm_setr_epi8 (0,  1,  2,  3,  4,  5,  6,  7,
                                        8,  9,  10, 11, 12, 13, 14, 15);
  const __m128i rdmask = _mm_cmpgt_epi8(rds, ns);

  for (x = 0; x < width_xmms; x += 16) {
    const __m128i ref_row = _mm_loadu_si128((__m128i *)(ref_data + x));
    for (y = 0; y < height_fourline_groups; y += 4) {
      __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x));
      __m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x));
      __m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x));
      __m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x));

      __m128i curr_sads_ab = _mm_sad_epu8(ref_row, a);
      __m128i curr_sads_cd = _mm_sad_epu8(ref_row, c);
      __m128i curr_sads_ef = _mm_sad_epu8(ref_row, e);
      __m128i curr_sads_gh = _mm_sad_epu8(ref_row, g);

      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
    }
    if (height_residual_lines) {
      for (; y < height; y++) {
        __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x));

        __m128i curr_sads = _mm_sad_epu8(a, ref_row);

        sse_inc = _mm_add_epi64(sse_inc, curr_sads);
      }
    }
  }

  if (width_residual_pixels) {
    const __m128i ref_row = _mm_loadu_si128((__m128i *)(ref_data + x));
    for (y = 0; y < height_fourline_groups; y += 4) {
      __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x));
      __m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x));
      __m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x));
      __m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x));

      __m128i a_masked     = _mm_blendv_epi8(ref_row, a, rdmask);
      __m128i c_masked     = _mm_blendv_epi8(ref_row, c, rdmask);
      __m128i e_masked     = _mm_blendv_epi8(ref_row, e, rdmask);
      __m128i g_masked     = _mm_blendv_epi8(ref_row, g, rdmask);

      __m128i curr_sads_ab = _mm_sad_epu8   (ref_row, a_masked);
      __m128i curr_sads_cd = _mm_sad_epu8   (ref_row, c_masked);
      __m128i curr_sads_ef = _mm_sad_epu8   (ref_row, e_masked);
      __m128i curr_sads_gh = _mm_sad_epu8   (ref_row, g_masked);

      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
    }
    if (height_residual_lines) {
      for (; y < height; y++) {
        __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x));

        __m128i a_masked  = _mm_blendv_epi8(ref_row, a, rdmask);
        __m128i curr_sads = _mm_sad_epu8   (ref_row, a_masked);

        sse_inc = _mm_add_epi64(sse_inc, curr_sads);
      }
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

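// hor_sad_*: SAD against a reference block that sticks out over the left or
// right frame border by left/right pixels per scanline; the border pixel is
// extrapolated into the out-of-frame lanes with a byte shuffle. Width 4
// variant; see hor_sad_sse41_w8 and hor_sad_sse41_w16 for comments on how
// the extrapolation mask is built.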
static uint32_t hor_sad_sse41_w4(const uint8_t *pic_data, const uint8_t *ref_data,
                                 int32_t height, uint32_t pic_stride, uint32_t ref_stride,
                                 uint32_t left, uint32_t right)
{
  const int32_t right_border_idx = 3 - right;
  const int32_t border_idx       = left ? left : right_border_idx;

  const __m128i ns               = _mm_setr_epi8(0,  1,  2,  3,  4,  5,  6,  7,
                                                 8,  9,  10, 11, 12, 13, 14, 15);

  const int32_t border_idx_negative = border_idx >> 31;
  const int32_t leftoff             = border_idx_negative | left;

  // Doubleword (ie. line) base indexes, ie. the edges the lines read will be
  // clamped towards
  const __m128i dwbaseids   = _mm_setr_epi8(0, 0, 0, 0, 4, 4, 4, 4,
                                            8, 8, 8, 8, 12, 12, 12, 12);

  __m128i right_border_idxs = _mm_set1_epi8((int8_t)right_border_idx);
  __m128i left_128          = _mm_set1_epi8((int8_t)left);

  right_border_idxs         = _mm_add_epi8 (right_border_idxs, dwbaseids);

  __m128i mask_right        = _mm_min_epi8 (ns,         right_border_idxs);
  __m128i mask1             = _mm_sub_epi8 (mask_right, left_128);

  const __m128i epol_mask   = _mm_max_epi8(mask1, dwbaseids);

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;
  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * pic_stride));
    __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(ref_data + y * ref_stride + leftoff));

    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 1) * pic_stride),           1);
    b = _mm_insert_epi32(b, *(const uint32_t *)(ref_data + (y + 1) * ref_stride + leftoff), 1);
    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 2) * pic_stride),           2);
    b = _mm_insert_epi32(b, *(const uint32_t *)(ref_data + (y + 2) * ref_stride + leftoff), 2);
    a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 3) * pic_stride),           3);
    b = _mm_insert_epi32(b, *(const uint32_t *)(ref_data + (y + 3) * ref_stride + leftoff), 3);

    __m128i b_epol    = _mm_shuffle_epi8(b,       epol_mask);
    __m128i curr_sads = _mm_sad_epu8    (a,       b_epol);
            sse_inc   = _mm_add_epi64   (sse_inc, curr_sads);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * pic_stride));
      __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(ref_data + y * ref_stride + leftoff));

      __m128i b_epol = _mm_shuffle_epi8(b, epol_mask);
      __m128i curr_sads = _mm_sad_epu8 (a, b_epol);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);

  return _mm_cvtsi128_si32(sad);
}

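// Width 8 variant of the border-extrapolating SAD; the same mask logic as
// the w4 version above, but with quadword (8-byte) lines.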
static uint32_t hor_sad_sse41_w8(const uint8_t *pic_data, const uint8_t *ref_data,
                                 int32_t height, uint32_t pic_stride, uint32_t ref_stride,
                                 uint32_t left, uint32_t right)
{
  // right is the number of overhanging pixels in the vector, so it has to be
  // handled this way to produce the index of last valid (border) pixel
  const int32_t right_border_idx = 7 - right;
  const int32_t border_idx       = left ? left : right_border_idx;

  const __m128i ns               = _mm_setr_epi8(0,  1,  2,  3,  4,  5,  6,  7,
                                                 8,  9,  10, 11, 12, 13, 14, 15);

  // Quadword (ie. line) base indexes, ie. the edges the lines read will be
  // clamped towards; higher qword (lower line) bytes tend towards 8 and lower
  // qword (higher line) bytes towards 0
  const __m128i qwbaseids   = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                                            8, 8, 8, 8, 8, 8, 8, 8);

  // Dirty hack alert! If right == block_width (ie. the entire vector is
  // outside the frame), move the block offset one pixel to the left (so
  // that the leftmost pixel in vector is actually the valid border pixel
  // from which we want to extrapolate), and use an epol mask that will
  // simply stretch the pixel all over the vector.
  //
  // To avoid a branch here:
  // The mask will be -1 (0xffffffff) for border_idx -1 and 0 for >= 0
  const int32_t border_idx_negative = border_idx >> 31;
  const int32_t leftoff             = border_idx_negative | left;

  __m128i right_border_idxs = _mm_set1_epi8((int8_t)right_border_idx);
  __m128i left_128          = _mm_set1_epi8((int8_t)left);

  right_border_idxs         = _mm_add_epi8 (right_border_idxs, qwbaseids);

  // If we're straddling the left border, right_border_idx is 7 and the first
  // operation does nothing. If right border, left is 0 and the second
  // operation does nothing.
  __m128i mask_right        = _mm_min_epi8 (ns,         right_border_idxs);
  __m128i mask1             = _mm_sub_epi8 (mask_right, left_128);

  // If right == 8 (we're completely outside the frame), right_border_idx is
  // -1 and so is mask1. Clamp negative values to qwbaseid and as discussed
  // earlier, adjust the load offset instead to load the "-1'st" pixels and
  // using qwbaseids as the shuffle mask, broadcast it all over the rows.
  const __m128i epol_mask = _mm_max_epi8(mask1, qwbaseids);

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;
  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128d a_d = _mm_setzero_pd();
    __m128d b_d = _mm_setzero_pd();
    __m128d c_d = _mm_setzero_pd();
    __m128d d_d = _mm_setzero_pd();

    a_d = _mm_loadl_pd(a_d, (const double *)(pic_data + (y + 0) * pic_stride));
    b_d = _mm_loadl_pd(b_d, (const double *)(ref_data + (y + 0) * ref_stride + leftoff));
    a_d = _mm_loadh_pd(a_d, (const double *)(pic_data + (y + 1) * pic_stride));
    b_d = _mm_loadh_pd(b_d, (const double *)(ref_data + (y + 1) * ref_stride + leftoff));

    c_d = _mm_loadl_pd(c_d, (const double *)(pic_data + (y + 2) * pic_stride));
    d_d = _mm_loadl_pd(d_d, (const double *)(ref_data + (y + 2) * ref_stride + leftoff));
    c_d = _mm_loadh_pd(c_d, (const double *)(pic_data + (y + 3) * pic_stride));
    d_d = _mm_loadh_pd(d_d, (const double *)(ref_data + (y + 3) * ref_stride + leftoff));

    __m128i a = _mm_castpd_si128(a_d);
    __m128i b = _mm_castpd_si128(b_d);
    __m128i c = _mm_castpd_si128(c_d);
    __m128i d = _mm_castpd_si128(d_d);

    __m128i b_epol = _mm_shuffle_epi8(b, epol_mask);
    __m128i d_epol = _mm_shuffle_epi8(d, epol_mask);

    __m128i curr_sads_ab = _mm_sad_epu8(a, b_epol);
    __m128i curr_sads_cd = _mm_sad_epu8(c, d_epol);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i a = _mm_loadl_epi64((__m128i *)(pic_data + y * pic_stride));
      __m128i b = _mm_loadl_epi64((__m128i *)(ref_data + y * ref_stride + leftoff));

      __m128i b_epol = _mm_shuffle_epi8(b, epol_mask);

      __m128i curr_sads_ab = _mm_sad_epu8(a, b_epol);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

/*
 * left and right measure how many pixels of one horizontal scanline fall
 * outside the left or the right frame border. For blocks straddling the
 * left border, read the scanlines starting from the border instead, and use
 * the extrapolation mask to shift the pixels right while copying the left
 * border pixel into the vector positions that logically point outside of
 * the buffer.
 *
 * For blocks straddling the right border, just read over the right border
 * and extrapolate all pixels beyond the border index from the value of the
 * border pixel. The exception is right == width: the whole vector lies
 * outside the frame and the pixel to extrapolate from sits at relative X
 * offset -1, so abuse the left-border alignment path instead to read
 * starting from that valid border pixel, and use a mask that fills every
 * other position with its value.
 */
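/*
 * Worked example for the width-16 case below: with left == 3, the load is
 * offset so that byte 0 of the reference vector is the leftmost valid pixel,
 * and epol_mask becomes {0,0,0,0,1,2,...,12}, i.e. the row is shifted right
 * by three lanes with the border pixel filling the out-of-frame lanes. With
 * right == 3, epol_mask is {0,1,...,12,12,12,12}, clamping the last three
 * lanes to the rightmost valid pixel.
 */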
static uint32_t hor_sad_sse41_w16(const uint8_t *pic_data, const uint8_t *ref_data,
                                  int32_t height, uint32_t pic_stride, uint32_t ref_stride,
                                  const uint32_t left, const uint32_t right)
{
  // right is the number of overhanging pixels in the vector, so it has to be
  // handled this way to produce the index of last valid (border) pixel
  const int32_t right_border_idx = 15 - right;
  const int32_t border_idx       = left ? left : right_border_idx;

  const __m128i ns               = _mm_setr_epi8(0,  1,  2,  3,  4,  5,  6,  7,
                                                 8,  9,  10, 11, 12, 13, 14, 15);
  const __m128i zero             = _mm_setzero_si128();

  // Dirty hack alert! If right == block_width (ie. the entire vector is
  // outside the frame), move the block offset one pixel to the left (so
  // that the leftmost pixel in vector is actually the valid border pixel
  // from which we want to extrapolate), and use an epol mask that will
  // simply stretch the pixel all over the vector.
  //
  // To avoid a branch here:
  // The mask will be -1 (0xffffffff) for border_idx -1 and 0 for >= 0
  const int32_t border_idx_negative = border_idx >> 31;
  const int32_t leftoff             = border_idx_negative | left;

  __m128i right_border_idxs = _mm_set1_epi8((int8_t)right_border_idx);
  __m128i left_128          = _mm_set1_epi8((int8_t)left);

  // If we're straddling the left border, right_border_idx is 15 and the first
  // operation does nothing. If right border, left is 0 and the second
  // operation does nothing.
  __m128i mask_right        = _mm_min_epi8 (ns,         right_border_idxs);
  __m128i mask1             = _mm_sub_epi8 (mask_right, left_128);

  // If right == 16 (we're completely outside the frame), right_border_idx is
  // -1 and so is mask1. Clamp negative values to zero and as discussed
  // earlier, adjust the load offset instead to load the "-1'st" pixel and
  // using an all-zero shuffle mask, broadcast it all over the vector.
  const __m128i epol_mask = _mm_max_epi8(mask1, zero);

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  __m128i sse_inc = _mm_setzero_si128();
  int32_t y;
  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride));
    __m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + leftoff));
    __m128i c = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * pic_stride));
    __m128i d = _mm_loadu_si128((__m128i *)(ref_data + (y + 1) * ref_stride + leftoff));
    __m128i e = _mm_loadu_si128((__m128i *)(pic_data + (y + 2) * pic_stride));
    __m128i f = _mm_loadu_si128((__m128i *)(ref_data + (y + 2) * ref_stride + leftoff));
    __m128i g = _mm_loadu_si128((__m128i *)(pic_data + (y + 3) * pic_stride));
    __m128i h = _mm_loadu_si128((__m128i *)(ref_data + (y + 3) * ref_stride + leftoff));

    __m128i b_epol = _mm_shuffle_epi8(b, epol_mask);
    __m128i d_epol = _mm_shuffle_epi8(d, epol_mask);
    __m128i f_epol = _mm_shuffle_epi8(f, epol_mask);
    __m128i h_epol = _mm_shuffle_epi8(h, epol_mask);

    __m128i curr_sads_ab = _mm_sad_epu8(a, b_epol);
    __m128i curr_sads_cd = _mm_sad_epu8(c, d_epol);
    __m128i curr_sads_ef = _mm_sad_epu8(e, f_epol);
    __m128i curr_sads_gh = _mm_sad_epu8(g, h_epol);

    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef);
    sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride));
      __m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + leftoff));
      __m128i b_epol = _mm_shuffle_epi8(b, epol_mask);
      __m128i curr_sads = _mm_sad_epu8(a, b_epol);
      sse_inc = _mm_add_epi64(sse_inc, curr_sads);
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

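// Arbitrary-width variant of the border-extrapolating SAD: reference vectors
// lying entirely outside the frame are replaced by a broadcast of the row's
// border pixel, while the in-frame vectors are realigned with shufmask1 and
// stitched with bytes carried over from the previously processed vector;
// lanes past the block width are neutralized before _mm_sad_epu8.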
static INLINE uint32_t hor_sad_sse41_arbitrary(const uint8_t *pic_data, const uint8_t *ref_data,
                                               int32_t width, int32_t height, uint32_t pic_stride,
                                               uint32_t ref_stride, uint32_t left, uint32_t right)
{
  __m128i sse_inc = _mm_setzero_si128();

  const size_t vec_width = 16;
  const size_t vecwid_bitmask = 15;
  const size_t vec_width_log2 = 4;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  const __m128i rights     = _mm_set1_epi8((uint8_t)right);
  const __m128i blk_widths = _mm_set1_epi8((uint8_t)width);
  const __m128i vec_widths = _mm_set1_epi8((uint8_t)vec_width);
  const __m128i nslo       = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

  uint32_t outside_vecs,  inside_vecs,  left_offset, is_left_bm;
  int32_t  outside_width, inside_width, border_off,  invec_lstart,
           invec_lend,    invec_linc;
  if (left) {
    outside_vecs  =    left                              >> vec_width_log2;
    inside_vecs   = (( width           + vecwid_bitmask) >> vec_width_log2) - outside_vecs;
    outside_width =    outside_vecs * vec_width;
    inside_width  =    inside_vecs  * vec_width;
    left_offset   =    left;
    border_off    =    left;
    invec_lstart  =    0;
    invec_lend    =    inside_vecs;
    invec_linc    =    1;
    is_left_bm    =    -1;
  } else {
    inside_vecs   =  ((width - right) + vecwid_bitmask)  >> vec_width_log2;
    outside_vecs  = (( width          + vecwid_bitmask)  >> vec_width_log2) - inside_vecs;
    outside_width =    outside_vecs * vec_width;
    inside_width  =    inside_vecs  * vec_width;
    left_offset   =    right - width;
    border_off    =    width - 1 - right;
    invec_lstart  =    inside_vecs - 1;
    invec_lend    =    -1;
    invec_linc    =    -1;
    is_left_bm    =    0;
  }
  left_offset &= vecwid_bitmask;

  const __m128i left_offsets = _mm_set1_epi8 ((uint8_t)left_offset);
  const __m128i is_left      = _mm_cmpeq_epi8(rights, _mm_setzero_si128());
  const __m128i vw_for_left  = _mm_and_si128 (is_left, vec_widths);

  // -x == (x ^ 0xff) + 1 = (x ^ 0xff) - 0xff. Also x == (x ^ 0x00) - 0x00.
  // In other words, negate left_offsets if is_left is true.
  const __m128i offs_neg            = _mm_xor_si128 (left_offsets, is_left);
  const __m128i offs_for_sm1        = _mm_sub_epi8  (offs_neg,     is_left);

  const __m128i ns_for_sm1          = _mm_or_si128  (vw_for_left,  nslo);
  const __m128i shufmask1           = _mm_add_epi8  (ns_for_sm1,   offs_for_sm1);

  const __m128i mo2bmask_l          = _mm_cmpgt_epi8(left_offsets, nslo);
  const __m128i mo2bimask_l         = _mm_cmpeq_epi8(mo2bmask_l,   _mm_setzero_si128());
  const __m128i mo2bimask_r         = _mm_cmpgt_epi8(vec_widths,   shufmask1);
  const __m128i move_old_to_b_imask = _mm_blendv_epi8(mo2bimask_r, mo2bimask_l, is_left);

  const int32_t outvec_offset = (~is_left_bm) & inside_width;
  int32_t x, y;
  for (y = 0; y < height_fourline_groups; y += 4) {
    __m128i borderpx_vec_b = _mm_set1_epi8(ref_data[(int32_t)((y + 0) * ref_stride + border_off)]);
    __m128i borderpx_vec_d = _mm_set1_epi8(ref_data[(int32_t)((y + 1) * ref_stride + border_off)]);
    __m128i borderpx_vec_f = _mm_set1_epi8(ref_data[(int32_t)((y + 2) * ref_stride + border_off)]);
    __m128i borderpx_vec_h = _mm_set1_epi8(ref_data[(int32_t)((y + 3) * ref_stride + border_off)]);

    for (x = 0; x < outside_vecs; x++) {
      __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + outvec_offset));
      __m128i c = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 1) * pic_stride + outvec_offset));
      __m128i e = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 2) * pic_stride + outvec_offset));
      __m128i g = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 3) * pic_stride + outvec_offset));

      __m128i startoffs  = _mm_set1_epi8  ((x + inside_vecs) << vec_width_log2);
      __m128i ns         = _mm_add_epi8   (startoffs, nslo);

      // unrd_mask marks the lanes at or past the block width (it is all
      // zeros when extending left): NOT((blk_widths > ns) OR is_left)
      __m128i unrd_imask = _mm_cmpgt_epi8 (blk_widths, ns);
              unrd_imask = _mm_or_si128   (unrd_imask, is_left);
      __m128i unrd_mask  = _mm_cmpeq_epi8 (unrd_imask, _mm_setzero_si128());

      __m128i b_unread   = _mm_blendv_epi8(borderpx_vec_b, a, unrd_mask);
      __m128i d_unread   = _mm_blendv_epi8(borderpx_vec_d, c, unrd_mask);
      __m128i f_unread   = _mm_blendv_epi8(borderpx_vec_f, e, unrd_mask);
      __m128i h_unread   = _mm_blendv_epi8(borderpx_vec_h, g, unrd_mask);

      __m128i sad_ab     = _mm_sad_epu8   (a, b_unread);
      __m128i sad_cd     = _mm_sad_epu8   (c, d_unread);
      __m128i sad_ef     = _mm_sad_epu8   (e, f_unread);
      __m128i sad_gh     = _mm_sad_epu8   (g, h_unread);

      sse_inc = _mm_add_epi64(sse_inc, sad_ab);
      sse_inc = _mm_add_epi64(sse_inc, sad_cd);
      sse_inc = _mm_add_epi64(sse_inc, sad_ef);
      sse_inc = _mm_add_epi64(sse_inc, sad_gh);
    }
    int32_t a_off = outside_width & is_left_bm;
    int32_t leftoff_with_sign_neg = (left_offset ^ is_left_bm) - is_left_bm;

    __m128i old_b = borderpx_vec_b;
    __m128i old_d = borderpx_vec_d;
    __m128i old_f = borderpx_vec_f;
    __m128i old_h = borderpx_vec_h;

    for (x = invec_lstart; x != invec_lend; x += invec_linc) {
      __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + a_off));
      __m128i c = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 1) * pic_stride + a_off));
      __m128i e = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 2) * pic_stride + a_off));
      __m128i g = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 3) * pic_stride + a_off));
      __m128i b = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 0) * ref_stride + a_off - leftoff_with_sign_neg));
      __m128i d = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 1) * ref_stride + a_off - leftoff_with_sign_neg));
      __m128i f = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 2) * ref_stride + a_off - leftoff_with_sign_neg));
      __m128i h = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 3) * ref_stride + a_off - leftoff_with_sign_neg));

      __m128i b_shifted    = _mm_shuffle_epi8(b,     shufmask1);
      __m128i d_shifted    = _mm_shuffle_epi8(d,     shufmask1);
      __m128i f_shifted    = _mm_shuffle_epi8(f,     shufmask1);
      __m128i h_shifted    = _mm_shuffle_epi8(h,     shufmask1);

      __m128i b_with_old   = _mm_blendv_epi8 (old_b, b_shifted, move_old_to_b_imask);
      __m128i d_with_old   = _mm_blendv_epi8 (old_d, d_shifted, move_old_to_b_imask);
      __m128i f_with_old   = _mm_blendv_epi8 (old_f, f_shifted, move_old_to_b_imask);
      __m128i h_with_old   = _mm_blendv_epi8 (old_h, h_shifted, move_old_to_b_imask);

      uint8_t startoff     = (x << vec_width_log2) + a_off;
      __m128i startoffs    = _mm_set1_epi8   (startoff);
      __m128i curr_ns      = _mm_add_epi8    (startoffs,    nslo);
      __m128i unrd_imask   = _mm_cmpgt_epi8  (blk_widths,   curr_ns);
      __m128i unrd_mask    = _mm_cmpeq_epi8  (unrd_imask,   _mm_setzero_si128());

      __m128i b_unread     = _mm_blendv_epi8 (b_with_old,   a, unrd_mask);
      __m128i d_unread     = _mm_blendv_epi8 (d_with_old,   c, unrd_mask);
      __m128i f_unread     = _mm_blendv_epi8 (f_with_old,   e, unrd_mask);
      __m128i h_unread     = _mm_blendv_epi8 (h_with_old,   g, unrd_mask);

      old_b = b_shifted;
      old_d = d_shifted;
      old_f = f_shifted;
      old_h = h_shifted;

      __m128i sad_ab     = _mm_sad_epu8(a, b_unread);
      __m128i sad_cd     = _mm_sad_epu8(c, d_unread);
      __m128i sad_ef     = _mm_sad_epu8(e, f_unread);
      __m128i sad_gh     = _mm_sad_epu8(g, h_unread);

      sse_inc = _mm_add_epi64(sse_inc, sad_ab);
      sse_inc = _mm_add_epi64(sse_inc, sad_cd);
      sse_inc = _mm_add_epi64(sse_inc, sad_ef);
      sse_inc = _mm_add_epi64(sse_inc, sad_gh);
    }
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m128i borderpx_vec = _mm_set1_epi8(ref_data[(int32_t)((y + 0) * ref_stride + border_off)]);
      for (x = 0; x < outside_vecs; x++) {
        __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + outvec_offset));

        __m128i startoffs  = _mm_set1_epi8  ((x + inside_vecs) << vec_width_log2);
        __m128i ns         = _mm_add_epi8   (startoffs, nslo);

        // unrd_mask marks the lanes at or past the block width (it is all
        // zeros when extending left): NOT((blk_widths > ns) OR is_left)
        __m128i unrd_imask = _mm_cmpgt_epi8 (blk_widths, ns);
                unrd_imask = _mm_or_si128   (unrd_imask, is_left);
        __m128i unrd_mask  = _mm_cmpeq_epi8 (unrd_imask, _mm_setzero_si128());
        __m128i b_unread   = _mm_blendv_epi8(borderpx_vec, a, unrd_mask);

        __m128i sad_ab     = _mm_sad_epu8   (a, b_unread);
        sse_inc = _mm_add_epi64(sse_inc, sad_ab);
      }
      int32_t a_off = outside_width & is_left_bm;
      int32_t leftoff_with_sign_neg = (left_offset ^ is_left_bm) - is_left_bm;

      __m128i old_b = borderpx_vec;
      for (x = invec_lstart; x != invec_lend; x += invec_linc) {
        __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + a_off));
        __m128i b = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 0) * ref_stride + a_off - leftoff_with_sign_neg));

        __m128i b_shifted    = _mm_shuffle_epi8(b,     shufmask1);
        __m128i b_with_old   = _mm_blendv_epi8 (old_b, b_shifted, move_old_to_b_imask);

        uint8_t startoff     = (x << vec_width_log2) + a_off;
        __m128i startoffs    = _mm_set1_epi8   (startoff);
        __m128i curr_ns      = _mm_add_epi8    (startoffs,    nslo);
        __m128i unrd_imask   = _mm_cmpgt_epi8  (blk_widths,   curr_ns);
        __m128i unrd_mask    = _mm_cmpeq_epi8  (unrd_imask,   _mm_setzero_si128());
        __m128i b_unread     = _mm_blendv_epi8 (b_with_old,   a, unrd_mask);

        old_b = b_shifted;

        __m128i sad_ab     = _mm_sad_epu8(a, b_unread);
        sse_inc = _mm_add_epi64(sse_inc, sad_ab);
      }
    }
  }
  __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad       = _mm_add_epi64    (sse_inc, sse_inc_2);
  return _mm_cvtsi128_si32(sad);
}

#endif // KVZ_BIT_DEPTH == 8

#endif // REG_SAD_POW2_WIDTHS_SSE41_H_