/*
 *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"

#include "libyuv/compare_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

#if defined(__x86_64__)
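// Hamming distance = number of bits that differ between the two buffers.
// XORs 8 bytes at a time and counts the differing bits with popcnt. Four
// independent accumulators (%3, r8, r9, r10) are used so the popcnt
// results do not serialize on a single register; they are summed after
// the loop. The loop always reads full 32-byte blocks, so count is
// expected to be a positive multiple of 32.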
uint32_t HammingDistance_SSE42(const uint8_t* src_a,
                               const uint8_t* src_b,
                               int count) {
  uint64_t diff = 0u;

  asm volatile(
      "xor         %3,%3                         \n"
      "xor         %%r8,%%r8                     \n"
      "xor         %%r9,%%r9                     \n"
      "xor         %%r10,%%r10                   \n"

      // Process 32 bytes per loop.
      LABELALIGN
      "1:                                        \n"
      "mov         (%0),%%rcx                    \n"
      "mov         0x8(%0),%%rdx                 \n"
      "xor         (%1),%%rcx                    \n"
      "xor         0x8(%1),%%rdx                 \n"
      "popcnt      %%rcx,%%rcx                   \n"
      "popcnt      %%rdx,%%rdx                   \n"
      "mov         0x10(%0),%%rsi                \n"
      "mov         0x18(%0),%%rdi                \n"
      "xor         0x10(%1),%%rsi                \n"
      "xor         0x18(%1),%%rdi                \n"
      "popcnt      %%rsi,%%rsi                   \n"
      "popcnt      %%rdi,%%rdi                   \n"
      "add         $0x20,%0                      \n"
      "add         $0x20,%1                      \n"
      "add         %%rcx,%3                      \n"
      "add         %%rdx,%%r8                    \n"
      "add         %%rsi,%%r9                    \n"
      "add         %%rdi,%%r10                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"

      "add         %%r8, %3                      \n"
      "add         %%r9, %3                      \n"
      "add         %%r10, %3                     \n"
      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
        "=r"(diff)    // %3
      :
      : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");

  return static_cast<uint32_t>(diff);
}
#else
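// 32-bit variant of the same XOR + popcnt approach, using 32-bit
// registers and a single accumulator, 16 bytes per loop iteration.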
uint32_t HammingDistance_SSE42(const uint8_t* src_a,
                               const uint8_t* src_b,
                               int count) {
  uint32_t diff = 0u;

  asm volatile(
      // Process 16 bytes per loop.
      LABELALIGN
      "1:                                        \n"
      "mov         (%0),%%ecx                    \n"
      "mov         0x4(%0),%%edx                 \n"
      "xor         (%1),%%ecx                    \n"
      "xor         0x4(%1),%%edx                 \n"
      "popcnt      %%ecx,%%ecx                   \n"
      "add         %%ecx,%3                      \n"
      "popcnt      %%edx,%%edx                   \n"
      "add         %%edx,%3                      \n"
      "mov         0x8(%0),%%ecx                 \n"
      "mov         0xc(%0),%%edx                 \n"
      "xor         0x8(%1),%%ecx                 \n"
      "xor         0xc(%1),%%edx                 \n"
      "popcnt      %%ecx,%%ecx                   \n"
      "add         %%ecx,%3                      \n"
      "popcnt      %%edx,%%edx                   \n"
      "add         %%edx,%3                      \n"
      "add         $0x10,%0                      \n"
      "add         $0x10,%1                      \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
        "+r"(diff)    // %3
      :
      : "memory", "cc", "ecx", "edx");

  return diff;
}
#endif

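// Tables for a pshufb-based popcount: kNibbleMask extracts the low 4 bits
// of each byte, and kBitCount maps each possible nibble value (0..15) to
// the number of bits set in it.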
static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
                                 15, 15, 15, 15, 15, 15, 15, 15};
static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};

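// SSSE3 Hamming distance: XORs 16-byte blocks, then counts bits per byte
// with two pshufb lookups into kBitCount (one per nibble) and uses psadbw
// against zero to sum the per-byte counts. "sub %0,%1" turns src_b into
// an offset from src_a so one register indexes both buffers. movdqa and
// the SSE memory operands require both pointers to be 16-byte aligned;
// count is expected to be a positive multiple of 32.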
uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
                               const uint8_t* src_b,
                               int count) {
  uint32_t diff = 0u;

  asm volatile(
      "movdqa      %4,%%xmm2                     \n"
      "movdqa      %5,%%xmm3                     \n"
      "pxor        %%xmm0,%%xmm0                 \n"
      "pxor        %%xmm1,%%xmm1                 \n"
      "sub         %0,%1                         \n"

      LABELALIGN
      "1:                                        \n"
      "movdqa      (%0),%%xmm4                   \n"
      "movdqa      0x10(%0), %%xmm5              \n"
      "pxor        (%0,%1), %%xmm4               \n"
      "movdqa      %%xmm4,%%xmm6                 \n"
      "pand        %%xmm2,%%xmm6                 \n"
      "psrlw       $0x4,%%xmm4                   \n"
      "movdqa      %%xmm3,%%xmm7                 \n"
      "pshufb      %%xmm6,%%xmm7                 \n"
      "pand        %%xmm2,%%xmm4                 \n"
      "movdqa      %%xmm3,%%xmm6                 \n"
      "pshufb      %%xmm4,%%xmm6                 \n"
      "paddb       %%xmm7,%%xmm6                 \n"
      "pxor        0x10(%0,%1),%%xmm5            \n"
      "add         $0x20,%0                      \n"
      "movdqa      %%xmm5,%%xmm4                 \n"
      "pand        %%xmm2,%%xmm5                 \n"
      "psrlw       $0x4,%%xmm4                   \n"
      "movdqa      %%xmm3,%%xmm7                 \n"
      "pshufb      %%xmm5,%%xmm7                 \n"
      "pand        %%xmm2,%%xmm4                 \n"
      "movdqa      %%xmm3,%%xmm5                 \n"
      "pshufb      %%xmm4,%%xmm5                 \n"
      "paddb       %%xmm7,%%xmm5                 \n"
      "paddb       %%xmm5,%%xmm6                 \n"
      "psadbw      %%xmm1,%%xmm6                 \n"
      "paddd       %%xmm6,%%xmm0                 \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"

      "pshufd      $0xaa,%%xmm0,%%xmm1           \n"
      "paddd       %%xmm1,%%xmm0                 \n"
      "movd        %%xmm0, %3                    \n"
      : "+r"(src_a),       // %0
        "+r"(src_b),       // %1
        "+r"(count),       // %2
        "=r"(diff)         // %3
      : "m"(kNibbleMask),  // %4
        "m"(kBitCount)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");

  return diff;
}

#ifdef HAS_HAMMINGDISTANCE_AVX2
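// AVX2 Hamming distance: the same nibble-LUT popcount as the SSSE3
// version, widened to 32-byte registers, processing 64 bytes per loop
// iteration. The vpermq/vpaddd sequence at the end folds the four 64-bit
// psadbw partial sums into the low lane.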
uint32_t HammingDistance_AVX2(const uint8_t* src_a,
                              const uint8_t* src_b,
                              int count) {
  uint32_t diff = 0u;

  asm volatile(
      "vbroadcastf128 %4,%%ymm2                  \n"
      "vbroadcastf128 %5,%%ymm3                  \n"
      "vpxor       %%ymm0,%%ymm0,%%ymm0          \n"
      "vpxor       %%ymm1,%%ymm1,%%ymm1          \n"
      "sub         %0,%1                         \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqa     (%0),%%ymm4                   \n"
      "vmovdqa     0x20(%0), %%ymm5              \n"
      "vpxor       (%0,%1), %%ymm4, %%ymm4       \n"
      "vpand       %%ymm2,%%ymm4,%%ymm6          \n"
      "vpsrlw      $0x4,%%ymm4,%%ymm4            \n"
      "vpshufb     %%ymm6,%%ymm3,%%ymm6          \n"
      "vpand       %%ymm2,%%ymm4,%%ymm4          \n"
      "vpshufb     %%ymm4,%%ymm3,%%ymm4          \n"
      "vpaddb      %%ymm4,%%ymm6,%%ymm6          \n"
      "vpxor       0x20(%0,%1),%%ymm5,%%ymm4     \n"
      "add         $0x40,%0                      \n"
      "vpand       %%ymm2,%%ymm4,%%ymm5          \n"
      "vpsrlw      $0x4,%%ymm4,%%ymm4            \n"
      "vpshufb     %%ymm5,%%ymm3,%%ymm5          \n"
      "vpand       %%ymm2,%%ymm4,%%ymm4          \n"
      "vpshufb     %%ymm4,%%ymm3,%%ymm4          \n"
      "vpaddb      %%ymm5,%%ymm4,%%ymm4          \n"
      "vpaddb      %%ymm6,%%ymm4,%%ymm4          \n"
      "vpsadbw     %%ymm1,%%ymm4,%%ymm4          \n"
      "vpaddd      %%ymm0,%%ymm4,%%ymm0          \n"
      "sub         $0x40,%2                      \n"
      "jg          1b                            \n"

      "vpermq      $0xb1,%%ymm0,%%ymm1           \n"
      "vpaddd      %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xaa,%%ymm0,%%ymm1           \n"
      "vpaddd      %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovd       %%xmm0, %3                    \n"
      "vzeroupper                                \n"
      : "+r"(src_a),       // %0
        "+r"(src_b),       // %1
        "+r"(count),       // %2
        "=r"(diff)         // %3
      : "m"(kNibbleMask),  // %4
        "m"(kBitCount)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");

  return diff;
}
#endif  // HAS_HAMMINGDISTANCE_AVX2

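// Sum of squared differences: computes |a - b| per byte as the OR of the
// two saturated subtractions, widens the absolute differences to 16 bits,
// then squares and pairwise-adds them into 32-bit lanes with pmaddwd.
// Unaligned loads (movdqu) are used, so no pointer alignment is required;
// count is expected to be a positive multiple of 16.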
uint32_t SumSquareError_SSE2(const uint8_t* src_a,
                             const uint8_t* src_b,
                             int count) {
  uint32_t sse;
  asm volatile(
      "pxor        %%xmm0,%%xmm0                 \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm1                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqu      (%1),%%xmm2                   \n"
      "lea         0x10(%1),%1                   \n"
      "movdqa      %%xmm1,%%xmm3                 \n"
      "psubusb     %%xmm2,%%xmm1                 \n"
      "psubusb     %%xmm3,%%xmm2                 \n"
      "por         %%xmm2,%%xmm1                 \n"
      "movdqa      %%xmm1,%%xmm2                 \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "punpckhbw   %%xmm5,%%xmm2                 \n"
      "pmaddwd     %%xmm1,%%xmm1                 \n"
      "pmaddwd     %%xmm2,%%xmm2                 \n"
      "paddd       %%xmm1,%%xmm0                 \n"
      "paddd       %%xmm2,%%xmm0                 \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"

      "pshufd      $0xee,%%xmm0,%%xmm1           \n"
      "paddd       %%xmm1,%%xmm0                 \n"
      "pshufd      $0x1,%%xmm0,%%xmm1            \n"
      "paddd       %%xmm1,%%xmm0                 \n"
      "movd        %%xmm0,%3                     \n"
      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
        "=g"(sse)     // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  return sse;
}

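// Constants for a vectorized djb2 hash (hash = hash * 33 + byte). Each
// 16-byte block multiplies the running hash by 33^16 and adds every byte
// scaled by a descending power of 33; the tables below hold those powers
// modulo 2^32. A scalar sketch of the recurrence being vectorized (an
// illustrative helper, not part of this file):
//
//   uint32_t HashDjb2_Scalar(const uint8_t* src, int count, uint32_t seed) {
//     uint32_t hash = seed;
//     for (int i = 0; i < count; ++i) {
//       hash = hash * 33u + src[i];
//     }
//     return hash;
//   }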
static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
static const uvec32 kHashMul0 = {
    0x0c3525e1,  // 33 ^ 15
    0xa3476dc1,  // 33 ^ 14
    0x3b4039a1,  // 33 ^ 13
    0x4f5f0981,  // 33 ^ 12
};
static const uvec32 kHashMul1 = {
    0x30f35d61,  // 33 ^ 11
    0x855cb541,  // 33 ^ 10
    0x040a9121,  // 33 ^ 9
    0x747c7101,  // 33 ^ 8
};
static const uvec32 kHashMul2 = {
    0xec41d4e1,  // 33 ^ 7
    0x4cfa3cc1,  // 33 ^ 6
    0x025528a1,  // 33 ^ 5
    0x00121881,  // 33 ^ 4
};
static const uvec32 kHashMul3 = {
    0x00008c61,  // 33 ^ 3
    0x00000441,  // 33 ^ 2
    0x00000021,  // 33 ^ 1
    0x00000001,  // 33 ^ 0
};

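// SSE4.1 djb2 hash: pmulld advances the running hash by 33^16 per
// 16-byte block, the 16 input bytes are widened to 32 bits and multiplied
// by the kHashMul0..kHashMul3 power tables, and the products are summed
// into the low lane and added to the hash. count is expected to be a
// positive multiple of 16.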
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
  uint32_t hash;
  asm volatile(
      "movd        %2,%%xmm0                     \n"
      "pxor        %%xmm7,%%xmm7                 \n"
      "movdqa      %4,%%xmm6                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm1                   \n"
      "lea         0x10(%0),%0                   \n"
      "pmulld      %%xmm6,%%xmm0                 \n"
      "movdqa      %5,%%xmm5                     \n"
      "movdqa      %%xmm1,%%xmm2                 \n"
      "punpcklbw   %%xmm7,%%xmm2                 \n"
      "movdqa      %%xmm2,%%xmm3                 \n"
      "punpcklwd   %%xmm7,%%xmm3                 \n"
      "pmulld      %%xmm5,%%xmm3                 \n"
      "movdqa      %6,%%xmm5                     \n"
      "movdqa      %%xmm2,%%xmm4                 \n"
      "punpckhwd   %%xmm7,%%xmm4                 \n"
      "pmulld      %%xmm5,%%xmm4                 \n"
      "movdqa      %7,%%xmm5                     \n"
      "punpckhbw   %%xmm7,%%xmm1                 \n"
      "movdqa      %%xmm1,%%xmm2                 \n"
      "punpcklwd   %%xmm7,%%xmm2                 \n"
      "pmulld      %%xmm5,%%xmm2                 \n"
      "movdqa      %8,%%xmm5                     \n"
      "punpckhwd   %%xmm7,%%xmm1                 \n"
      "pmulld      %%xmm5,%%xmm1                 \n"
      "paddd       %%xmm4,%%xmm3                 \n"
      "paddd       %%xmm2,%%xmm1                 \n"
      "paddd       %%xmm3,%%xmm1                 \n"
      "pshufd      $0xe,%%xmm1,%%xmm2            \n"
      "paddd       %%xmm2,%%xmm1                 \n"
      "pshufd      $0x1,%%xmm1,%%xmm2            \n"
      "paddd       %%xmm2,%%xmm1                 \n"
      "paddd       %%xmm1,%%xmm0                 \n"
      "sub         $0x10,%1                      \n"
      "jg          1b                            \n"
      "movd        %%xmm0,%3                     \n"
      : "+r"(src),        // %0
        "+r"(count),      // %1
        "+rm"(seed),      // %2
        "=g"(hash)        // %3
      : "m"(kHash16x33),  // %4
        "m"(kHashMul0),   // %5
        "m"(kHashMul1),   // %6
        "m"(kHashMul2),   // %7
        "m"(kHashMul3)    // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
  return hash;
}
#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif