1 /* adler32.c -- compute the Adler-32 checksum of a data stream
2  * Copyright (C) 1995-2011 Mark Adler
3  * Authors:
4  *   Brian Bockelman <bockelman@gmail.com>
5  * For conditions of distribution and use, see copyright notice in zlib.h
6  */
7 
8 #include "../../zbuild.h"
9 #include "../../zutil.h"
10 
11 #include "../../adler32_p.h"
12 
13 #ifdef X86_SSSE3_ADLER32
14 
15 #include <immintrin.h>
16 
/* SSSE3-accelerated Adler-32.
 *
 * Splits the checksum into its two 16-bit component sums (s1 = running byte
 * sum, s2 = sum of the running s1 values), processes the bulk of the buffer
 * 16 bytes at a time with SSSE3 multiply-add intrinsics, and recombines the
 * two components at the end.
 *
 * adler: checksum so far (use 1 to start a new checksum)
 * buf:   data to checksum; may be NULL (returns the initial value 1)
 * len:   number of bytes at buf
 * Returns the updated Adler-32 checksum.
 */
Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len) {
    uint32_t sum2;

    /* split Adler-32 into component sums */
    sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* in case user likes doing a byte at a time, keep it fast */
    if (UNLIKELY(len == 1))
        return adler32_len_1(adler, buf, sum2);

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (UNLIKELY(buf == NULL))
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (UNLIKELY(len < 16))
        return adler32_len_16(adler, buf, len, sum2);

    /* Vector accumulators: the scalar sums are seeded into lane 3 only;
       lanes 0-2 start at zero and collect partial sums. */
    uint32_t ALIGNED_(16) s1[4], s2[4];

    s1[0] = s1[1] = s1[2] = 0; s1[3] = adler;
    s2[0] = s2[1] = s2[2] = 0; s2[3] = sum2;

    /* all-ones byte weights: maddubs against dot1v yields pairwise byte sums */
    char ALIGNED_(16) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    __m128i dot1v = _mm_load_si128((__m128i*)dot1);
    /* descending weights 16..1: position-weighted byte sums for the s2 term */
    char ALIGNED_(16) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
    __m128i dot2v = _mm_load_si128((__m128i*)dot2);
    /* all-ones 16-bit weights: madd against dot3v folds 8 shorts into 4 int32 */
    short ALIGNED_(16) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1};
    __m128i dot3v = _mm_load_si128((__m128i*)dot3);

    /* Shift count 4, placed in the low 64 bits as _mm_sll_epi32 requires:
       left-shifting by 4 multiplies the previous s1 by 16, the number of
       bytes consumed per inner-loop iteration. */
    char ALIGNED_(16) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    __m128i shiftv = _mm_load_si128((__m128i*)shift);

    while (len >= 16) {
       __m128i vs1 = _mm_load_si128((__m128i*)s1);
       __m128i vs2 = _mm_load_si128((__m128i*)s2);
       __m128i vs1_0 = vs1;  /* s1 as it was before the current 16-byte chunk */

       /* Process at most NMAX bytes (rounded down to a multiple of 16)
          before reducing mod BASE, so the 32-bit lanes cannot overflow. */
       int k = (len < NMAX ? (int)len : NMAX);
       k -= k % 16;
       len -= k;

       while (k >= 16) {
           /*
              vs1 = adler + sum(c[i])
              vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )

              NOTE: 256-bit equivalents are:
                _mm256_maddubs_epi16 <- operates on 32 bytes to 16 shorts
                _mm256_madd_epi16    <- Sums 16 shorts to 8 int32_t.
              We could rewrite the below to use 256-bit instructions instead of 128-bit.
           */
           __m128i vbuf = _mm_loadu_si128((__m128i*)buf);
           buf += 16;
           k -= 16;

           __m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
           __m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v);  // sum 8 shorts to 4 int32_t;
           __m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
           vs1 = _mm_add_epi32(vsum1, vs1);
           __m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
           vs1_0 = _mm_sll_epi32(vs1_0, shiftv);  // 16 * (s1 before this chunk)
           vsum2 = _mm_add_epi32(vsum2, vs2);
           vs2   = _mm_add_epi32(vsum2, vs1_0);
           vs1_0 = vs1;  // carry the updated s1 into the next iteration
       }

       // At this point, we have partial sums stored in vs1 and vs2.  There are AVX512 instructions that
       // would allow us to sum these quickly (VP4DPWSSD).  For now, just unpack and move on.

       uint32_t ALIGNED_(16) s1_unpack[4];
       uint32_t ALIGNED_(16) s2_unpack[4];

       _mm_store_si128((__m128i*)s1_unpack, vs1);
       _mm_store_si128((__m128i*)s2_unpack, vs2);

       /* Reduce each lane mod BASE before the horizontal add so the sum of
          four reduced lanes cannot overflow 32 bits, then fold once more. */
       adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE);
       adler %= BASE;
       s1[3] = adler;  /* re-seed lane 3 for the next outer iteration */

       sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE);
       sum2 %= BASE;
       s2[3] = sum2;
    }

    /* scalar loop for the remaining (< 16) tail bytes */
    while (len) {
        len--;
        adler += *buf++;
        sum2 += adler;
    }
    adler %= BASE;
    sum2 %= BASE;

    /* return recombined sums */
    return adler | (sum2 << 16);
}
117 
118 #endif
119