1 /*
2  * Copyright (c) 2019,2021 Genome Research Ltd.
3  * Author(s): James Bonfield
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *       Institute nor the names of its contributors may be used to endorse
18  *       or promote products derived from this software without specific
19  *       prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <string.h>
35 
36 /*
37  * Data transpose by N.  Common to rANS4x16 and arith_dynamic decoders.
38  *
39  * Tuned for specific common cases of N.
40  */
unstripe(unsigned char * out,unsigned char * outN,unsigned int ulen,unsigned int N,unsigned int idxN[256])41 static inline void unstripe(unsigned char *out, unsigned char *outN,
42 			    unsigned int ulen, unsigned int N,
43 			    unsigned int idxN[256]) {
44     int j = 0, k;
45 
46     if (ulen >= N) {
47 	switch (N) {
48 	case 4:
49 	    while (j < ulen-4) {
50 		for (k = 0; k < 4; k++)
51 		    out[j++] = outN[idxN[k]++];
52 	    }
53 	    break;
54 
55 	case 2:
56 	    while (j < ulen-2) {
57 		for (k = 0; k < 2; k++)
58 		    out[j++] = outN[idxN[k]++];
59 	    }
60 	    break;
61 
62 	default:
63 	    // General case, around 25% slower overall decode
64 	    while (j < ulen-N) {
65 		for (k = 0; k < N; k++)
66 		    out[j++] = outN[idxN[k]++];
67 	    }
68 	    break;
69 	}
70     }
71     for (k = 0; j < ulen; k++)
72 	out[j++] = outN[idxN[k]++];
73 }
74 
75 #define MAGIC 8
76 
77 /*
78  * Order 0 histogram construction.  8-way unrolled to avoid cache collisions.
79  */
80 static inline
hist8(unsigned char * in,unsigned int in_size,uint32_t F0[256])81 void hist8(unsigned char *in, unsigned int in_size, uint32_t F0[256]) {
82     uint32_t F1[256+MAGIC] = {0}, F2[256+MAGIC] = {0}, F3[256+MAGIC] = {0};
83     uint32_t F4[256+MAGIC] = {0}, F5[256+MAGIC] = {0}, F6[256+MAGIC] = {0};
84     uint32_t F7[256+MAGIC] = {0};
85 
86     unsigned int i, i8 = in_size & ~7;
87     for (i = 0; i < i8; i+=8) {
88 	F0[in[i+0]]++;
89 	F1[in[i+1]]++;
90 	F2[in[i+2]]++;
91 	F3[in[i+3]]++;
92 	F4[in[i+4]]++;
93 	F5[in[i+5]]++;
94 	F6[in[i+6]]++;
95 	F7[in[i+7]]++;
96     }
97     while (i < in_size)
98 	F0[in[i++]]++;
99 
100     for (i = 0; i < 256; i++)
101 	F0[i] += F1[i] + F2[i] + F3[i] + F4[i] + F5[i] + F6[i] + F7[i];
102 }
103 
104 /*
105  * A variant of hist8 that simply marks the presence of a symbol rather
106  * than its frequency.
107  */
108 static inline
present8(unsigned char * in,unsigned int in_size,uint32_t F0[256])109 void present8(unsigned char *in, unsigned int in_size,
110 	      uint32_t F0[256]) {
111     uint32_t F1[256+MAGIC] = {0}, F2[256+MAGIC] = {0}, F3[256+MAGIC] = {0};
112     uint32_t F4[256+MAGIC] = {0}, F5[256+MAGIC] = {0}, F6[256+MAGIC] = {0};
113     uint32_t F7[256+MAGIC] = {0};
114 
115     unsigned int i, i8 = in_size & ~7;
116     for (i = 0; i < i8; i+=8) {
117 	F0[in[i+0]]=1;
118 	F1[in[i+1]]=1;
119 	F2[in[i+2]]=1;
120 	F3[in[i+3]]=1;
121 	F4[in[i+4]]=1;
122 	F5[in[i+5]]=1;
123 	F6[in[i+6]]=1;
124 	F7[in[i+7]]=1;
125     }
126     while (i < in_size)
127 	F0[in[i++]]=1;
128 
129     for (i = 0; i < 256; i++)
130 	F0[i] += F1[i] + F2[i] + F3[i] + F4[i] + F5[i] + F6[i] + F7[i];
131 }
132 
133 /*
134  * Order 1 histogram construction.  4-way unrolled to avoid cache collisions.
135  */
136 static inline
hist1_4(unsigned char * in,unsigned int in_size,uint32_t F0[256][256],uint32_t * T0)137 void hist1_4(unsigned char *in, unsigned int in_size,
138 	     uint32_t F0[256][256], uint32_t *T0) {
139     uint32_t T1[256+MAGIC] = {0}, T2[256+MAGIC] = {0}, T3[256+MAGIC] = {0};
140 
141     unsigned char l = 0, c;
142     unsigned char *in_end = in + in_size;
143 
144     unsigned char cc[5] = {0};
145     if (in_size > 500000) {
146 	uint32_t F1[256][259] = {{0}};
147 	while (in < in_end-8) {
148 	    memcpy(cc, in, 4); in += 4;
149 	    T0[cc[4]]++; F0[cc[4]][cc[0]]++;
150 	    T1[cc[0]]++; F1[cc[0]][cc[1]]++;
151 	    T2[cc[1]]++; F0[cc[1]][cc[2]]++;
152 	    T3[cc[2]]++; F1[cc[2]][cc[3]]++;
153 	    cc[4] = cc[3];
154 
155 	    memcpy(cc, in, 4); in += 4;
156 	    T0[cc[4]]++; F0[cc[4]][cc[0]]++;
157 	    T1[cc[0]]++; F1[cc[0]][cc[1]]++;
158 	    T2[cc[1]]++; F0[cc[1]][cc[2]]++;
159 	    T3[cc[2]]++; F1[cc[2]][cc[3]]++;
160 	    cc[4] = cc[3];
161 	}
162 	l = cc[3];
163 
164 	while (in < in_end) {
165 	    F0[l][c = *in++]++;
166 	    T0[l]++;
167 	    l = c;
168 	}
169 
170 	int i, j;
171 	for (i = 0; i < 256; i++)
172 	    for (j = 0; j < 256; j++)
173 		F0[i][j] += F1[i][j];
174     } else {
175 	while (in < in_end-8) {
176 	    memcpy(cc, in, 4); in += 4;
177 	    T0[cc[4]]++; F0[cc[4]][cc[0]]++;
178 	    T1[cc[0]]++; F0[cc[0]][cc[1]]++;
179 	    T2[cc[1]]++; F0[cc[1]][cc[2]]++;
180 	    T3[cc[2]]++; F0[cc[2]][cc[3]]++;
181 	    cc[4] = cc[3];
182 
183 	    memcpy(cc, in, 4); in += 4;
184 	    T0[cc[4]]++; F0[cc[4]][cc[0]]++;
185 	    T1[cc[0]]++; F0[cc[0]][cc[1]]++;
186 	    T2[cc[1]]++; F0[cc[1]][cc[2]]++;
187 	    T3[cc[2]]++; F0[cc[2]][cc[3]]++;
188 	    cc[4] = cc[3];
189 	}
190 	l = cc[3];
191 
192 	while (in < in_end) {
193 	    F0[l][c = *in++]++;
194 	    T0[l]++;
195 	    l = c;
196 	}
197     }
198 
199     int i;
200     for (i = 0; i < 256; i++)
201 	T0[i]+=T1[i]+T2[i]+T3[i];
202 }
203