1 /* This code is part of the tng compression routines.
2 *
3 * Written by Daniel Spangberg
4 * Copyright (c) 2010, 2013, The GROMACS development team.
5 *
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the Revised BSD License.
9 */
10
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include "../../include/compression/warnmalloc.h"
16 #include "../../include/compression/bwt.h"
17 #include "../../include/compression/lz77.h"
18
19 /* This is a simple Lempel-Ziv-77 compressor. It has not been set up
20 to remove every possible repeating pattern, but it might be better
21 than simple RLE.
22
23 Lempel-Ziv 77 with separate outputs for length, data, and offsets.
24 */
25
26 #if 0
27 /* Sort the strings (similar to BWT) to find similar patterns in the
28 input data. The output is an array with two values for each
29 input value. The sorted index value is the first in each doublet.
30 The second value is the "inverse" of the first value and can be
31 used to find the locations of similar strings. */
32 static void sort_strings(unsigned int *vals, int nvals,
33 unsigned int *output)
34 {
35 int i;
36 int *indices=warnmalloc(2*nvals*sizeof *indices);
37 unsigned int *nrepeat=warnmalloc(nvals*sizeof *nrepeat);
38 int *warr=indices+nvals;
39
40 if (nvals>0xFFFFFF)
41 {
42 fprintf(stderr,"BWT cannot pack more than %d values.\n",0xFFFFFF);
43 exit(1);
44 }
45
46 /* Also note that repeat pattern k (kmax) cannot be larger than 255. */
47 for (i=0; i<nvals; i++)
48 indices[i]=i;
49 /* Find the length of the initial repeating pattern for the strings. */
50 /* First mark that the index does not have a found repeating string. */
51 for (i=0; i<nvals; i++)
52 nrepeat[i]=0U;
53 for (i=0; i<nvals; i++)
54 {
55 /* If we have not already found a repeating string we must find
56 it. */
57 if (!nrepeat[i])
58 {
59 int maxrepeat=nvals*2;
60 int j,k,m;
61 int good_j=-1, good_k=0;
62 int kmax=16;
63 /* Track repeating patterns.
64 k=1 corresponds to AAAAA...
65 k=2 corresponds to ABABAB...
66 k=3 corresponds to ABCABCABCABC...
67 k=4 corresponds to ABCDABCDABCD...
68 etc. */
69 for (k=kmax; k>=1; k--)
70 {
71 try_next_k:
72 if (k>=1)
73 {
74 for (j=k; j<maxrepeat; j+=k)
75 {
76 int is_equal=1;
77 for (m=0; m<k; m++)
78 if (vals[(i+m)%nvals]!=vals[(i+j+m)%nvals])
79 {
80 is_equal=0;
81 break;
82 }
83 if (is_equal)
84 {
85 int new_j=j+k;
86 if (new_j>maxrepeat)
87 new_j=j;
88 if ((new_j>good_j) || ((new_j==good_j) && (k<good_k)))
89 {
90 good_j=new_j; /* We have found that
91 the strings repeat
92 for this length... */
93 good_k=k; /* ...and with this
94 length of the
95 repeating
96 pattern. */
97 }
98 }
99 else
100 {
101 /* We know that it is no point in trying
102 with more than m */
103 if (j==0)
104 {
105 k=m;
106 }
107 else
108 k--;
109 goto try_next_k;
110 }
111 }
112 }
113 }
114 /* From good_j and good_k we know the repeat for a large
115 number of strings. The very last repeat length should not
116 be assigned, since it can be much longer if a new test is
117 done. */
118 for (m=0; (m+good_k<good_j) && (i+m<nvals); m+=good_k)
119 {
120 int repeat=good_j-m;
121 if (repeat>nvals)
122 repeat=nvals;
123 nrepeat[i+m]=((unsigned int) (good_k)) | (((unsigned int) (repeat))<<8);
124 }
125 /* If no repetition was found for this value signal that here. */
126 if (!nrepeat[i])
127 nrepeat[i+m]=257U; /* This is 1<<8 | 1 */
128 }
129 }
130
131 /* Sort cyclic shift matrix. */
132 Ptngc_bwt_merge_sort_inner(indices,nvals,vals,0,nvals,nrepeat,warr);
133
134 /* Form output. */
135 for (i=0; i<nvals; i++)
136 {
137 output[i*2]=indices[i];
138 output[indices[i]*2+1]=i;
139 }
140 free(nrepeat);
141 free(indices);
142 }
143 #endif
144
145 #define NUM_PREVIOUS 4
146 #define MAX_LEN 0xFFFF
147 #define MAX_OFFSET 0xFFFF
148 #define MAX_STRING_SEARCH 8
149
add_circular(int * previous,const int v,const int i)150 static void add_circular(int *previous, const int v, const int i)
151 {
152 if (previous[(NUM_PREVIOUS+3)*v+2]!=i-1)
153 {
154 previous[(NUM_PREVIOUS+3)*v]++;
155 if (previous[(NUM_PREVIOUS+3)*v]>NUM_PREVIOUS)
156 previous[(NUM_PREVIOUS+3)*v]=NUM_PREVIOUS;
157 previous[(NUM_PREVIOUS+3)*v+3+previous[(NUM_PREVIOUS+3)*v+1]]=i;
158 previous[(NUM_PREVIOUS+3)*v+1]++;
159 if (previous[(NUM_PREVIOUS+3)*v+1]>=NUM_PREVIOUS)
160 previous[(NUM_PREVIOUS+3)*v+1]=0;
161 }
162 previous[(NUM_PREVIOUS+3)*v+2]=i;
163 }
164
Ptngc_comp_to_lz77(unsigned int * vals,const int nvals,unsigned int * data,int * ndata,unsigned int * len,int * nlens,unsigned int * offsets,int * noffsets)165 void Ptngc_comp_to_lz77(unsigned int *vals, const int nvals,
166 unsigned int *data, int *ndata,
167 unsigned int *len, int *nlens,
168 unsigned int *offsets, int *noffsets)
169 {
170 int noff=0;
171 int ndat=0;
172 int nlen=0;
173 int i,j;
174 int *previous=warnmalloc(0x20000*(NUM_PREVIOUS+3)*sizeof *previous);
175 #if 0
176 unsigned int *info=warnmalloc(2*nvals*sizeof *info);
177 sort_strings(vals,nvals,info);
178 #endif
179 for (i=0; i<0x20000; i++)
180 {
181 previous[(NUM_PREVIOUS+3)*i]=0; /* Number of items in a circular buffer */
182 previous[(NUM_PREVIOUS+3)*i+1]=0; /* Pointer to beginning of circular buffer. */
183 previous[(NUM_PREVIOUS+3)*i+2]=-2; /* Last offset that had this value. -2 is really never... */
184 }
185 for (i=0; i<nvals; i++)
186 {
187 int k;
188 #if 0
189 int kmin,kmax;
190 #endif
191 int firstoffset=i-MAX_OFFSET;
192 if (firstoffset<0)
193 firstoffset=0;
194 if (i!=0)
195 {
196 int largest_len=0;
197 int largest_offset=0;
198 int icirc, ncirc;
199 /* Is this identical to a previous offset? Prefer close
200 values for offset. Search through circular buffer for the
201 possible values for the start of this string. */
202 ncirc=previous[(NUM_PREVIOUS+3)*vals[i]];
203 for (icirc=0; icirc<ncirc; icirc++)
204 {
205 int iptr=previous[(NUM_PREVIOUS+3)*vals[i]+1]-icirc-1;
206 if (iptr<0)
207 iptr+=NUM_PREVIOUS;
208 j=previous[(NUM_PREVIOUS+3)*vals[i]+3+iptr];
209 if (j<firstoffset)
210 break;
211 #if 0
212 fprintf(stderr,"Starting search for %d at %d. Found %d\n",vals[i],j,vals[j]);
213 #endif
214 while ((j<i) && (vals[j]==vals[i]))
215 {
216 if (j>=firstoffset)
217 {
218 for (k=0; i+k<nvals; k++)
219 if (vals[j+k]!=vals[i+k])
220 break;
221 if ((k>largest_len) && ((k>=(i-j)+16) || ((k>4) && (i-j==1))))
222 {
223 largest_len=k;
224 largest_offset=j;
225 }
226 }
227 j++;
228 }
229 }
230 #if 0
231 /* Search in sorted string buffer too. */
232 kmin=info[i*2+1]-MAX_STRING_SEARCH;
233 kmax=info[i*2+1]+MAX_STRING_SEARCH;
234 if (kmin<0)
235 kmin=0;
236 if (kmax>=nvals)
237 kmax=nvals;
238 for (k=kmin; k<kmax; k++)
239 {
240 int m;
241 int s=info[k*2];
242 if ((s<i) && (s+MAX_OFFSET>=i))
243 {
244 for (m=0; i+m<nvals; m++)
245 if (vals[i+m]!=vals[(s+m)%nvals])
246 break;
247 if ((m>largest_len) && (m>4) && (m+2>=(i-s)))
248 {
249 largest_len=m;
250 largest_offset=s;
251 #if 0
252 fprintf(stderr,"Offset: %d %d\n",m,i-s);
253 #endif
254 }
255 }
256 }
257 #endif
258 /* Check how to write this info. */
259 if (largest_len>MAX_LEN)
260 largest_len=MAX_LEN;
261 if (largest_len)
262 {
263 if (i-largest_offset==1)
264 {
265 data[ndat++]=0;
266 }
267 else
268 {
269 data[ndat++]=1;
270 offsets[noff++]=i-largest_offset;
271 }
272 len[nlen++]=largest_len;
273 #if 0
274 fprintf(stderr,"l:o: %d:%d data=%d i=%d\n",largest_len,i-largest_offset,ndat,i);
275 fflush(stderr);
276 #endif
277
278 #if 0
279 fprintf(stderr,"Found largest len %d at %d.\n",largest_len,i-largest_offset);
280 #endif
281 /* Add these values to the circular buffer. */
282 for (k=0; k<largest_len; k++)
283 add_circular(previous,vals[i+k],i+k);
284 i+=largest_len-1;
285 }
286 else
287 {
288 data[ndat++]=vals[i]+2;
289 /* Add this value to circular buffer. */
290 add_circular(previous,vals[i],i);
291 }
292 }
293 else
294 {
295 data[ndat++]=vals[i]+2;
296 /* Add this value to circular buffer. */
297 add_circular(previous,vals[i],i);
298 }
299 }
300 *noffsets=noff;
301 *ndata=ndat;
302 *nlens=nlen;
303 #if 0
304 free(info);
305 #endif
306 free(previous);
307 }
308
Ptngc_comp_from_lz77(unsigned int * data,const int ndata,unsigned int * len,const int nlens,unsigned int * offsets,const int noffsets,unsigned int * vals,const int nvals)309 void Ptngc_comp_from_lz77(unsigned int *data, const int ndata,
310 unsigned int *len, const int nlens,
311 unsigned int *offsets, const int noffsets,
312 unsigned int *vals, const int nvals)
313 {
314 int i=0;
315 int joff=0;
316 int jdat=0;
317 int jlen=0;
318 (void)ndata;
319 (void)nlens;
320 (void)noffsets;
321 while (i<nvals)
322 {
323 unsigned int v=data[jdat++];
324 if (v<2)
325 {
326 int offset=1;
327 int k;
328 int length=(int)len[jlen++];
329 #if 0
330 fprintf(stderr,"len=%d off=%d i=%d\n",length,offset,i);
331 #endif
332 if (v==1)
333 offset=offsets[joff++];
334 for (k=0; k<length; k++)
335 {
336 vals[i]=vals[i-offset];
337 if (i>=nvals)
338 {
339 fprintf(stderr,"too many vals.\n");
340 exit(EXIT_FAILURE);
341 }
342 i++;
343 }
344 }
345 else
346 vals[i++]=v-2;
347 }
348 }
349