1 #include "compressed_seq.h"
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <assert.h>
5 #include <limits.h>
6 #include <string.h>
7 
8 #include "bitbool.h"
9 
10 // #define DEBUG
11 #include "debug.h"
12 
compressed_seq_i_log2(cmph_uint32 x)13 static inline cmph_uint32 compressed_seq_i_log2(cmph_uint32 x)
14 {
15 	register cmph_uint32 res = 0;
16 
17 	while(x > 1)
18 	{
19 		x >>= 1;
20 		res++;
21 	}
22 	return res;
23 };
24 
compressed_seq_init(compressed_seq_t * cs)25 void compressed_seq_init(compressed_seq_t * cs)
26 {
27 	select_init(&cs->sel);
28 	cs->n = 0;
29 	cs->rem_r = 0;
30 	cs->length_rems = 0;
31 	cs->total_length = 0;
32 	cs->store_table = 0;
33 }
34 
compressed_seq_destroy(compressed_seq_t * cs)35 void compressed_seq_destroy(compressed_seq_t * cs)
36 {
37 	free(cs->store_table);
38 	cs->store_table = 0;
39 	free(cs->length_rems);
40 	cs->length_rems = 0;
41 	select_destroy(&cs->sel);
42 };
43 
44 
compressed_seq_generate(compressed_seq_t * cs,cmph_uint32 * vals_table,cmph_uint32 n)45 void compressed_seq_generate(compressed_seq_t * cs, cmph_uint32 * vals_table, cmph_uint32 n)
46 {
47 	register cmph_uint32 i;
48 	// lengths: represents lengths of encoded values
49 	register cmph_uint32 * lengths = (cmph_uint32 *)calloc(n, sizeof(cmph_uint32));
50 	register cmph_uint32 rems_mask;
51 	register cmph_uint32 stored_value;
52 
53 	cs->n = n;
54 	cs->total_length = 0;
55 
56 	for(i = 0; i < cs->n; i++)
57 	{
58 		if(vals_table[i] == 0)
59 		{
60 			lengths[i] = 0;
61 		}
62 		else
63 		{
64 			lengths[i] = compressed_seq_i_log2(vals_table[i] + 1);
65 			cs->total_length += lengths[i];
66 		};
67 	};
68 
69 	if(cs->store_table)
70 	{
71 		free(cs->store_table);
72 	}
73 	cs->store_table = (cmph_uint32 *) calloc(((cs->total_length + 31) >> 5), sizeof(cmph_uint32));
74 	cs->total_length = 0;
75 
76 	for(i = 0; i < cs->n; i++)
77 	{
78 		if(vals_table[i] == 0)
79 			continue;
80 		stored_value = vals_table[i] - ((1U << lengths[i]) - 1U);
81 		set_bits_at_pos(cs->store_table, cs->total_length, stored_value, lengths[i]);
82 		cs->total_length += lengths[i];
83 	};
84 
85 	cs->rem_r = compressed_seq_i_log2(cs->total_length/cs->n);
86 
87 	if(cs->rem_r == 0)
88 	{
89 		cs->rem_r = 1;
90 	}
91 
92 	if(cs->length_rems)
93 	{
94 		free(cs->length_rems);
95 	}
96 
97 	cs->length_rems = (cmph_uint32 *) calloc(BITS_TABLE_SIZE(cs->n, cs->rem_r), sizeof(cmph_uint32));
98 
99 	rems_mask = (1U << cs->rem_r) - 1U;
100 	cs->total_length = 0;
101 
102 	for(i = 0; i < cs->n; i++)
103 	{
104 		cs->total_length += lengths[i];
105 		set_bits_value(cs->length_rems, i, cs->total_length & rems_mask, cs->rem_r, rems_mask);
106 		lengths[i] = cs->total_length >> cs->rem_r;
107 	};
108 
109 	select_init(&cs->sel);
110 
111 	// FABIANO: before it was (cs->total_length >> cs->rem_r) + 1. But I wiped out the + 1 because
112 	// I changed the select structure to work up to m, instead of up to m - 1.
113 	select_generate(&cs->sel, lengths, cs->n, (cs->total_length >> cs->rem_r));
114 
115 	free(lengths);
116 };
117 
compressed_seq_get_space_usage(compressed_seq_t * cs)118 cmph_uint32 compressed_seq_get_space_usage(compressed_seq_t * cs)
119 {
120 	register cmph_uint32 space_usage = select_get_space_usage(&cs->sel);
121 	space_usage += ((cs->total_length + 31) >> 5) * (cmph_uint32)sizeof(cmph_uint32) * 8;
122 	space_usage += BITS_TABLE_SIZE(cs->n, cs->rem_r) * (cmph_uint32)sizeof(cmph_uint32) * 8;
123 	return  4 * (cmph_uint32)sizeof(cmph_uint32) * 8 + space_usage;
124 }
125 
compressed_seq_query(compressed_seq_t * cs,cmph_uint32 idx)126 cmph_uint32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx)
127 {
128 	register cmph_uint32 enc_idx, enc_length;
129 	register cmph_uint32 rems_mask;
130 	register cmph_uint32 stored_value;
131 	register cmph_uint32 sel_res;
132 
133 	assert(idx < cs->n); // FABIANO ADDED
134 
135 	rems_mask = (1U << cs->rem_r) - 1U;
136 
137 	if(idx == 0)
138 	{
139 		enc_idx = 0;
140 		sel_res = select_query(&cs->sel, idx);
141 	}
142 	else
143 	{
144 		sel_res = select_query(&cs->sel, idx - 1);
145 
146 		enc_idx = (sel_res - (idx - 1)) << cs->rem_r;
147 		enc_idx += get_bits_value(cs->length_rems, idx-1, cs->rem_r, rems_mask);
148 
149 		sel_res = select_next_query(&cs->sel, sel_res);
150 	};
151 
152 	enc_length = (sel_res - idx) << cs->rem_r;
153 	enc_length += get_bits_value(cs->length_rems, idx, cs->rem_r, rems_mask);
154 	enc_length -= enc_idx;
155 	if(enc_length == 0)
156 		return 0;
157 
158 	stored_value = get_bits_at_pos(cs->store_table, enc_idx, enc_length);
159 	return stored_value + ((1U << enc_length) - 1U);
160 };
161 
compressed_seq_dump(compressed_seq_t * cs,char ** buf,cmph_uint32 * buflen)162 void compressed_seq_dump(compressed_seq_t * cs, char ** buf, cmph_uint32 * buflen)
163 {
164 	register cmph_uint32 sel_size = select_packed_size(&(cs->sel));
165 	register cmph_uint32 length_rems_size = BITS_TABLE_SIZE(cs->n, cs->rem_r) * 4;
166 	register cmph_uint32 store_table_size = ((cs->total_length + 31) >> 5) * 4;
167 	register cmph_uint32 pos = 0;
168 	char * buf_sel = 0;
169 	cmph_uint32 buflen_sel = 0;
170 
171 	*buflen = 4*(cmph_uint32)sizeof(cmph_uint32) + sel_size +  length_rems_size + store_table_size;
172 
173 	DEBUGP("sel_size = %u\n", sel_size);
174 	DEBUGP("length_rems_size = %u\n", length_rems_size);
175 	DEBUGP("store_table_size = %u\n", store_table_size);
176 	*buf = (char *)calloc(*buflen, sizeof(char));
177 
178 	if (!*buf)
179 	{
180 		*buflen = UINT_MAX;
181 		return;
182 	}
183 
184 	// dumping n, rem_r and total_length
185 	memcpy(*buf, &(cs->n), sizeof(cmph_uint32));
186 	pos += (cmph_uint32)sizeof(cmph_uint32);
187 	DEBUGP("n = %u\n", cs->n);
188 
189 	memcpy(*buf + pos, &(cs->rem_r), sizeof(cmph_uint32));
190 	pos += (cmph_uint32)sizeof(cmph_uint32);
191 	DEBUGP("rem_r = %u\n", cs->rem_r);
192 
193 	memcpy(*buf + pos, &(cs->total_length), sizeof(cmph_uint32));
194 	pos += (cmph_uint32)sizeof(cmph_uint32);
195 	DEBUGP("total_length = %u\n", cs->total_length);
196 
197 
198 	// dumping sel
199 	select_dump(&cs->sel, &buf_sel, &buflen_sel);
200 	memcpy(*buf + pos, &buflen_sel, sizeof(cmph_uint32));
201 	pos += (cmph_uint32)sizeof(cmph_uint32);
202 	DEBUGP("buflen_sel = %u\n", buflen_sel);
203 
204 	memcpy(*buf + pos, buf_sel, buflen_sel);
205 	#ifdef DEBUG
206 	cmph_uint32 i = 0;
207 	for(i = 0; i < buflen_sel; i++)
208 	{
209 	    DEBUGP("pos = %u  -- buf_sel[%u] = %u\n", pos, i, *(*buf + pos + i));
210 	}
211 	#endif
212 	pos += buflen_sel;
213 
214 	free(buf_sel);
215 
216 	// dumping length_rems
217 	memcpy(*buf + pos, cs->length_rems, length_rems_size);
218 	#ifdef DEBUG
219 	for(i = 0; i < length_rems_size; i++)
220 	{
221 	    DEBUGP("pos = %u -- length_rems_size = %u  -- length_rems[%u] = %u\n", pos, length_rems_size, i, *(*buf + pos + i));
222 	}
223 	#endif
224 	pos += length_rems_size;
225 
226 	// dumping store_table
227 	memcpy(*buf + pos, cs->store_table, store_table_size);
228 
229 	#ifdef DEBUG
230 	for(i = 0; i < store_table_size; i++)
231 	{
232 	    DEBUGP("pos = %u -- store_table_size = %u  -- store_table[%u] = %u\n", pos, store_table_size, i, *(*buf + pos + i));
233 	}
234 	#endif
235 	DEBUGP("Dumped compressed sequence structure with size %u bytes\n", *buflen);
236 }
237 
compressed_seq_load(compressed_seq_t * cs,const char * buf,cmph_uint32 buflen)238 void compressed_seq_load(compressed_seq_t * cs, const char * buf, cmph_uint32 buflen)
239 {
240 	register cmph_uint32 pos = 0;
241 	cmph_uint32 buflen_sel = 0;
242 	register cmph_uint32 length_rems_size = 0;
243 	register cmph_uint32 store_table_size = 0;
244 
245 	// loading n, rem_r and total_length
246 	memcpy(&(cs->n), buf, sizeof(cmph_uint32));
247 	pos += (cmph_uint32)sizeof(cmph_uint32);
248 	DEBUGP("n = %u\n", cs->n);
249 
250 	memcpy(&(cs->rem_r), buf + pos, sizeof(cmph_uint32));
251 	pos += (cmph_uint32)sizeof(cmph_uint32);
252 	DEBUGP("rem_r = %u\n", cs->rem_r);
253 
254 	memcpy(&(cs->total_length), buf + pos, sizeof(cmph_uint32));
255 	pos += (cmph_uint32)sizeof(cmph_uint32);
256 	DEBUGP("total_length = %u\n", cs->total_length);
257 
258 	// loading sel
259 	memcpy(&buflen_sel, buf + pos, sizeof(cmph_uint32));
260 	pos += (cmph_uint32)sizeof(cmph_uint32);
261 	DEBUGP("buflen_sel = %u\n", buflen_sel);
262 
263 	select_load(&cs->sel, buf + pos, buflen_sel);
264 	#ifdef DEBUG
265 	cmph_uint32 i = 0;
266 	for(i = 0; i < buflen_sel; i++)
267 	{
268 	    DEBUGP("pos = %u  -- buf_sel[%u] = %u\n", pos, i, *(buf + pos + i));
269 	}
270 	#endif
271 	pos += buflen_sel;
272 
273 	// loading length_rems
274 	if(cs->length_rems)
275 	{
276 		free(cs->length_rems);
277 	}
278 	length_rems_size = BITS_TABLE_SIZE(cs->n, cs->rem_r);
279 	cs->length_rems = (cmph_uint32 *) calloc(length_rems_size, sizeof(cmph_uint32));
280 	length_rems_size *= 4;
281 	memcpy(cs->length_rems, buf + pos, length_rems_size);
282 
283 	#ifdef DEBUG
284 	for(i = 0; i < length_rems_size; i++)
285 	{
286 	    DEBUGP("pos = %u -- length_rems_size = %u  -- length_rems[%u] = %u\n", pos, length_rems_size, i, *(buf + pos + i));
287 	}
288 	#endif
289 	pos += length_rems_size;
290 
291 	// loading store_table
292 	store_table_size = ((cs->total_length + 31) >> 5);
293 	if(cs->store_table)
294 	{
295 		free(cs->store_table);
296 	}
297 	cs->store_table = (cmph_uint32 *) calloc(store_table_size, sizeof(cmph_uint32));
298         store_table_size *= 4;
299 	memcpy(cs->store_table, buf + pos, store_table_size);
300 
301 	#ifdef DEBUG
302 	for(i = 0; i < store_table_size; i++)
303 	{
304 	    DEBUGP("pos = %u -- store_table_size = %u  -- store_table[%u] = %u\n", pos, store_table_size, i, *(buf + pos + i));
305 	}
306 	#endif
307 
308 	DEBUGP("Loaded compressed sequence structure with size %u bytes\n", buflen);
309 }
310 
compressed_seq_pack(compressed_seq_t * cs,void * cs_packed)311 void compressed_seq_pack(compressed_seq_t *cs, void *cs_packed)
312 {
313 	if (cs && cs_packed)
314 	{
315 		char *buf = NULL;
316 		cmph_uint32 buflen = 0;
317 		compressed_seq_dump(cs, &buf, &buflen);
318 		memcpy(cs_packed, buf, buflen);
319 		free(buf);
320 	}
321 
322 }
323 
compressed_seq_packed_size(compressed_seq_t * cs)324 cmph_uint32 compressed_seq_packed_size(compressed_seq_t *cs)
325 {
326 	register cmph_uint32 sel_size = select_packed_size(&cs->sel);
327 	register cmph_uint32 store_table_size = ((cs->total_length + 31) >> 5) * (cmph_uint32)sizeof(cmph_uint32);
328 	register cmph_uint32 length_rems_size = BITS_TABLE_SIZE(cs->n, cs->rem_r) * (cmph_uint32)sizeof(cmph_uint32);
329 	return 4 * (cmph_uint32)sizeof(cmph_uint32) + sel_size + store_table_size + length_rems_size;
330 }
331 
332 
compressed_seq_query_packed(void * cs_packed,cmph_uint32 idx)333 cmph_uint32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx)
334 {
335 	// unpacking cs_packed
336 	register cmph_uint32 *ptr = (cmph_uint32 *)cs_packed;
337 	register cmph_uint32 n = *ptr++;
338 	register cmph_uint32 rem_r = *ptr++;
339 	ptr++; // skipping total_length
340 // 	register cmph_uint32 total_length = *ptr++;
341 	register cmph_uint32 buflen_sel = *ptr++;
342 	register cmph_uint32 * sel_packed = ptr;
343 	register cmph_uint32 * length_rems = (ptr += (buflen_sel >> 2));
344 	register cmph_uint32 length_rems_size = BITS_TABLE_SIZE(n, rem_r);
345 	register cmph_uint32 * store_table = (ptr += length_rems_size);
346 
347 	// compressed sequence query computation
348 	register cmph_uint32 enc_idx, enc_length;
349 	register cmph_uint32 rems_mask;
350 	register cmph_uint32 stored_value;
351 	register cmph_uint32 sel_res;
352 
353 	rems_mask = (1U << rem_r) - 1U;
354 
355 	if(idx == 0)
356 	{
357 		enc_idx = 0;
358 		sel_res = select_query_packed(sel_packed, idx);
359 	}
360 	else
361 	{
362 		sel_res = select_query_packed(sel_packed, idx - 1);
363 
364 		enc_idx = (sel_res - (idx - 1)) << rem_r;
365 		enc_idx += get_bits_value(length_rems, idx-1, rem_r, rems_mask);
366 
367 		sel_res = select_next_query_packed(sel_packed, sel_res);
368 	};
369 
370 	enc_length = (sel_res - idx) << rem_r;
371 	enc_length += get_bits_value(length_rems, idx, rem_r, rems_mask);
372 	enc_length -= enc_idx;
373 	if(enc_length == 0)
374 		return 0;
375 
376 	stored_value = get_bits_at_pos(store_table, enc_idx, enc_length);
377 	return stored_value + ((1U << enc_length) - 1U);
378 }
379