1 static char rcsid[] = "$Id: compress-write.c 222808 2020-06-03 22:01:24Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 # define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 # define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11
12 #include "compress-write.h"
13
14 #include <stdlib.h>
15 #include <stddef.h>
16 #include <string.h>
17 #include <ctype.h> /* For isalpha, toupper */
18 #ifdef WORDS_BIGENDIAN
19 #include "bigendian.h"
20 #else
21 #include "littleendian.h"
22 #endif
23 #ifdef HAVE_SYS_TYPES_H
24 #include <sys/types.h> /* For off_t */
25 #endif
26 #include "complement.h"
27 #include "mem.h" /* For Compress_new */
28 #include "getline.h"
29
30
31 /* Compress_cat */
32 #ifdef DEBUG1
33 #define debug1(x) x
34 #else
35 #define debug1(x)
36 #endif
37
38
39 /* Another MONITOR_INTERVAL is in indexdb.c */
40 #define MONITOR_INTERVAL 10000000 /* 10 million nt */
41
42 #define MAX_BADCHAR_MESSAGES 10
43 #define BADCHAR_INTERVAL 1000000
44
45
46 static char uppercaseCode[128] = UPPERCASE_U2T;
47
48 /* We use int *, rather than char *, because we eventually return an int,
49 and we see problems converting from char to int */
50 static void
fill_buffer(int * Buffer,Genomecomp_T high,Genomecomp_T low,Genomecomp_T flags,Univcoord_T position)51 fill_buffer (int *Buffer, Genomecomp_T high, Genomecomp_T low, Genomecomp_T flags, Univcoord_T position) {
52 int i;
53
54 /* printf("%08X %08X %08X => ",high,low,flags); */
55 for (i = 0; i < 16; i++) {
56 switch (low & 3U) {
57 case 0U: Buffer[i] = 'A'; break;
58 case 1U: Buffer[i] = 'C'; break;
59 case 2U: Buffer[i] = 'G'; break;
60 case 3U: Buffer[i] = 'T'; break;
61 default: abort();
62 }
63 low >>= 2;
64 }
65 for ( ; i < 32; i++) {
66 switch (high & 3U) {
67 case 0U: Buffer[i] = 'A'; break;
68 case 1U: Buffer[i] = 'C'; break;
69 case 2U: Buffer[i] = 'G'; break;
70 case 3U: Buffer[i] = 'T'; break;
71 default: abort();
72 }
73 high >>= 2;
74 }
75 for (i = 0; i < 32; i++) {
76 if ((flags & 1U) == 1U) {
77 if (Buffer[i] == 'A') {
78 Buffer[i] = 'N';
79 } else if (Buffer[i] == 'T') {
80 Buffer[i] = 'X';
81 } else {
82 #if defined(LARGE_GENOMES) || (defined(UTILITYP) && defined(HAVE_64_BIT))
83 printf("Parsing error; saw non-ACGT flag plus %c at position %llu = %llu + %d\n",Buffer[i],position+i,position,i);
84 #else
85 printf("Parsing error; saw non-ACGT flag plus %c at position %u = %u + %d\n",Buffer[i],position+i,position,i);
86 #endif
87 printf("flags: %08X\n",flags);
88 abort();
89 }
90 }
91 flags >>= 1;
92 }
93
94 return;
95 }
96
97
98 /* Based on genomecomp */
99 int
Compress_get_char(FILE * sequence_fp,Univcoord_T position,bool uncompressedp)100 Compress_get_char (FILE *sequence_fp, Univcoord_T position, bool uncompressedp) {
101 Genomecomp_T high, low, flags;
102 static int SAVEBUFFER[32];
103 int ptr, c;
104
105 if (uncompressedp == true) {
106 while ((c = fgetc(sequence_fp)) != EOF && isspace(c)) {
107 }
108 if (c == EOF) {
109 return EOF;
110 } else {
111 return c;
112 }
113 } else if ((ptr = position % 32) == 0) {
114 if (FREAD_UINT(&high,sequence_fp) <= 0 ||
115 FREAD_UINT(&low,sequence_fp) <= 0 ||
116 FREAD_UINT(&flags,sequence_fp) <= 0) {
117 return EOF;
118 } else {
119 fill_buffer(SAVEBUFFER,high,low,flags,position);
120 return SAVEBUFFER[0];
121 }
122 } else {
123 return SAVEBUFFER[ptr];
124 }
125 }
126
127
128 /************************************************************************
129 * Compression and uncompression of the genome
130 ************************************************************************/
131
132 /* 87654321 */
133 #define UINT4_LEFT_A 0x00000000
134 #define UINT4_LEFT_C 0x40000000
135 #define UINT4_LEFT_G 0x80000000
136 #define UINT4_LEFT_T 0xC0000000
137 #define UINT4_LEFT_BIT 0x80000000
138
139 /* 87654321 */
140 #define UINT4_LEFT_0 0x00000000
141 #define UINT4_LEFT_1 0x40000000
142 #define UINT4_LEFT_2 0x80000000
143 #define UINT4_LEFT_3 0xC0000000
144
145
146 /* Genomecomp format */
147 /* A = 000 Stored as 00 in first two bytes, 0 in flag byte
148 C = 001 01 0
149 G = 010 10 0
150 T = 011 11 0
151 N = 100 00 1
152 X = 111 11 1
153 */
154
155
156 /* 87654321 */
157 #define UINT4_LEFT_SET 0x80000000
158 #define UINT4_LEFT_CLEAR 0x00000000
159
160
161 /* Genome128 format */
162 /* High bit Low bit Flag bit
163 A = 000 0 0 0
164 C = 001 0 1 0
165 G = 010 1 0 0
166 T = 011 1 1 1
167 N = 100 0 0 1
168 X = 111 1 1 1
169 */
170
171
172
173
174 /************************************************************************/
175
176 static void
genomecomp_move_absolute(FILE * fp,Univcoord_T ptr)177 genomecomp_move_absolute (FILE *fp, Univcoord_T ptr) {
178 #ifdef HAVE_FSEEKO
179 off_t offset = ptr*((off_t) sizeof(Genomecomp_T));
180
181 if (fseeko(fp,offset,SEEK_SET) < 0) {
182 perror("Error in gmapindex, genomecomp_move_absolute");
183 exit(9);
184 }
185 #else
186 long int offset = ptr*((long int) sizeof(Genomecomp_T));
187
188 if (fseek(fp,offset,SEEK_SET) < 0) {
189 perror("Error in gmapindex, genomecomp_move_absolute");
190 exit(9);
191 }
192 #endif
193
194 return;
195 }
196
197 static void
genomecomp_read_current(Genomecomp_T * high,Genomecomp_T * low,Genomecomp_T * flags,FILE * fp,int index1part)198 genomecomp_read_current (Genomecomp_T *high, Genomecomp_T *low, Genomecomp_T *flags, FILE *fp,
199 int index1part) {
200 char section[15];
201
202 if (fread(section,sizeof(char),index1part,fp) < (unsigned int) index1part) {
203 *high = 0xFFFFFFFF;
204 *low = 0xFFFFFFFF;
205 *flags = 0xFFFFFFFF;
206 return;
207 }
208
209 *high = (section[3] & 0xff);
210 *high <<= 8;
211 *high |= (section[2] & 0xff);
212 *high <<= 8;
213 *high |= (section[1] & 0xff);
214 *high <<= 8;
215 *high |= (section[0] & 0xff);
216
217 *low = (section[7] & 0xff);
218 *low <<= 8;
219 *low |= (section[6] & 0xff);
220 *low <<= 8;
221 *low |= (section[5] & 0xff);
222 *low <<= 8;
223 *low |= (section[4] & 0xff);
224
225 *flags = (section[11] & 0xff);
226 *flags <<= 8;
227 *flags |= (section[10] & 0xff);
228 *flags <<= 8;
229 *flags |= (section[9] & 0xff);
230 *flags <<= 8;
231 *flags |= (section[8] & 0xff);
232
233 return;
234 }
235
236
237 static void
write_compressed_one(FILE * fp,int * nbadchars,char Buffer[],Univcoord_T position)238 write_compressed_one (FILE *fp, int *nbadchars, char Buffer[], Univcoord_T position) {
239 Genomecomp_T high = 0U, low = 0U, flags = 0U, carry;
240 int i;
241
242 for (i = 0; i < 32; i++) {
243 carry = high & 3U;
244 high >>= 2;
245 low >>= 2;
246 flags >>= 1;
247 switch (carry) {
248 case 0U: break;
249 case 1U: low |= UINT4_LEFT_C; break;
250 case 2U: low |= UINT4_LEFT_G; break;
251 case 3U: low |= UINT4_LEFT_T; break;
252 default: abort();
253 }
254
255 switch (uppercaseCode[(int) Buffer[i]]) {
256 case 'A': break;
257 case 'C': high |= UINT4_LEFT_C; break;
258 case 'G': high |= UINT4_LEFT_G; break;
259 case 'T': high |= UINT4_LEFT_T; break;
260 case 'N': flags |= UINT4_LEFT_BIT; break;
261 case 'X': high |= UINT4_LEFT_T; flags |= UINT4_LEFT_BIT; break;
262 default:
263 (*nbadchars) += 1;
264 if (*nbadchars < MAX_BADCHAR_MESSAGES) {
265 #if defined(LARGE_GENOMES) || (defined(UTILITYP) && defined(HAVE_64_BIT))
266 fprintf(stderr,"Don't recognize character %c at position %llu. Using N instead\n",Buffer[i],position+i);
267 #else
268 fprintf(stderr,"Don't recognize character %c at position %u. Using N instead\n",Buffer[i],position+i);
269 #endif
270 } else if (*nbadchars == MAX_BADCHAR_MESSAGES) {
271 fprintf(stderr,"Too many non-recognizable characters. Not reporting each individual occurrence anymore.\n");
272 } else if ((*nbadchars) % BADCHAR_INTERVAL == 0) {
273 fprintf(stderr,"A total of %d non-ACGTNX characters seen so far.\n",*nbadchars);
274 }
275 flags |= UINT4_LEFT_BIT;
276 break;
277 }
278 }
279
280 FWRITE_UINT(high,fp);
281 FWRITE_UINT(low,fp);
282 FWRITE_UINT(flags,fp);
283
284 return;
285 }
286
287
288 static void
put_compressed_one(Genomecomp_T * sectioncomp,int * nbadchars,char Buffer[],Univcoord_T position)289 put_compressed_one (Genomecomp_T *sectioncomp, int *nbadchars, char Buffer[], Univcoord_T position) {
290 Genomecomp_T high = 0U, low = 0U, flags = 0U, carry;
291 int i;
292
293 for (i = 0; i < 32; i++) {
294 carry = high & 3U;
295 high >>= 2;
296 low >>= 2;
297 flags >>= 1;
298 switch (carry) {
299 case 0U: break;
300 case 1U: low |= UINT4_LEFT_C; break;
301 case 2U: low |= UINT4_LEFT_G; break;
302 case 3U: low |= UINT4_LEFT_T; break;
303 default: abort();
304 }
305
306 switch (uppercaseCode[(int) Buffer[i]]) {
307 case 'A': break;
308 case 'C': high |= UINT4_LEFT_C; break;
309 case 'G': high |= UINT4_LEFT_G; break;
310 case 'T': high |= UINT4_LEFT_T; break;
311 case 'N': flags |= UINT4_LEFT_BIT; break;
312 case 'X': high |= UINT4_LEFT_T; flags |= UINT4_LEFT_BIT; break;
313 default:
314 (*nbadchars) += 1;
315 if (*nbadchars < MAX_BADCHAR_MESSAGES) {
316 #if defined(LARGE_GENOMES) || (defined(UTILITYP) && defined(HAVE_64_BIT))
317 fprintf(stderr,"Don't recognize character %c at position %llu. Using N instead\n",Buffer[i],position+i);
318 #else
319 fprintf(stderr,"Don't recognize character %c at position %u. Using N instead\n",Buffer[i],position+i);
320 #endif
321 } else if (*nbadchars == MAX_BADCHAR_MESSAGES) {
322 fprintf(stderr,"Too many non-recognizable characters. Not reporting each individual occurrence anymore.\n");
323 } else if ((*nbadchars) % BADCHAR_INTERVAL == 0) {
324 fprintf(stderr,"A total of %d non-ACGTNX characters seen so far.\n",*nbadchars);
325 }
326 flags |= UINT4_LEFT_BIT;
327 break;
328 }
329 }
330
331 sectioncomp[0] = high;
332 sectioncomp[1] = low;
333 sectioncomp[2] = flags;
334
335 return;
336 }
337
338
339 static char acgt[4] = {'A','C','G','T'};
340 static char non_acgt[4] = {'N','?','?','X'};
341
342 /* if gbuffer is NULL, then we fill with X's */
343 /* Based on genomecomp. Version for genome128 not implemented yet */
344 int
Compress_update_file(int nbadchars,FILE * fp,char * gbuffer,Univcoord_T startpos,Univcoord_T endpos,int index1part)345 Compress_update_file (int nbadchars, FILE *fp, char *gbuffer, Univcoord_T startpos,
346 Univcoord_T endpos, int index1part) {
347 /* Chrpos_T length = endpos - startpos; */
348 Univcoord_T startblock, endblock, ptr;
349 unsigned int startdiscard, enddiscard, i;
350 Genomecomp_T high, low, flags;
351 char Buffer[32];
352
353
354 ptr = startblock = startpos/32U*3;
355 endblock = endpos/32U*3;
356 startdiscard = startpos % 32;
357 enddiscard = endpos % 32;
358
359 if (endblock == startblock) {
360 /* Special case */
361 genomecomp_move_absolute(fp,ptr);
362 genomecomp_read_current(&high,&low,&flags,fp,index1part);
363
364 for (i = 0; i < 16; i++) {
365 Buffer[i] = flags & 1U ? non_acgt[low & 3U] : acgt[low & 3U];
366 low >>= 2;
367 flags >>= 1;
368 }
369 for ( ; i < 32; i++) {
370 Buffer[i] = flags & 1U ? non_acgt[high & 3U] : acgt[high & 3U];
371 high >>= 2;
372 flags >>= 1;
373 }
374 for (i = startdiscard; i < enddiscard; i++) {
375 Buffer[i] = gbuffer ? *gbuffer++ : 'X';
376 }
377 genomecomp_move_absolute(fp,ptr);
378 write_compressed_one(fp,&nbadchars,Buffer,startpos);
379
380 } else {
381
382 genomecomp_move_absolute(fp,ptr);
383 genomecomp_read_current(&high,&low,&flags,fp,index1part);
384
385 for (i = 0; i < 16; i++) {
386 Buffer[i] = flags & 1U ? non_acgt[low & 3U] : acgt[low & 3U];
387 low >>= 2;
388 flags >>= 1;
389 }
390 for ( ; i < 32; i++) {
391 Buffer[i] = flags & 1U ? non_acgt[high & 3U] : acgt[high & 3U];
392 high >>= 2;
393 flags >>= 1;
394 }
395 for (i = startdiscard; i < 32; i++) {
396 Buffer[i] = gbuffer ? *gbuffer++ : 'X';
397 }
398 genomecomp_move_absolute(fp,ptr);
399 write_compressed_one(fp,&nbadchars,Buffer,startpos);
400 ptr += 3;
401
402 while (ptr < endblock) {
403 for (i = 0; i < 32; i++) {
404 Buffer[i] = gbuffer ? *gbuffer++ : 'X';
405 }
406 write_compressed_one(fp,&nbadchars,Buffer,ptr/3*32U);
407 ptr += 3;
408 }
409
410 if (enddiscard > 0) {
411 genomecomp_move_absolute(fp,ptr);
412 genomecomp_read_current(&high,&low,&flags,fp,index1part);
413
414 for (i = 0; i < 16; i++) {
415 Buffer[i] = flags & 1U ? non_acgt[low & 3U] : acgt[low & 3U];
416 low >>= 2;
417 flags >>= 1;
418 }
419 for ( ; i < 32; i++) {
420 Buffer[i] = flags & 1U ? non_acgt[high & 3U] : acgt[high & 3U];
421 high >>= 2;
422 flags >>= 1;
423 }
424 for (i = 0; i < enddiscard; i++) {
425 Buffer[i] = gbuffer ? *gbuffer++ : 'X';
426 }
427 genomecomp_move_absolute(fp,ptr);
428 write_compressed_one(fp,&nbadchars,Buffer,ptr/3*32U);
429 }
430 }
431
432 return nbadchars;
433 }
434
435
436 int
Compress_update_memory(int nbadchars,Genomecomp_T * genomecomp,char * gbuffer,Univcoord_T startpos,Univcoord_T endpos)437 Compress_update_memory (int nbadchars, Genomecomp_T *genomecomp, char *gbuffer, Univcoord_T startpos,
438 Univcoord_T endpos) {
439 /* Chrpos_T length = endpos - startpos; */
440 Univcoord_T startblock, endblock, ptr;
441 Genomecomp_T high, low, flags;
442 char Buffer[32];
443 unsigned int startdiscard, enddiscard, i;
444
445
446 ptr = startblock = startpos/32U*3;
447 endblock = endpos/32U*3;
448 startdiscard = startpos % 32;
449 enddiscard = endpos % 32;
450
451 if (endblock == startblock) {
452 /* Special case */
453 high = genomecomp[ptr];
454 low = genomecomp[ptr+1];
455 flags = genomecomp[ptr+2];
456
457 /* Fill Buffer with original contents */
458 for (i = 0; i < 16; i++) {
459 Buffer[i] = flags & 1U ? non_acgt[low & 3U] : acgt[low & 3U];
460 low >>= 2;
461 flags >>= 1;
462 }
463 for ( ; i < 32; i++) {
464 Buffer[i] = flags & 1U ? non_acgt[high & 3U] : acgt[high & 3U];
465 high >>= 2;
466 flags >>= 1;
467 }
468 for (i = startdiscard; i < enddiscard; i++) {
469 Buffer[i] = gbuffer ? *gbuffer++ : 'X';
470 }
471 put_compressed_one(&(genomecomp[ptr]),&nbadchars,Buffer,startpos);
472
473 } else {
474
475 high = genomecomp[ptr];
476 low = genomecomp[ptr+1];
477 flags = genomecomp[ptr+2];
478
479 /* Fill Buffer with original contents */
480 for (i = 0; i < 16; i++) {
481 Buffer[i] = flags & 1U ? non_acgt[low & 3U] : acgt[low & 3U];
482 low >>= 2;
483 flags >>= 1;
484 }
485 for ( ; i < 32; i++) {
486 Buffer[i] = flags & 1U ? non_acgt[high & 3U] : acgt[high & 3U];
487 high >>= 2;
488 flags >>= 1;
489 }
490 for (i = startdiscard; i < 32; i++) {
491 Buffer[i] = gbuffer ? *gbuffer++ : 'X';
492 }
493 put_compressed_one(&(genomecomp[ptr]),&nbadchars,Buffer,startpos);
494 ptr += 3;
495
496 while (ptr < endblock) {
497 for (i = 0; i < 32; i++) {
498 Buffer[i] = gbuffer ? *gbuffer++ : 'X';
499 }
500 put_compressed_one(&(genomecomp[ptr]),&nbadchars,Buffer,ptr/3*32U);
501 ptr += 3;
502 }
503
504 if (enddiscard > 0) {
505 high = genomecomp[ptr];
506 low = genomecomp[ptr+1];
507 flags = genomecomp[ptr+2];
508
509 /* Fill Buffer with original contents */
510 for (i = 0; i < 16; i++) {
511 Buffer[i] = flags & 1U ? non_acgt[low & 3U] : acgt[low & 3U];
512 low >>= 2;
513 flags >>= 1;
514 }
515 for ( ; i < 32; i++) {
516 Buffer[i] = flags & 1U ? non_acgt[high & 3U] : acgt[high & 3U];
517 high >>= 2;
518 flags >>= 1;
519 }
520 for (i = 0; i < enddiscard; i++) {
521 Buffer[i] = gbuffer ? *gbuffer++ : 'X';
522 }
523 put_compressed_one(&(genomecomp[ptr]),&nbadchars,Buffer,ptr/3*32U);
524 }
525 }
526
527 return nbadchars;
528 }
529
530
531 void
Compress_compress(char ** files,int nfiles,bool stdin_p)532 Compress_compress (char **files, int nfiles, bool stdin_p) {
533 Genomecomp_T low = 0U, high = 0U, flags = 0U, carry;
534 Univcoord_T position = 0UL;
535 FILE *fp;
536 char *line, *p;
537 int c;
538 int in_counter = 0, filei;
539
540
541 fprintf(stderr,"Compressing genome");
542
543 if (stdin_p == true) {
544 nfiles = 1;
545 }
546 for (filei = 0; filei < nfiles; filei++) {
547 if (stdin_p == true) {
548 fp = stdin;
549 } else if ((fp = fopen(files[filei],"r")) == NULL) {
550 fprintf(stderr,"Could not open file %s\n",files[filei]);
551 exit(9);
552 }
553
554 while ((line = Getline(fp)) != NULL) {
555 if (line[0] == '>') {
556 /* Skip header */
557 } else {
558 p = line;
559 while ((c = (int) *p++) != '\0') {
560 if (isalpha(c)) {
561 in_counter++;
562
563 carry = high & 3U;
564 high >>= 2;
565 low >>= 2;
566 flags >>= 1;
567 switch (carry) {
568 case 0U: break;
569 case 1U: low |= UINT4_LEFT_C; break;
570 case 2U: low |= UINT4_LEFT_G; break;
571 case 3U: low |= UINT4_LEFT_T; break;
572 default: abort();
573 }
574
575 switch (uppercaseCode[c]) {
576 case 'A': break;
577 case 'C': high |= UINT4_LEFT_C; break;
578 case 'G': high |= UINT4_LEFT_G; break;
579 case 'T': high |= UINT4_LEFT_T; break;
580 case 'N': flags |= UINT4_LEFT_BIT; break;
581 case 'X': high |= UINT4_LEFT_T; flags |= UINT4_LEFT_BIT; break;
582 default:
583 #if defined(LARGE_GENOMES) || (defined(UTILITYP) && defined(HAVE_64_BIT))
584 fprintf(stderr,"Non-standard nucleotide %c at position %llu. Using N instead\n",c,position);
585 #else
586 fprintf(stderr,"Non-standard nucleotide %c at position %u. Using N instead\n",c,position);
587 #endif
588 flags |= UINT4_LEFT_BIT;
589 break;
590 }
591
592 /* 8 is simply bits per byte */
593 if (in_counter == 8 * (int) sizeof(Genomecomp_T)) {
594 FWRITE_UINT(high,stdout);
595 FWRITE_UINT(low,stdout);
596 FWRITE_UINT(flags,stdout);
597
598 low = high = flags = 0U;
599 in_counter = 0;
600 }
601
602 position++;
603 if (position % MONITOR_INTERVAL == 0) {
604 fprintf(stderr,".");
605 }
606 }
607 }
608 }
609
610 FREE(line);
611 }
612
613 if (stdin_p == false) {
614 fclose(fp);
615 }
616 }
617
618 if (in_counter > 0) {
619 while (in_counter < 8 * (int) sizeof(Genomecomp_T)) {
620 carry = high & 3U;
621 high >>= 2;
622 low >>= 2;
623 flags >>= 1;
624 switch (carry) {
625 case 0U: break;
626 case 1U: low |= UINT4_LEFT_C; break;
627 case 2U: low |= UINT4_LEFT_G; break;
628 case 3U: low |= UINT4_LEFT_T; break;
629 default: abort();
630 }
631 high |= UINT4_LEFT_T; flags |= UINT4_LEFT_BIT; /* Create X's at end */
632 in_counter++;
633 }
634
635 FWRITE_UINT(high,stdout);
636 FWRITE_UINT(low,stdout);
637 FWRITE_UINT(flags,stdout);
638 }
639
640 /* Plus extra high and low */
641 high = low = 0xFFFFFFFF;
642 FWRITE_UINT(high,stdout);
643 FWRITE_UINT(low,stdout);
644
645 fprintf(stderr,"\n");
646
647 return;
648 }
649
650
651
652 #ifdef HAVE_64_BIT
653 static inline void
nt_unshuffle(UINT4 * highbits,UINT4 * lowbits,UINT4 high,UINT4 low)654 nt_unshuffle (UINT4 *highbits, UINT4 *lowbits, UINT4 high, UINT4 low) {
655 UINT8 x, t;
656
657 x = (UINT8) high;
658 x <<= 32;
659 x |= low;
660
661 t = (x ^ (x >> 1)) & 0x2222222222222222; x = x ^ t ^ (t << 1);
662 t = (x ^ (x >> 2)) & 0x0C0C0C0C0C0C0C0C; x = x ^ t ^ (t << 2);
663 t = (x ^ (x >> 4)) & 0x00F000F000F000F0; x = x ^ t ^ (t << 4);
664 t = (x ^ (x >> 8)) & 0x0000FF000000FF00; x = x ^ t ^ (t << 8);
665 t = (x ^ (x >> 16)) & 0x00000000FFFF0000; x = x ^ t ^ (t << 16);
666
667 *highbits = (UINT4) (x >> 32);
668 *lowbits = (UINT4) x;
669
670 return;
671 }
672
673 #else
674
675 static inline void
nt_unshuffle(UINT4 * highbits,UINT4 * lowbits,UINT4 high,UINT4 low)676 nt_unshuffle (UINT4 *highbits, UINT4 *lowbits, UINT4 high, UINT4 low) {
677 UINT4 t;
678
679 /* unshuffle high */
680 t = (high ^ (high >> 1)) & 0x22222222; high = high ^ t ^ (t << 1);
681 t = (high ^ (high >> 2)) & 0x0C0C0C0C; high = high ^ t ^ (t << 2);
682 t = (high ^ (high >> 4)) & 0x00F000F0; high = high ^ t ^ (t << 4);
683 t = (high ^ (high >> 8)) & 0x0000FF00; high = high ^ t ^ (t << 8);
684
685 /* unshuffle low */
686 t = (low ^ (low >> 1)) & 0x22222222; low = low ^ t ^ (t << 1);
687 t = (low ^ (low >> 2)) & 0x0C0C0C0C; low = low ^ t ^ (t << 2);
688 t = (low ^ (low >> 4)) & 0x00F000F0; low = low ^ t ^ (t << 4);
689 t = (low ^ (low >> 8)) & 0x0000FF00; low = low ^ t ^ (t << 8);
690
691 *highbits = (high & 0xFFFF0000) | (low >> 16);
692 *lowbits = (high << 16) | (low & 0x0000FFFF);
693
694 return;
695 }
696 #endif
697
698
699 void
Compress_unshuffle(FILE * out,FILE * in)700 Compress_unshuffle (FILE *out, FILE *in) {
701 Genomecomp_T high, low, flags;
702 Genomecomp_T highbits, lowbits;
703
704 while (FREAD_UINT(&high,in) > 0 &&
705 FREAD_UINT(&low,in) > 0 &&
706 FREAD_UINT(&flags,in) > 0) {
707 nt_unshuffle(&highbits,&lowbits,high,low);
708 FWRITE_UINT(highbits,out);
709 FWRITE_UINT(lowbits,out);
710 FWRITE_UINT(flags,out);
711 }
712
713 return;
714 }
715
716 void
Compress_unshuffle_bits128(FILE * out,FILE * in)717 Compress_unshuffle_bits128 (FILE *out, FILE *in) {
718 UINT4 high0, high1, high2, high3, low0, low1, low2, low3,
719 flags0, flags1, flags2, flags3;
720 UINT4 highbits0, highbits1, highbits2, highbits3,
721 lowbits0, lowbits1, lowbits2, lowbits3;
722
723 while (!feof(in)) {
724 if (FREAD_UINT(&high0,in) == 0 ||
725 FREAD_UINT(&low0,in) == 0 ||
726 FREAD_UINT(&flags0,in) == 0) {
727 highbits0 = -1U;
728 lowbits0 = -1U;
729 flags0 = -1U;
730 } else {
731 nt_unshuffle(&highbits0,&lowbits0,high0,low0);
732 }
733
734 if (FREAD_UINT(&high1,in) == 0 ||
735 FREAD_UINT(&low1,in) == 0 ||
736 FREAD_UINT(&flags1,in) == 0) {
737 highbits1 = -1U;
738 lowbits1 = -1U;
739 flags1 = -1U;
740 } else {
741 nt_unshuffle(&highbits1,&lowbits1,high1,low1);
742 }
743
744 if (FREAD_UINT(&high2,in) == 0 ||
745 FREAD_UINT(&low2,in) == 0 ||
746 FREAD_UINT(&flags2,in) == 0) {
747 highbits2 = -1U;
748 lowbits2 = -1U;
749 flags2 = -1U;
750 } else {
751 nt_unshuffle(&highbits2,&lowbits2,high2,low2);
752 }
753
754 if (FREAD_UINT(&high3,in) == 0 ||
755 FREAD_UINT(&low3,in) == 0 ||
756 FREAD_UINT(&flags3,in) == 0) {
757 highbits3 = -1U;
758 lowbits3 = -1U;
759 flags3 = -1U;
760 } else {
761 nt_unshuffle(&highbits3,&lowbits3,high3,low3);
762 }
763
764 FWRITE_UINT(highbits0,out);
765 FWRITE_UINT(highbits1,out);
766 FWRITE_UINT(highbits2,out);
767 FWRITE_UINT(highbits3,out);
768 FWRITE_UINT(lowbits0,out);
769 FWRITE_UINT(lowbits1,out);
770 FWRITE_UINT(lowbits2,out);
771 FWRITE_UINT(lowbits3,out);
772 FWRITE_UINT(flags0,out);
773 FWRITE_UINT(flags1,out);
774 FWRITE_UINT(flags2,out);
775 FWRITE_UINT(flags3,out);
776 }
777
778 return;
779 }
780
781
782 /* Needed for user-provided segment in GMAP */
783 Genomecomp_T *
Compress_create_blocks_comp(char * genomicseg,Univcoord_T genomelength)784 Compress_create_blocks_comp (char *genomicseg, Univcoord_T genomelength) {
785 Genomecomp_T *genomecomp;
786 size_t nuint4;
787
788 nuint4 = ((genomelength + 31)/32U)*3;
789 genomecomp = (Genomecomp_T *) CALLOC(nuint4+4,sizeof(Genomecomp_T));
790 /* Add 4 because Oligoindex_hr procedures point to nextlow as ptr+4 */
791
792 /* Creates X's at end */
793 genomecomp[nuint4-3] = 0xFFFFFFFF;
794 genomecomp[nuint4-2] = 0xFFFFFFFF;
795 genomecomp[nuint4-1] = 0xFFFFFFFF;
796
797 /* Plus extra 4 */
798 genomecomp[nuint4] = 0xFFFFFFFF;
799 genomecomp[nuint4+1] = 0xFFFFFFFF;
800 genomecomp[nuint4+2] = 0xFFFFFFFF;
801 genomecomp[nuint4+3] = 0xFFFFFFFF;
802
803 Compress_update_memory(/*nbadchars*/0,genomecomp,genomicseg,/*currposition*/0,genomelength);
804
805 return genomecomp;
806 }
807
808
809 /* Needed for user-provided segment in GMAP */
810 Genomecomp_T *
Compress_create_blocks_bits(Genomecomp_T * genomecomp,Univcoord_T genomelength)811 Compress_create_blocks_bits (Genomecomp_T *genomecomp, Univcoord_T genomelength) {
812 Genomecomp_T *genomebits, highbits, lowbits, high, low, flags;
813 size_t nuint4, ptr;
814
815 nuint4 = ((genomelength + 31)/32U)*3;
816 genomebits = (Genomecomp_T *) CALLOC(nuint4+4,sizeof(Genomecomp_T));
817
818 for (ptr = 0; ptr < nuint4; ptr += 3) {
819 high = genomecomp[ptr];
820 low = genomecomp[ptr+1];
821 flags = genomecomp[ptr+2];
822
823 nt_unshuffle(&highbits,&lowbits,high,low);
824 genomebits[ptr] = highbits;
825 genomebits[ptr+1] = lowbits;
826 genomebits[ptr+2] = flags;
827 }
828
829 return genomebits;
830 }
831
832
833 #define GENOMECOMP_LENGTH 32 /* In bp */
834
835 void
Compress_cat(FILE * out,char ** files,Univcoord_T * genomelengths,int nfiles)836 Compress_cat (FILE *out, char **files, Univcoord_T *genomelengths, int nfiles) {
837 FILE *fp;
838 char *file;
839 int current_pos = 0, bufferlen, shift;
840 UINT4 Buffer[3];
841 #ifdef HAVE_64_BIT
842 UINT8 current_highlow = 0, buffer_highlow = 0;
843 #else
844 UINT4 buffer_high = 0, buffer_low = 0;
845 #endif
846 UINT4 current_high = 0, current_low = 0, current_flags = 0, buffer_flags = 0, high, low;
847 #ifdef DEBUG1
848 int k;
849 #endif
850
851 #ifdef DEBUG1
852 printf("nfiles: %d\n",nfiles);
853 printf("* Genomelengths");
854 for (k = 0; k < nfiles; k++) {
855 printf(" %llu",genomelengths[k]);
856 }
857 printf("\n");
858 #endif
859
860
861 if ((fp = fopen((file = *files++),"rb")) == NULL) {
862 fprintf(stderr,"Genomecomp file %s is not valid\n",file);
863 exit(9);
864 } else {
865 debug1(printf("*** Opening genomecomp file %s\n",file));
866 }
867
868 while (fp != NULL) {
869 #ifdef DEBUG1
870 printf("* Genomelengths");
871 for (k = 0; k < nfiles; k++) {
872 printf(" %llu",genomelengths[k]);
873 }
874 printf("\n");
875 #endif
876
877 FREAD_UINTS(Buffer,3,fp);
878 debug1(printf("Reading buffer: %08x %08x %08x\n",Buffer[0],Buffer[1],Buffer[2]));
879 #ifdef HAVE_64_BIT
880 buffer_highlow = (((UINT8) Buffer[0]) << 32) | ((UINT8) Buffer[1]);
881 #else
882 buffer_high = Buffer[0];
883 buffer_low = Buffer[1];
884 #endif
885 buffer_flags = Buffer[2];
886
887 if (*genomelengths <= GENOMECOMP_LENGTH) {
888 bufferlen = *genomelengths;
889 *genomelengths = 0;
890
891 #ifdef HAVE_64_BIT
892 buffer_highlow &= 0xFFFFFFFFFFFFFFFF >> (64 - 2*bufferlen); /* mask trailing FFFFFFFF */
893 #else
894 if (bufferlen <= 16) {
895 buffer_high = 0;
896 buffer_low &= 0xFFFFFFFF >> (32 - 2*bufferlen);
897 } else {
898 buffer_high &= 0xFFFFFFFF >> (64 - 2*bufferlen);
899 }
900 #endif
901 buffer_flags &= 0xFFFFFFFF >> (32 - bufferlen); /* mask trailing FFFFFFFF */
902
903 fclose(fp);
904 if (--nfiles == 0) {
905 fp = (FILE *) NULL;
906 } else if ((fp = fopen((file = *files++),"rb")) == NULL) {
907 fprintf(stderr,"Regiondb file %s is not valid\n",file);
908 exit(9);
909 } else {
910 debug1(printf("*** Opening regiondb file %s\n",file));
911 genomelengths++;
912 }
913
914 } else {
915 bufferlen = GENOMECOMP_LENGTH;
916 *genomelengths -= GENOMECOMP_LENGTH;
917 }
918 #ifdef HAVE_64_BIT
919 debug1(printf("Buffer with bufferlen %d: %016llx %08x\n",bufferlen,buffer_highlow,buffer_flags));
920 debug1(printf("Current with current_pos %d: %016llx %08x\n",current_pos,current_highlow,current_flags));
921 #else
922 debug1(printf("Buffer with bufferlen %d: %08x %08x %08x\n",bufferlen,buffer_high,buffer_low,buffer_flags));
923 debug1(printf("Current with current_pos %d: %08x %08x %08x\n",current_pos,current_high,current_low,current_flags));
924 #endif
925
926 if (current_pos + bufferlen < GENOMECOMP_LENGTH) {
927 debug1(printf("Case 1\n"));
928 debug1(printf("Appending buffer with bufferlen %d\n",bufferlen));
929 #ifdef HAVE_64_BIT
930 current_highlow |= buffer_highlow << 2*current_pos; /* append */
931 #else
932 /* append */
933 if (current_pos == 0) {
934 current_low = buffer_low;
935 current_high = buffer_high;
936 } else if (current_pos < 16) {
937 current_low |= buffer_low << 2*current_pos;
938 current_high = (buffer_low >> (32 - 2*current_pos)) | (buffer_high << 2*current_pos);
939 } else if (current_pos == 16) {
940 current_low = 0;
941 current_high = buffer_low;
942 } else {
943 current_high |= buffer_low << (2*current_pos - 32);
944 }
945 #endif
946 current_flags |= buffer_flags << current_pos; /* append */
947 current_pos += bufferlen;
948
949 } else if (current_pos + bufferlen == GENOMECOMP_LENGTH) {
950 debug1(printf("Case 2\n"));
951 debug1(printf("Appending buffer with bufferlen %d\n",bufferlen));
952 #ifdef HAVE_64_BIT
953 current_highlow |= buffer_highlow << 2*current_pos; /* append */
954 #else
955 /* append */
956 if (current_pos == 0) {
957 current_low = buffer_low;
958 current_high = buffer_high;
959 } else if (current_pos < 16) {
960 current_low |= buffer_low << 2*current_pos;
961 current_high = (buffer_low >> (32 - 2*current_pos)) | (buffer_high << 2*current_pos);
962 } else if (current_pos == 16) {
963 current_low = 0;
964 current_high = buffer_low;
965 } else {
966 current_high |= buffer_low << (2*current_pos - 32);
967 }
968 #endif
969 current_flags |= buffer_flags << current_pos; /* append */
970
971 #ifdef HAVE_64_BIT
972 current_high = (UINT4) (current_highlow >> 32); current_low = (UINT4) (current_highlow & 0xFFFFFFFF); /* write */
973 #endif
974 FWRITE_UINT(current_high,out); FWRITE_UINT(current_low,out); FWRITE_UINT(current_flags,out); /* write */
975 debug1(printf("Writing current: %08x %08x %08x\n",current_high,current_low,current_flags));
976
977 #ifdef HAVE_64_BIT
978 current_highlow = 0; current_flags = 0; /* clear */
979 #else
980 current_high = current_low = current_flags = 0; /* clear */
981 #endif
982
983 current_pos = 0;
984
985 } else {
986 debug1(printf("Case 3\n"));
987 debug1(printf("Appending buffer with truncated bufferlen %d\n",GENOMECOMP_LENGTH-current_pos));
988 #ifdef HAVE_64_BIT
989 current_highlow |= buffer_highlow << 2*current_pos; /* append */
990 #else
991 /* append */
992 if (current_pos == 0) {
993 current_low = buffer_low;
994 current_high = buffer_high;
995 } else if (current_pos < 16) {
996 current_low |= buffer_low << 2*current_pos;
997 current_high = (buffer_low >> (32 - 2*current_pos)) | (buffer_high << 2*current_pos);
998 } else if (current_pos == 16) {
999 current_low = 0;
1000 current_high = buffer_low;
1001 } else {
1002 current_high |= buffer_low << (2*current_pos - 32);
1003 }
1004 #endif
1005 current_flags |= buffer_flags << current_pos; /* append */
1006
1007 #ifdef HAVE_64_BIT
1008 current_high = (UINT4) (current_highlow >> 32); current_low = (UINT4) (current_highlow & 0xFFFFFFFF); /* write */
1009 #endif
1010 FWRITE_UINT(current_high,out); FWRITE_UINT(current_low,out); FWRITE_UINT(current_flags,out); /* write */
1011 debug1(printf("Writing current: %08x %08x %08x\n",current_high,current_low,current_flags));
1012
1013 shift = GENOMECOMP_LENGTH - current_pos;
1014 debug1(printf("Saving remainder with shift %d\n",shift));
1015 #ifdef HAVE_64_BIT
1016 debug1(printf("Buffer: %016llx %08x\n",buffer_highlow,buffer_flags));
1017 current_highlow = buffer_highlow >> 2*shift; /* remainder */
1018 #else
1019 debug1(printf("Buffer: %08x %08x %08x\n",buffer_high,buffer_low,buffer_flags));
1020 /* remainder */
1021 if (shift == 0) {
1022 current_high = buffer_high;
1023 current_low = buffer_low;
1024 } else if (shift < 16) {
1025 current_high = buffer_high >> 2*shift;
1026 current_low = (buffer_low >> 2*shift) | (buffer_high << (32 - 2*shift));
1027 } else if (shift == 16) {
1028 current_high = 0;
1029 current_low = buffer_high;
1030 } else {
1031 current_high = 0;
1032 current_low = buffer_high >> (2*shift - 32);
1033 }
1034 #endif
1035 current_flags = buffer_flags >> shift; /* remainder */
1036 #ifdef HAVE_64_BIT
1037 debug1(printf("Remainder: %016llx %08x\n",current_highlow,current_flags));
1038 #else
1039 debug1(printf("Remainder: %08x %08x %08x\n",current_high,current_low,current_flags));
1040 #endif
1041
1042 current_pos = current_pos + bufferlen - GENOMECOMP_LENGTH;
1043 }
1044 #ifdef HAVE_64_BIT
1045 debug1(printf("Now current with current_pos %d: %016llx %08x\n",current_pos,current_highlow,current_flags));
1046 #else
1047 debug1(printf("Now current with current_pos %d: %08x %08x %08x\n",current_pos,current_high,current_low,current_flags));
1048 #endif
1049 }
1050
1051 if (current_pos > 0) {
1052 /* Introduce trailing FFFFFFFF. Based on current_pos. */
1053 debug1(printf("Final block. current_pos %d\n",current_pos));
1054 #ifdef HAVE_64_BIT
1055 current_highlow |= 0xFFFFFFFFFFFFFFFF << 2*current_pos; /* add trailing FFFFFFFF */
1056 current_high = (UINT4) (current_highlow >> 32); current_low = (UINT4) (current_highlow & 0xFFFFFFFF); /* write */
1057 #else
1058 /* add trailing FFFFFFFF */
1059 if (current_pos < 16) {
1060 current_high = 0xFFFFFFFF;
1061 current_low |= 0xFFFFFFFF << 2*current_pos;
1062 } else {
1063 current_high |= 0xFFFFFFFF << (2*current_pos - 32);
1064 }
1065 #endif
1066 current_flags |= 0xFFFFFFFF << current_pos; /* add trailing FFFFFFFF */
1067 FWRITE_UINT(current_high,out); FWRITE_UINT(current_low,out); FWRITE_UINT(current_flags,out); /* write */
1068 debug1(printf("Writing current: %08x %08x %08x\n",current_high,current_low,current_flags));
1069 }
1070
1071 /* Write an extra two UINT4 */
1072 high = low = 0xFFFFFFFF;
1073 FWRITE_UINT(high,out);
1074 FWRITE_UINT(low,out);
1075
1076 return;
1077 }
1078
1079