1 static char rcsid[] = "$Id: compress-write.c 222808 2020-06-03 22:01:24Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 # define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 # define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11 
12 #include "compress-write.h"
13 
14 #include <stdlib.h>
15 #include <stddef.h>
16 #include <string.h>
17 #include <ctype.h>		/* For isalpha, toupper */
18 #ifdef WORDS_BIGENDIAN
19 #include "bigendian.h"
20 #else
21 #include "littleendian.h"
22 #endif
23 #ifdef HAVE_SYS_TYPES_H
24 #include <sys/types.h>		/* For off_t */
25 #endif
26 #include "complement.h"
27 #include "mem.h"		/* For Compress_new */
28 #include "getline.h"
29 
30 
31 /* Compress_cat */
32 #ifdef DEBUG1
33 #define debug1(x) x
34 #else
35 #define debug1(x)
36 #endif
37 
38 
39 /* Another MONITOR_INTERVAL is in indexdb.c */
40 #define MONITOR_INTERVAL 10000000 /* 10 million nt */
41 
42 #define MAX_BADCHAR_MESSAGES 10
43 #define BADCHAR_INTERVAL 1000000
44 
45 
46 static char uppercaseCode[128] = UPPERCASE_U2T;
47 
48 /* We use int *, rather than char *, because we eventually return an int,
49    and we see problems converting from char to int */
50 static void
fill_buffer(int * Buffer,Genomecomp_T high,Genomecomp_T low,Genomecomp_T flags,Univcoord_T position)51 fill_buffer (int *Buffer, Genomecomp_T high, Genomecomp_T low, Genomecomp_T flags, Univcoord_T position) {
52   int i;
53 
54   /* printf("%08X %08X %08X => ",high,low,flags); */
55   for (i = 0; i < 16; i++) {
56     switch (low & 3U) {
57     case 0U: Buffer[i] = 'A'; break;
58     case 1U: Buffer[i] = 'C'; break;
59     case 2U: Buffer[i] = 'G'; break;
60     case 3U: Buffer[i] = 'T'; break;
61     default: abort();
62     }
63     low >>= 2;
64   }
65   for ( ; i < 32; i++) {
66     switch (high & 3U) {
67     case 0U: Buffer[i] = 'A'; break;
68     case 1U: Buffer[i] = 'C'; break;
69     case 2U: Buffer[i] = 'G'; break;
70     case 3U: Buffer[i] = 'T'; break;
71     default: abort();
72     }
73     high >>= 2;
74   }
75   for (i = 0; i < 32; i++) {
76     if ((flags & 1U) == 1U) {
77       if (Buffer[i] == 'A') {
78 	Buffer[i] = 'N';
79       } else if (Buffer[i] == 'T') {
80 	Buffer[i] = 'X';
81       } else {
82 #if defined(LARGE_GENOMES) || (defined(UTILITYP) && defined(HAVE_64_BIT))
83 	printf("Parsing error; saw non-ACGT flag plus %c at position %llu = %llu + %d\n",Buffer[i],position+i,position,i);
84 #else
85 	printf("Parsing error; saw non-ACGT flag plus %c at position %u = %u + %d\n",Buffer[i],position+i,position,i);
86 #endif
87 	printf("flags: %08X\n",flags);
88 	abort();
89       }
90     }
91     flags >>= 1;
92   }
93 
94   return;
95 }
96 
97 
98 /* Based on genomecomp */
99 int
Compress_get_char(FILE * sequence_fp,Univcoord_T position,bool uncompressedp)100 Compress_get_char (FILE *sequence_fp, Univcoord_T position, bool uncompressedp) {
101   Genomecomp_T high, low, flags;
102   static int SAVEBUFFER[32];
103   int ptr, c;
104 
105   if (uncompressedp == true) {
106     while ((c = fgetc(sequence_fp)) != EOF && isspace(c)) {
107     }
108     if (c == EOF) {
109       return EOF;
110     } else {
111       return c;
112     }
113   } else if ((ptr = position % 32) == 0) {
114     if (FREAD_UINT(&high,sequence_fp) <= 0 ||
115 	FREAD_UINT(&low,sequence_fp) <= 0 ||
116 	FREAD_UINT(&flags,sequence_fp) <= 0) {
117       return EOF;
118     } else {
119       fill_buffer(SAVEBUFFER,high,low,flags,position);
120       return SAVEBUFFER[0];
121     }
122   } else {
123     return SAVEBUFFER[ptr];
124   }
125 }
126 
127 
128 /************************************************************************
129  *   Compression and uncompression of the genome
130  ************************************************************************/
131 
132 /*                       87654321 */
133 #define UINT4_LEFT_A   0x00000000
134 #define UINT4_LEFT_C   0x40000000
135 #define UINT4_LEFT_G   0x80000000
136 #define UINT4_LEFT_T   0xC0000000
137 #define UINT4_LEFT_BIT 0x80000000
138 
139 /*                     87654321 */
140 #define UINT4_LEFT_0 0x00000000
141 #define UINT4_LEFT_1 0x40000000
142 #define UINT4_LEFT_2 0x80000000
143 #define UINT4_LEFT_3 0xC0000000
144 
145 
146 /* Genomecomp format */
147 /* A = 000  Stored as 00 in first two bytes, 0 in flag byte
148    C = 001            01                     0
149    G = 010            10                     0
150    T = 011            11                     0
151    N = 100            00                     1
152    X = 111            11                     1
153 */
154 
155 
156 /*                         87654321 */
157 #define UINT4_LEFT_SET   0x80000000
158 #define UINT4_LEFT_CLEAR 0x00000000
159 
160 
161 /* Genome128 format */
162 /*          High bit   Low bit   Flag bit
163    A = 000     0          0         0
164    C = 001     0          1         0
165    G = 010     1          0         0
166    T = 011     1          1         1
167    N = 100     0          0         1
168    X = 111     1          1         1
169 */
170 
171 
172 
173 
174 /************************************************************************/
175 
176 static void
genomecomp_move_absolute(FILE * fp,Univcoord_T ptr)177 genomecomp_move_absolute (FILE *fp, Univcoord_T ptr) {
178 #ifdef HAVE_FSEEKO
179   off_t offset = ptr*((off_t) sizeof(Genomecomp_T));
180 
181   if (fseeko(fp,offset,SEEK_SET) < 0) {
182     perror("Error in gmapindex, genomecomp_move_absolute");
183     exit(9);
184   }
185 #else
186   long int offset = ptr*((long int) sizeof(Genomecomp_T));
187 
188   if (fseek(fp,offset,SEEK_SET) < 0) {
189     perror("Error in gmapindex, genomecomp_move_absolute");
190     exit(9);
191   }
192 #endif
193 
194   return;
195 }
196 
197 static void
genomecomp_read_current(Genomecomp_T * high,Genomecomp_T * low,Genomecomp_T * flags,FILE * fp,int index1part)198 genomecomp_read_current (Genomecomp_T *high, Genomecomp_T *low, Genomecomp_T *flags, FILE *fp,
199 			 int index1part) {
200   char section[15];
201 
202   if (fread(section,sizeof(char),index1part,fp) < (unsigned int) index1part) {
203     *high = 0xFFFFFFFF;
204     *low = 0xFFFFFFFF;
205     *flags = 0xFFFFFFFF;
206     return;
207   }
208 
209   *high = (section[3] & 0xff);
210   *high <<= 8;
211   *high |= (section[2] & 0xff);
212   *high <<= 8;
213   *high |= (section[1] & 0xff);
214   *high <<= 8;
215   *high |= (section[0] & 0xff);
216 
217   *low = (section[7] & 0xff);
218   *low <<= 8;
219   *low |= (section[6] & 0xff);
220   *low <<= 8;
221   *low |= (section[5] & 0xff);
222   *low <<= 8;
223   *low |= (section[4] & 0xff);
224 
225   *flags = (section[11] & 0xff);
226   *flags <<= 8;
227   *flags |= (section[10] & 0xff);
228   *flags <<= 8;
229   *flags |= (section[9] & 0xff);
230   *flags <<= 8;
231   *flags |= (section[8] & 0xff);
232 
233   return;
234 }
235 
236 
237 static void
write_compressed_one(FILE * fp,int * nbadchars,char Buffer[],Univcoord_T position)238 write_compressed_one (FILE *fp, int *nbadchars, char Buffer[], Univcoord_T position) {
239   Genomecomp_T high = 0U, low = 0U, flags = 0U, carry;
240   int i;
241 
242   for (i = 0; i < 32; i++) {
243     carry = high & 3U;
244     high >>= 2;
245     low >>= 2;
246     flags >>= 1;
247     switch (carry) {
248     case 0U: break;
249     case 1U: low |= UINT4_LEFT_C; break;
250     case 2U: low |= UINT4_LEFT_G; break;
251     case 3U: low |= UINT4_LEFT_T; break;
252     default: abort();
253     }
254 
255     switch (uppercaseCode[(int) Buffer[i]]) {
256     case 'A': break;
257     case 'C': high |= UINT4_LEFT_C; break;
258     case 'G': high |= UINT4_LEFT_G; break;
259     case 'T': high |= UINT4_LEFT_T; break;
260     case 'N': flags |= UINT4_LEFT_BIT; break;
261     case 'X': high |= UINT4_LEFT_T; flags |= UINT4_LEFT_BIT; break;
262     default:
263       (*nbadchars) += 1;
264       if (*nbadchars < MAX_BADCHAR_MESSAGES) {
265 #if defined(LARGE_GENOMES) || (defined(UTILITYP) && defined(HAVE_64_BIT))
266 	fprintf(stderr,"Don't recognize character %c at position %llu.  Using N instead\n",Buffer[i],position+i);
267 #else
268 	fprintf(stderr,"Don't recognize character %c at position %u.  Using N instead\n",Buffer[i],position+i);
269 #endif
270       } else if (*nbadchars == MAX_BADCHAR_MESSAGES) {
271 	fprintf(stderr,"Too many non-recognizable characters.  Not reporting each individual occurrence anymore.\n");
272       } else if ((*nbadchars) % BADCHAR_INTERVAL == 0) {
273 	fprintf(stderr,"A total of %d non-ACGTNX characters seen so far.\n",*nbadchars);
274       }
275       flags |= UINT4_LEFT_BIT;
276       break;
277     }
278   }
279 
280   FWRITE_UINT(high,fp);
281   FWRITE_UINT(low,fp);
282   FWRITE_UINT(flags,fp);
283 
284   return;
285 }
286 
287 
288 static void
put_compressed_one(Genomecomp_T * sectioncomp,int * nbadchars,char Buffer[],Univcoord_T position)289 put_compressed_one (Genomecomp_T *sectioncomp, int *nbadchars, char Buffer[], Univcoord_T position) {
290   Genomecomp_T high = 0U, low = 0U, flags = 0U, carry;
291   int i;
292 
293   for (i = 0; i < 32; i++) {
294     carry = high & 3U;
295     high >>= 2;
296     low >>= 2;
297     flags >>= 1;
298     switch (carry) {
299     case 0U: break;
300     case 1U: low |= UINT4_LEFT_C; break;
301     case 2U: low |= UINT4_LEFT_G; break;
302     case 3U: low |= UINT4_LEFT_T; break;
303     default: abort();
304     }
305 
306     switch (uppercaseCode[(int) Buffer[i]]) {
307     case 'A': break;
308     case 'C': high |= UINT4_LEFT_C; break;
309     case 'G': high |= UINT4_LEFT_G; break;
310     case 'T': high |= UINT4_LEFT_T; break;
311     case 'N': flags |= UINT4_LEFT_BIT; break;
312     case 'X': high |= UINT4_LEFT_T; flags |= UINT4_LEFT_BIT; break;
313     default:
314       (*nbadchars) += 1;
315       if (*nbadchars < MAX_BADCHAR_MESSAGES) {
316 #if defined(LARGE_GENOMES) || (defined(UTILITYP) && defined(HAVE_64_BIT))
317 	fprintf(stderr,"Don't recognize character %c at position %llu.  Using N instead\n",Buffer[i],position+i);
318 #else
319 	fprintf(stderr,"Don't recognize character %c at position %u.  Using N instead\n",Buffer[i],position+i);
320 #endif
321       } else if (*nbadchars == MAX_BADCHAR_MESSAGES) {
322 	fprintf(stderr,"Too many non-recognizable characters.  Not reporting each individual occurrence anymore.\n");
323       } else if ((*nbadchars) % BADCHAR_INTERVAL == 0) {
324 	fprintf(stderr,"A total of %d non-ACGTNX characters seen so far.\n",*nbadchars);
325       }
326       flags |= UINT4_LEFT_BIT;
327       break;
328     }
329   }
330 
331   sectioncomp[0] = high;
332   sectioncomp[1] = low;
333   sectioncomp[2] = flags;
334 
335   return;
336 }
337 
338 
339 static char acgt[4] = {'A','C','G','T'};
340 static char non_acgt[4] = {'N','?','?','X'};
341 
342 /* if gbuffer is NULL, then we fill with X's */
343 /* Based on genomecomp.  Version for genome128 not implemented yet */
344 int
Compress_update_file(int nbadchars,FILE * fp,char * gbuffer,Univcoord_T startpos,Univcoord_T endpos,int index1part)345 Compress_update_file (int nbadchars, FILE *fp, char *gbuffer, Univcoord_T startpos,
346 		      Univcoord_T endpos, int index1part) {
347   /* Chrpos_T length = endpos - startpos; */
348   Univcoord_T startblock, endblock, ptr;
349   unsigned int startdiscard, enddiscard, i;
350   Genomecomp_T high, low, flags;
351   char Buffer[32];
352 
353 
354   ptr = startblock = startpos/32U*3;
355   endblock = endpos/32U*3;
356   startdiscard = startpos % 32;
357   enddiscard = endpos % 32;
358 
359   if (endblock == startblock) {
360     /* Special case */
361     genomecomp_move_absolute(fp,ptr);
362     genomecomp_read_current(&high,&low,&flags,fp,index1part);
363 
364     for (i = 0; i < 16; i++) {
365       Buffer[i] = flags & 1U ? non_acgt[low & 3U] : acgt[low & 3U];
366       low >>= 2;
367       flags >>= 1;
368     }
369     for ( ; i < 32; i++) {
370       Buffer[i] = flags & 1U ? non_acgt[high & 3U] : acgt[high & 3U];
371       high >>= 2;
372       flags >>= 1;
373     }
374     for (i = startdiscard; i < enddiscard; i++) {
375       Buffer[i] = gbuffer ? *gbuffer++ : 'X';
376     }
377     genomecomp_move_absolute(fp,ptr);
378     write_compressed_one(fp,&nbadchars,Buffer,startpos);
379 
380   } else {
381 
382     genomecomp_move_absolute(fp,ptr);
383     genomecomp_read_current(&high,&low,&flags,fp,index1part);
384 
385     for (i = 0; i < 16; i++) {
386       Buffer[i] = flags & 1U ? non_acgt[low & 3U] : acgt[low & 3U];
387       low >>= 2;
388       flags >>= 1;
389     }
390     for ( ; i < 32; i++) {
391       Buffer[i] = flags & 1U ? non_acgt[high & 3U] : acgt[high & 3U];
392       high >>= 2;
393       flags >>= 1;
394     }
395     for (i = startdiscard; i < 32; i++) {
396       Buffer[i] = gbuffer ? *gbuffer++ : 'X';
397     }
398     genomecomp_move_absolute(fp,ptr);
399     write_compressed_one(fp,&nbadchars,Buffer,startpos);
400     ptr += 3;
401 
402     while (ptr < endblock) {
403       for (i = 0; i < 32; i++) {
404 	Buffer[i] = gbuffer ? *gbuffer++ : 'X';
405       }
406       write_compressed_one(fp,&nbadchars,Buffer,ptr/3*32U);
407       ptr += 3;
408     }
409 
410     if (enddiscard > 0) {
411       genomecomp_move_absolute(fp,ptr);
412       genomecomp_read_current(&high,&low,&flags,fp,index1part);
413 
414       for (i = 0; i < 16; i++) {
415 	Buffer[i] = flags & 1U ? non_acgt[low & 3U] : acgt[low & 3U];
416 	low >>= 2;
417 	flags >>= 1;
418       }
419       for ( ; i < 32; i++) {
420 	Buffer[i] = flags & 1U ? non_acgt[high & 3U] : acgt[high & 3U];
421 	high >>= 2;
422 	flags >>= 1;
423       }
424       for (i = 0; i < enddiscard; i++) {
425 	Buffer[i] = gbuffer ? *gbuffer++ : 'X';
426       }
427       genomecomp_move_absolute(fp,ptr);
428       write_compressed_one(fp,&nbadchars,Buffer,ptr/3*32U);
429     }
430   }
431 
432   return nbadchars;
433 }
434 
435 
436 int
Compress_update_memory(int nbadchars,Genomecomp_T * genomecomp,char * gbuffer,Univcoord_T startpos,Univcoord_T endpos)437 Compress_update_memory (int nbadchars, Genomecomp_T *genomecomp, char *gbuffer, Univcoord_T startpos,
438 			Univcoord_T endpos) {
439   /* Chrpos_T length = endpos - startpos; */
440   Univcoord_T startblock, endblock, ptr;
441   Genomecomp_T high, low, flags;
442   char Buffer[32];
443   unsigned int startdiscard, enddiscard, i;
444 
445 
446   ptr = startblock = startpos/32U*3;
447   endblock = endpos/32U*3;
448   startdiscard = startpos % 32;
449   enddiscard = endpos % 32;
450 
451   if (endblock == startblock) {
452     /* Special case */
453     high = genomecomp[ptr];
454     low = genomecomp[ptr+1];
455     flags = genomecomp[ptr+2];
456 
457     /* Fill Buffer with original contents */
458     for (i = 0; i < 16; i++) {
459       Buffer[i] = flags & 1U ? non_acgt[low & 3U] : acgt[low & 3U];
460       low >>= 2;
461       flags >>= 1;
462     }
463     for ( ; i < 32; i++) {
464       Buffer[i] = flags & 1U ? non_acgt[high & 3U] : acgt[high & 3U];
465       high >>= 2;
466       flags >>= 1;
467     }
468     for (i = startdiscard; i < enddiscard; i++) {
469       Buffer[i] = gbuffer ? *gbuffer++ : 'X';
470     }
471     put_compressed_one(&(genomecomp[ptr]),&nbadchars,Buffer,startpos);
472 
473   } else {
474 
475     high = genomecomp[ptr];
476     low = genomecomp[ptr+1];
477     flags = genomecomp[ptr+2];
478 
479     /* Fill Buffer with original contents */
480     for (i = 0; i < 16; i++) {
481       Buffer[i] = flags & 1U ? non_acgt[low & 3U] : acgt[low & 3U];
482       low >>= 2;
483       flags >>= 1;
484     }
485     for ( ; i < 32; i++) {
486       Buffer[i] = flags & 1U ? non_acgt[high & 3U] : acgt[high & 3U];
487       high >>= 2;
488       flags >>= 1;
489     }
490     for (i = startdiscard; i < 32; i++) {
491       Buffer[i] = gbuffer ? *gbuffer++ : 'X';
492     }
493     put_compressed_one(&(genomecomp[ptr]),&nbadchars,Buffer,startpos);
494     ptr += 3;
495 
496     while (ptr < endblock) {
497       for (i = 0; i < 32; i++) {
498 	Buffer[i] = gbuffer ? *gbuffer++ : 'X';
499       }
500       put_compressed_one(&(genomecomp[ptr]),&nbadchars,Buffer,ptr/3*32U);
501       ptr += 3;
502     }
503 
504     if (enddiscard > 0) {
505       high = genomecomp[ptr];
506       low = genomecomp[ptr+1];
507       flags = genomecomp[ptr+2];
508 
509       /* Fill Buffer with original contents */
510       for (i = 0; i < 16; i++) {
511 	Buffer[i] = flags & 1U ? non_acgt[low & 3U] : acgt[low & 3U];
512 	low >>= 2;
513 	flags >>= 1;
514       }
515       for ( ; i < 32; i++) {
516 	Buffer[i] = flags & 1U ? non_acgt[high & 3U] : acgt[high & 3U];
517 	high >>= 2;
518 	flags >>= 1;
519       }
520       for (i = 0; i < enddiscard; i++) {
521 	Buffer[i] = gbuffer ? *gbuffer++ : 'X';
522       }
523       put_compressed_one(&(genomecomp[ptr]),&nbadchars,Buffer,ptr/3*32U);
524     }
525   }
526 
527   return nbadchars;
528 }
529 
530 
531 void
Compress_compress(char ** files,int nfiles,bool stdin_p)532 Compress_compress (char **files, int nfiles, bool stdin_p) {
533   Genomecomp_T low = 0U, high = 0U, flags = 0U, carry;
534   Univcoord_T position = 0UL;
535   FILE *fp;
536   char *line, *p;
537   int c;
538   int in_counter = 0, filei;
539 
540 
541   fprintf(stderr,"Compressing genome");
542 
543   if (stdin_p == true) {
544     nfiles = 1;
545   }
546   for (filei = 0; filei < nfiles; filei++) {
547     if (stdin_p == true) {
548       fp = stdin;
549     } else if ((fp = fopen(files[filei],"r")) == NULL) {
550       fprintf(stderr,"Could not open file %s\n",files[filei]);
551       exit(9);
552     }
553 
554     while ((line = Getline(fp)) != NULL) {
555       if (line[0] == '>') {
556 	/* Skip header */
557       } else {
558 	p = line;
559 	while ((c = (int) *p++) != '\0') {
560 	  if (isalpha(c)) {
561 	    in_counter++;
562 
563 	    carry = high & 3U;
564 	    high >>= 2;
565 	    low >>= 2;
566 	    flags >>= 1;
567 	    switch (carry) {
568 	    case 0U: break;
569 	    case 1U: low |= UINT4_LEFT_C; break;
570 	    case 2U: low |= UINT4_LEFT_G; break;
571 	    case 3U: low |= UINT4_LEFT_T; break;
572 	    default: abort();
573 	    }
574 
575 	    switch (uppercaseCode[c]) {
576 	    case 'A': break;
577 	    case 'C': high |= UINT4_LEFT_C; break;
578 	    case 'G': high |= UINT4_LEFT_G; break;
579 	    case 'T': high |= UINT4_LEFT_T; break;
580 	    case 'N': flags |= UINT4_LEFT_BIT; break;
581 	    case 'X': high |= UINT4_LEFT_T; flags |= UINT4_LEFT_BIT; break;
582 	    default:
583 #if defined(LARGE_GENOMES) || (defined(UTILITYP) && defined(HAVE_64_BIT))
584 	      fprintf(stderr,"Non-standard nucleotide %c at position %llu.  Using N instead\n",c,position);
585 #else
586 	      fprintf(stderr,"Non-standard nucleotide %c at position %u.  Using N instead\n",c,position);
587 #endif
588 	      flags |= UINT4_LEFT_BIT;
589 	      break;
590 	    }
591 
592 	    /* 8 is simply bits per byte */
593 	    if (in_counter == 8 * (int) sizeof(Genomecomp_T)) {
594 	      FWRITE_UINT(high,stdout);
595 	      FWRITE_UINT(low,stdout);
596 	      FWRITE_UINT(flags,stdout);
597 
598 	      low = high = flags = 0U;
599 	      in_counter = 0;
600 	    }
601 
602 	    position++;
603 	    if (position % MONITOR_INTERVAL == 0) {
604 	      fprintf(stderr,".");
605 	    }
606 	  }
607 	}
608       }
609 
610       FREE(line);
611     }
612 
613     if (stdin_p == false) {
614       fclose(fp);
615     }
616   }
617 
618   if (in_counter > 0) {
619     while (in_counter < 8 * (int) sizeof(Genomecomp_T)) {
620       carry = high & 3U;
621       high >>= 2;
622       low >>= 2;
623       flags >>= 1;
624       switch (carry) {
625       case 0U: break;
626       case 1U: low |= UINT4_LEFT_C; break;
627       case 2U: low |= UINT4_LEFT_G; break;
628       case 3U: low |= UINT4_LEFT_T; break;
629       default: abort();
630       }
631       high |= UINT4_LEFT_T; flags |= UINT4_LEFT_BIT; /* Create X's at end */
632       in_counter++;
633     }
634 
635     FWRITE_UINT(high,stdout);
636     FWRITE_UINT(low,stdout);
637     FWRITE_UINT(flags,stdout);
638   }
639 
640   /* Plus extra high and low */
641   high = low = 0xFFFFFFFF;
642   FWRITE_UINT(high,stdout);
643   FWRITE_UINT(low,stdout);
644 
645   fprintf(stderr,"\n");
646 
647   return;
648 }
649 
650 
651 
652 #ifdef HAVE_64_BIT
653 static inline void
nt_unshuffle(UINT4 * highbits,UINT4 * lowbits,UINT4 high,UINT4 low)654 nt_unshuffle (UINT4 *highbits, UINT4 *lowbits, UINT4 high, UINT4 low) {
655   UINT8 x, t;
656 
657   x = (UINT8) high;
658   x <<= 32;
659   x |= low;
660 
661   t = (x ^ (x >> 1))  & 0x2222222222222222;  x = x ^ t ^ (t << 1);
662   t = (x ^ (x >> 2))  & 0x0C0C0C0C0C0C0C0C;  x = x ^ t ^ (t << 2);
663   t = (x ^ (x >> 4))  & 0x00F000F000F000F0;  x = x ^ t ^ (t << 4);
664   t = (x ^ (x >> 8))  & 0x0000FF000000FF00;  x = x ^ t ^ (t << 8);
665   t = (x ^ (x >> 16)) & 0x00000000FFFF0000;  x = x ^ t ^ (t << 16);
666 
667   *highbits = (UINT4) (x >> 32);
668   *lowbits = (UINT4) x;
669 
670   return;
671 }
672 
673 #else
674 
675 static inline void
nt_unshuffle(UINT4 * highbits,UINT4 * lowbits,UINT4 high,UINT4 low)676 nt_unshuffle (UINT4 *highbits, UINT4 *lowbits, UINT4 high, UINT4 low) {
677   UINT4 t;
678 
679   /* unshuffle high */
680   t = (high ^ (high >> 1)) & 0x22222222;  high = high ^ t ^ (t << 1);
681   t = (high ^ (high >> 2)) & 0x0C0C0C0C;  high = high ^ t ^ (t << 2);
682   t = (high ^ (high >> 4)) & 0x00F000F0;  high = high ^ t ^ (t << 4);
683   t = (high ^ (high >> 8)) & 0x0000FF00;  high = high ^ t ^ (t << 8);
684 
685   /* unshuffle low */
686   t = (low ^ (low >> 1)) & 0x22222222;  low = low ^ t ^ (t << 1);
687   t = (low ^ (low >> 2)) & 0x0C0C0C0C;  low = low ^ t ^ (t << 2);
688   t = (low ^ (low >> 4)) & 0x00F000F0;  low = low ^ t ^ (t << 4);
689   t = (low ^ (low >> 8)) & 0x0000FF00;  low = low ^ t ^ (t << 8);
690 
691   *highbits = (high & 0xFFFF0000) | (low >> 16);
692   *lowbits = (high << 16) | (low & 0x0000FFFF);
693 
694   return;
695 }
696 #endif
697 
698 
699 void
Compress_unshuffle(FILE * out,FILE * in)700 Compress_unshuffle (FILE *out, FILE *in) {
701   Genomecomp_T high, low, flags;
702   Genomecomp_T highbits, lowbits;
703 
704   while (FREAD_UINT(&high,in) > 0 &&
705 	 FREAD_UINT(&low,in) > 0 &&
706 	 FREAD_UINT(&flags,in) > 0) {
707     nt_unshuffle(&highbits,&lowbits,high,low);
708     FWRITE_UINT(highbits,out);
709     FWRITE_UINT(lowbits,out);
710     FWRITE_UINT(flags,out);
711   }
712 
713   return;
714 }
715 
716 void
Compress_unshuffle_bits128(FILE * out,FILE * in)717 Compress_unshuffle_bits128 (FILE *out, FILE *in) {
718   UINT4 high0, high1, high2, high3, low0, low1, low2, low3,
719     flags0, flags1, flags2, flags3;
720   UINT4 highbits0, highbits1, highbits2, highbits3,
721     lowbits0, lowbits1, lowbits2, lowbits3;
722 
723   while (!feof(in)) {
724     if (FREAD_UINT(&high0,in) == 0 ||
725 	FREAD_UINT(&low0,in) == 0 ||
726 	FREAD_UINT(&flags0,in) == 0) {
727       highbits0 = -1U;
728       lowbits0 = -1U;
729       flags0 = -1U;
730     } else {
731       nt_unshuffle(&highbits0,&lowbits0,high0,low0);
732     }
733 
734     if (FREAD_UINT(&high1,in) == 0 ||
735 	FREAD_UINT(&low1,in) == 0 ||
736 	FREAD_UINT(&flags1,in) == 0) {
737       highbits1 = -1U;
738       lowbits1 = -1U;
739       flags1 = -1U;
740     } else {
741       nt_unshuffle(&highbits1,&lowbits1,high1,low1);
742     }
743 
744     if (FREAD_UINT(&high2,in) == 0 ||
745 	FREAD_UINT(&low2,in) == 0 ||
746 	FREAD_UINT(&flags2,in) == 0) {
747       highbits2 = -1U;
748       lowbits2 = -1U;
749       flags2 = -1U;
750     } else {
751       nt_unshuffle(&highbits2,&lowbits2,high2,low2);
752     }
753 
754     if (FREAD_UINT(&high3,in) == 0 ||
755 	FREAD_UINT(&low3,in) == 0 ||
756 	FREAD_UINT(&flags3,in) == 0) {
757       highbits3 = -1U;
758       lowbits3 = -1U;
759       flags3 = -1U;
760     } else {
761       nt_unshuffle(&highbits3,&lowbits3,high3,low3);
762     }
763 
764     FWRITE_UINT(highbits0,out);
765     FWRITE_UINT(highbits1,out);
766     FWRITE_UINT(highbits2,out);
767     FWRITE_UINT(highbits3,out);
768     FWRITE_UINT(lowbits0,out);
769     FWRITE_UINT(lowbits1,out);
770     FWRITE_UINT(lowbits2,out);
771     FWRITE_UINT(lowbits3,out);
772     FWRITE_UINT(flags0,out);
773     FWRITE_UINT(flags1,out);
774     FWRITE_UINT(flags2,out);
775     FWRITE_UINT(flags3,out);
776   }
777 
778   return;
779 }
780 
781 
782 /* Needed for user-provided segment in GMAP */
783 Genomecomp_T *
Compress_create_blocks_comp(char * genomicseg,Univcoord_T genomelength)784 Compress_create_blocks_comp (char *genomicseg, Univcoord_T genomelength) {
785   Genomecomp_T *genomecomp;
786   size_t nuint4;
787 
788   nuint4 = ((genomelength + 31)/32U)*3;
789   genomecomp = (Genomecomp_T *) CALLOC(nuint4+4,sizeof(Genomecomp_T));
790   /* Add 4 because Oligoindex_hr procedures point to nextlow as ptr+4 */
791 
792   /* Creates X's at end */
793   genomecomp[nuint4-3] = 0xFFFFFFFF;
794   genomecomp[nuint4-2] = 0xFFFFFFFF;
795   genomecomp[nuint4-1] = 0xFFFFFFFF;
796 
797   /* Plus extra 4 */
798   genomecomp[nuint4]   = 0xFFFFFFFF;
799   genomecomp[nuint4+1] = 0xFFFFFFFF;
800   genomecomp[nuint4+2] = 0xFFFFFFFF;
801   genomecomp[nuint4+3] = 0xFFFFFFFF;
802 
803   Compress_update_memory(/*nbadchars*/0,genomecomp,genomicseg,/*currposition*/0,genomelength);
804 
805   return genomecomp;
806 }
807 
808 
809 /* Needed for user-provided segment in GMAP */
810 Genomecomp_T *
Compress_create_blocks_bits(Genomecomp_T * genomecomp,Univcoord_T genomelength)811 Compress_create_blocks_bits (Genomecomp_T *genomecomp, Univcoord_T genomelength) {
812   Genomecomp_T *genomebits, highbits, lowbits, high, low, flags;
813   size_t nuint4, ptr;
814 
815   nuint4 = ((genomelength + 31)/32U)*3;
816   genomebits = (Genomecomp_T *) CALLOC(nuint4+4,sizeof(Genomecomp_T));
817 
818   for (ptr = 0; ptr < nuint4; ptr += 3) {
819     high = genomecomp[ptr];
820     low = genomecomp[ptr+1];
821     flags = genomecomp[ptr+2];
822 
823     nt_unshuffle(&highbits,&lowbits,high,low);
824     genomebits[ptr] = highbits;
825     genomebits[ptr+1] = lowbits;
826     genomebits[ptr+2] = flags;
827   }
828 
829   return genomebits;
830 }
831 
832 
833 #define GENOMECOMP_LENGTH 32	/* In bp */
834 
835 void
Compress_cat(FILE * out,char ** files,Univcoord_T * genomelengths,int nfiles)836 Compress_cat (FILE *out, char **files, Univcoord_T *genomelengths, int nfiles) {
837   FILE *fp;
838   char *file;
839   int current_pos = 0, bufferlen, shift;
840   UINT4 Buffer[3];
841 #ifdef HAVE_64_BIT
842   UINT8 current_highlow = 0, buffer_highlow = 0;
843 #else
844   UINT4 buffer_high = 0, buffer_low = 0;
845 #endif
846   UINT4 current_high = 0, current_low = 0, current_flags = 0, buffer_flags = 0, high, low;
847 #ifdef DEBUG1
848   int k;
849 #endif
850 
851 #ifdef DEBUG1
852   printf("nfiles: %d\n",nfiles);
853   printf("* Genomelengths");
854   for (k = 0; k < nfiles; k++) {
855     printf(" %llu",genomelengths[k]);
856   }
857   printf("\n");
858 #endif
859 
860 
861   if ((fp = fopen((file = *files++),"rb")) == NULL) {
862     fprintf(stderr,"Genomecomp file %s is not valid\n",file);
863     exit(9);
864   } else {
865     debug1(printf("*** Opening genomecomp file %s\n",file));
866   }
867 
868   while (fp != NULL) {
869 #ifdef DEBUG1
870     printf("* Genomelengths");
871     for (k = 0; k < nfiles; k++) {
872       printf(" %llu",genomelengths[k]);
873     }
874     printf("\n");
875 #endif
876 
877     FREAD_UINTS(Buffer,3,fp);
878     debug1(printf("Reading buffer: %08x %08x %08x\n",Buffer[0],Buffer[1],Buffer[2]));
879 #ifdef HAVE_64_BIT
880     buffer_highlow = (((UINT8) Buffer[0]) << 32) | ((UINT8) Buffer[1]);
881 #else
882     buffer_high = Buffer[0];
883     buffer_low = Buffer[1];
884 #endif
885     buffer_flags = Buffer[2];
886 
887     if (*genomelengths <= GENOMECOMP_LENGTH) {
888       bufferlen = *genomelengths;
889       *genomelengths = 0;
890 
891 #ifdef HAVE_64_BIT
892       buffer_highlow &= 0xFFFFFFFFFFFFFFFF >> (64 - 2*bufferlen); /* mask trailing FFFFFFFF */
893 #else
894       if (bufferlen <= 16) {
895 	buffer_high = 0;
896 	buffer_low &= 0xFFFFFFFF >> (32 - 2*bufferlen);
897       } else {
898 	buffer_high &= 0xFFFFFFFF >> (64 - 2*bufferlen);
899       }
900 #endif
901       buffer_flags &= 0xFFFFFFFF >> (32 - bufferlen); /* mask trailing FFFFFFFF */
902 
903       fclose(fp);
904       if (--nfiles == 0) {
905 	fp = (FILE *) NULL;
906       } else if ((fp = fopen((file = *files++),"rb")) == NULL) {
907 	fprintf(stderr,"Regiondb file %s is not valid\n",file);
908 	exit(9);
909       } else {
910 	debug1(printf("*** Opening regiondb file %s\n",file));
911 	genomelengths++;
912       }
913 
914     } else {
915       bufferlen = GENOMECOMP_LENGTH;
916       *genomelengths -= GENOMECOMP_LENGTH;
917     }
918 #ifdef HAVE_64_BIT
919     debug1(printf("Buffer with bufferlen %d: %016llx %08x\n",bufferlen,buffer_highlow,buffer_flags));
920     debug1(printf("Current with current_pos %d: %016llx %08x\n",current_pos,current_highlow,current_flags));
921 #else
922     debug1(printf("Buffer with bufferlen %d: %08x %08x %08x\n",bufferlen,buffer_high,buffer_low,buffer_flags));
923     debug1(printf("Current with current_pos %d: %08x %08x %08x\n",current_pos,current_high,current_low,current_flags));
924 #endif
925 
926     if (current_pos + bufferlen < GENOMECOMP_LENGTH) {
927       debug1(printf("Case 1\n"));
928       debug1(printf("Appending buffer with bufferlen %d\n",bufferlen));
929 #ifdef HAVE_64_BIT
930       current_highlow |= buffer_highlow << 2*current_pos; /* append */
931 #else
932       /* append */
933       if (current_pos == 0) {
934         current_low = buffer_low;
935         current_high = buffer_high;
936       } else if (current_pos < 16) {
937 	current_low |= buffer_low << 2*current_pos;
938 	current_high = (buffer_low >> (32 - 2*current_pos)) | (buffer_high << 2*current_pos);
939       } else if (current_pos == 16) {
940 	current_low = 0;
941 	current_high = buffer_low;
942       } else {
943 	current_high |= buffer_low << (2*current_pos - 32);
944       }
945 #endif
946       current_flags |= buffer_flags << current_pos; /* append */
947       current_pos += bufferlen;
948 
949     } else if (current_pos + bufferlen == GENOMECOMP_LENGTH) {
950       debug1(printf("Case 2\n"));
951       debug1(printf("Appending buffer with bufferlen %d\n",bufferlen));
952 #ifdef HAVE_64_BIT
953       current_highlow |= buffer_highlow << 2*current_pos; /* append */
954 #else
955       /* append */
956       if (current_pos == 0) {
957         current_low = buffer_low;
958         current_high = buffer_high;
959       } else if (current_pos < 16) {
960 	current_low |= buffer_low << 2*current_pos;
961 	current_high = (buffer_low >> (32 - 2*current_pos)) | (buffer_high << 2*current_pos);
962       } else if (current_pos == 16) {
963 	current_low = 0;
964 	current_high = buffer_low;
965       } else {
966 	current_high |= buffer_low << (2*current_pos - 32);
967       }
968 #endif
969       current_flags |= buffer_flags << current_pos; /* append */
970 
971 #ifdef HAVE_64_BIT
972       current_high = (UINT4) (current_highlow >> 32); current_low = (UINT4) (current_highlow & 0xFFFFFFFF); /* write */
973 #endif
974       FWRITE_UINT(current_high,out); FWRITE_UINT(current_low,out); FWRITE_UINT(current_flags,out); /* write */
975       debug1(printf("Writing current: %08x %08x %08x\n",current_high,current_low,current_flags));
976 
977 #ifdef HAVE_64_BIT
978       current_highlow = 0; current_flags = 0; /* clear */
979 #else
980       current_high = current_low = current_flags = 0; /* clear */
981 #endif
982 
983       current_pos = 0;
984 
985     } else {
986       debug1(printf("Case 3\n"));
987       debug1(printf("Appending buffer with truncated bufferlen %d\n",GENOMECOMP_LENGTH-current_pos));
988 #ifdef HAVE_64_BIT
989       current_highlow |= buffer_highlow << 2*current_pos; /* append */
990 #else
991       /* append */
992       if (current_pos == 0) {
993         current_low = buffer_low;
994         current_high = buffer_high;
995       } else if (current_pos < 16) {
996 	current_low |= buffer_low << 2*current_pos;
997 	current_high = (buffer_low >> (32 - 2*current_pos)) | (buffer_high << 2*current_pos);
998       } else if (current_pos == 16) {
999 	current_low = 0;
1000 	current_high = buffer_low;
1001       } else {
1002 	current_high |= buffer_low << (2*current_pos - 32);
1003       }
1004 #endif
1005       current_flags |= buffer_flags << current_pos; /* append */
1006 
1007 #ifdef HAVE_64_BIT
1008       current_high = (UINT4) (current_highlow >> 32); current_low = (UINT4) (current_highlow & 0xFFFFFFFF); /* write */
1009 #endif
1010       FWRITE_UINT(current_high,out); FWRITE_UINT(current_low,out); FWRITE_UINT(current_flags,out); /* write */
1011       debug1(printf("Writing current: %08x %08x %08x\n",current_high,current_low,current_flags));
1012 
1013       shift = GENOMECOMP_LENGTH - current_pos;
1014       debug1(printf("Saving remainder with shift %d\n",shift));
1015 #ifdef HAVE_64_BIT
1016       debug1(printf("Buffer: %016llx %08x\n",buffer_highlow,buffer_flags));
1017       current_highlow = buffer_highlow >> 2*shift; /* remainder */
1018 #else
1019       debug1(printf("Buffer: %08x %08x %08x\n",buffer_high,buffer_low,buffer_flags));
1020       /* remainder */
1021       if (shift == 0) {
1022 	current_high = buffer_high;
1023 	current_low = buffer_low;
1024       } else if (shift < 16) {
1025 	current_high = buffer_high >> 2*shift;
1026 	current_low = (buffer_low >> 2*shift) | (buffer_high << (32 - 2*shift));
1027       } else if (shift == 16) {
1028 	current_high = 0;
1029 	current_low = buffer_high;
1030       } else {
1031 	current_high = 0;
1032 	current_low = buffer_high >> (2*shift - 32);
1033       }
1034 #endif
1035       current_flags = buffer_flags >> shift; /* remainder */
1036 #ifdef HAVE_64_BIT
1037       debug1(printf("Remainder: %016llx %08x\n",current_highlow,current_flags));
1038 #else
1039       debug1(printf("Remainder: %08x %08x %08x\n",current_high,current_low,current_flags));
1040 #endif
1041 
1042       current_pos = current_pos + bufferlen - GENOMECOMP_LENGTH;
1043     }
1044 #ifdef HAVE_64_BIT
1045     debug1(printf("Now current with current_pos %d: %016llx %08x\n",current_pos,current_highlow,current_flags));
1046 #else
1047     debug1(printf("Now current with current_pos %d: %08x %08x %08x\n",current_pos,current_high,current_low,current_flags));
1048 #endif
1049   }
1050 
1051   if (current_pos > 0) {
1052     /* Introduce trailing FFFFFFFF.  Based on current_pos. */
1053     debug1(printf("Final block.  current_pos %d\n",current_pos));
1054 #ifdef HAVE_64_BIT
1055     current_highlow |= 0xFFFFFFFFFFFFFFFF << 2*current_pos; /* add trailing FFFFFFFF */
1056     current_high = (UINT4) (current_highlow >> 32); current_low = (UINT4) (current_highlow & 0xFFFFFFFF); /* write */
1057 #else
1058     /* add trailing FFFFFFFF */
1059     if (current_pos < 16) {
1060       current_high = 0xFFFFFFFF;
1061       current_low |= 0xFFFFFFFF << 2*current_pos;
1062     } else {
1063       current_high |= 0xFFFFFFFF << (2*current_pos - 32);
1064     }
1065 #endif
1066     current_flags |= 0xFFFFFFFF << current_pos; /* add trailing FFFFFFFF */
1067     FWRITE_UINT(current_high,out); FWRITE_UINT(current_low,out); FWRITE_UINT(current_flags,out); /* write */
1068     debug1(printf("Writing current: %08x %08x %08x\n",current_high,current_low,current_flags));
1069   }
1070 
1071   /* Write an extra two UINT4 */
1072   high = low = 0xFFFFFFFF;
1073   FWRITE_UINT(high,out);
1074   FWRITE_UINT(low,out);
1075 
1076   return;
1077 }
1078 
1079