1 static char rcsid[] = "$Id: sequence.c 221673 2020-02-11 16:17:58Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 # define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 # define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11 
12 #include "sequence.h"
13 
14 #include <stddef.h>
15 #include <stdlib.h>
16 #include <string.h>
17 #include <strings.h>		/* For rindex */
18 #include <ctype.h>		/* For iscntrl, isspace, and toupper */
19 
20 #ifdef HAVE_ZLIB
21 #include <zlib.h>
22 #define GZBUFFER_SIZE 131072
23 #endif
24 
25 
26 #include "assert.h"
27 #include "mem.h"
28 #include "complement.h"
29 #include "intlist.h"
30 #include "fopen.h"
31 #include "md5.h"
32 #ifdef PMAP
33 #include "dynprog.h"
34 #endif
35 
36 /* Before setting DEBUG, may want to reduce MAXSEQLEN in sequence.h */
37 #ifdef DEBUG
38 #define debug(x) x
39 #else
40 #define debug(x)
41 #endif
42 
43 /* Pointers for first half and second half */
44 #ifdef DEBUG1
45 #define debug1(x) x
46 #else
47 #define debug1(x)
48 #endif
49 
50 
51 
52 /***********************************************************************
53  *    Definitions:
54  *
55  *   TTTTTT ACGT ...... ACGT AAAAAA
56  *          <- trimlength ->
57  *   <-------- fulllength -------->
58  *          ^trimstart
59  *   ^contents
60  *
61  ************************************************************************/
62 
63 
64 #define T Sequence_T
65 struct T {
66   char *acc;			/* Accession */
67   char *restofheader;		/* Rest of header */
68   char *contents;		/* Original sequence, ends with '\0' */
69   char *contents_alloc;		/* Allocation */
70   int fulllength;		/* Full length (not including chopped sequence) */
71 
72   int trimstart;		/* Start of trim */
73   int trimend;			/* End of trim */
74 #ifdef PMAP
75   int fulllength_given;		/* Full length minus implicit stop codon at end */
76 #endif
77   int subseq_offset;		/* Used only for subsequences */
78   int skiplength;		/* Used only for sequences longer than MAXSEQLEN */
79   /* bool free_contents_p; */
80 
81 #ifndef PMAP
82   char *quality;		/* For Illumina short reads read via extended FASTA */
83   char *quality_alloc;		/* Allocation */
84 #endif
85 
86   bool firstp;			/* First end indicated by '>', second end by '<' in extended FASTA file */
87 };
88 
89 bool
Sequence_firstp(T this)90 Sequence_firstp (T this) {
91   return this->firstp;
92 }
93 
94 char *
Sequence_accession(T this)95 Sequence_accession (T this) {
96   return this->acc;
97 }
98 
99 
100 static char *
skip_whitespace(char * string)101 skip_whitespace (char *string) {
102   int c;
103 
104   while ((c = *string) != '\0' && isspace(c)) {
105     string++;
106   }
107 
108   return string;
109 }
110 
111 char *
Sequence_restofheader(T this)112 Sequence_restofheader (T this) {
113   return skip_whitespace(this->restofheader);
114 }
115 
116 char *
Sequence_restofheader_wannot(T this)117 Sequence_restofheader_wannot (T this) {
118   char *header, *p;
119 
120   p = skip_whitespace(this->restofheader);
121   if (*p == '\0') {
122     return (char *) NULL;
123   } else {
124     header = MALLOC(strlen("Annot=\"")+(strlen(p)+strlen("\"")+1)*sizeof(char));
125     sprintf(header,"Annot=\"%s\"",p);
126     return header;
127   }
128 }
129 
130 char *
Sequence_restofheader_keyvalue(T this)131 Sequence_restofheader_keyvalue (T this) {
132   char *header, *p, *q, lastc, c;
133   int bracket_level = 0;
134 
135   p = skip_whitespace(this->restofheader);
136   if (*p == '\0') {
137     return (char *) NULL;
138   } else {
139     q = header = MALLOC((strlen(p)+1)*sizeof(char));
140 
141     lastc = '\0';
142     while ((c = *p++) != '\0') {
143       if (c == '{') {
144 	if (bracket_level == 0) {
145 	  *q++ = '"';
146 	} else {
147 	  *q++ = c;
148 	}
149 	bracket_level++;
150 
151       } else if (c == '}') {
152 	bracket_level--;
153 	if (bracket_level == 0) {
154 	  *q++ = '"';
155 	} else {
156 	  *q++ = c;
157 	}
158 
159       } else if (c == ' ') {
160 	if (bracket_level > 0) {
161 	  *q++ = ' ';
162 	} else if (lastc == ' ') {
163 	  /* Skip */
164 	} else {
165 	  *q++ = ';';
166 	}
167 
168       } else {
169 	*q++ = c;
170       }
171 
172       lastc = c;
173     }
174 
175     *q = '\0';
176     return header;
177   }
178 }
179 
180 char *
Sequence_fullpointer(T this)181 Sequence_fullpointer (T this) {
182   return this->contents;
183 }
184 
185 char *
Sequence_trimpointer(T this)186 Sequence_trimpointer (T this) {
187   return &(this->contents[this->trimstart]);
188 }
189 
190 #ifndef PMAP
191 char *
Sequence_quality_string(T this)192 Sequence_quality_string (T this) {
193   return this->quality;
194 }
195 #endif
196 
197 
198 int
Sequence_ntlength(T this)199 Sequence_ntlength (T this) {
200 #ifdef PMAP
201   return 3*this->fulllength;
202 #else
203   return this->fulllength;
204 #endif
205 }
206 
207 int
Sequence_fulllength(T this)208 Sequence_fulllength (T this) {
209   return this->fulllength;
210 }
211 
212 int
Sequence_fulllength_given(T this)213 Sequence_fulllength_given (T this) {
214 #ifdef PMAP
215   return this->fulllength_given;
216 #else
217   return this->fulllength;
218 #endif
219 }
220 
221 char *
Sequence_subseq_pointer(T this,int querystart)222 Sequence_subseq_pointer (T this, int querystart) {
223   return &(this->contents[querystart]);
224 }
225 
226 int
Sequence_subseq_length(T this,int querystart)227 Sequence_subseq_length (T this, int querystart) {
228   return this->fulllength - querystart;
229 }
230 
231 
232 
233 int
Sequence_trimlength(T this)234 Sequence_trimlength (T this) {
235   return this->trimend - this->trimstart;
236 }
237 
238 void
Sequence_trim(T this,int trim_start,int trim_end)239 Sequence_trim (T this, int trim_start, int trim_end) {
240   this->trimstart = trim_start;
241   this->trimend = trim_end;
242   return;
243 }
244 
245 int
Sequence_trim_start(T this)246 Sequence_trim_start (T this) {
247   return this->trimstart;
248 }
249 
250 int
Sequence_trim_end(T this)251 Sequence_trim_end (T this) {
252   return this->trimend;
253 }
254 
255 int
Sequence_subseq_offset(T this)256 Sequence_subseq_offset (T this) {
257   return this->subseq_offset;
258 }
259 
260 int
Sequence_skiplength(T this)261 Sequence_skiplength (T this) {
262   return this->skiplength;
263 }
264 
265 void
Sequence_free(T * old)266 Sequence_free (T *old) {
267   if (*old) {
268     if ((*old)->restofheader != NULL) {
269       FREE_IN((*old)->restofheader);
270     }
271     if ((*old)->acc != NULL) {
272       FREE_IN((*old)->acc);
273     }
274 
275 #ifndef PMAP
276     if ((*old)->quality_alloc != NULL) {
277       FREE_IN((*old)->quality_alloc);
278     }
279 #endif
280 
281     FREE_IN((*old)->contents_alloc);
282 
283     FREE_IN(*old);
284   }
285   return;
286 }
287 
288 #ifdef PMAP
289 static int aa_index_table[128] =
290   { -1,
291     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
292     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
293     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
294     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
295 
296   /*     *                                  */
297     -1, 20, -1, -1, -1, -1, -1, -1, -1, -1,
298     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
299     -1, -1, -1, -1,
300 
301   /* A,  B,  C,  D,  E,  F,  G,  H,  I,  J, */
302      0, -1,  1,  2,  3,  4,  5,  6,  7, -1,
303 
304   /* K,  L,  M,  N,  O,  P,  Q,  R,  S,  T  */
305      8,  9, 10, 11, -1, 12, 13, 14, 15, 16,
306 
307   /* U,  V,  W,  X,  Y,  Z  */
308     21, 17, 18, -1, 19, -1,
309 
310     -1, -1, -1, -1, -1, -1,
311 
312   /* a,  b,  c,  d,  e,  f,  g,  h,  i,  j, */
313      0, -1,  1,  2,  3,  4,  5,  6,  7, -1,
314 
315   /* k,  l,  m,  n,  o,  p,  q,  r,  s,  t  */
316      8,  9, 10, 11, -1, 12, 13, 14, 15, 16,
317 
318   /* u,  v,  w,  x,  y,  z  */
319     21, 17, 18, -1, 19, -1,
320 
321     -1, -1, -1, -1, -1};
322 
323 
324 static char *iupac_table[21+1] =
325   {"GCN",			/* A */
326    "TGY",			/* C */
327    "GAY",			/* D */
328    "GAR",			/* E */
329    "TTY",			/* F */
330    "GGN",			/* G */
331    "CAY",			/* H */
332    "ATH",			/* I */
333    "AAR",			/* K */
334    "YTN",			/* L */
335    "ATG",			/* M */
336    "AAY",			/* N */
337    "CCN",			/* P */
338    "CAR",			/* Q */
339    "MGN",			/* R */
340    "WSN",			/* S */
341    "ACN",			/* T */
342    "GTN",			/* V */
343    "TGG",			/* W */
344    "TAY",			/* Y */
345    "TRR",			/* STOP */
346    "TGA"};			/* U */
347 
348 
349 static char aa_table[21+1] = "ACDEFGHIKLMNPQRSTVWY*U";
350 
351 char
Sequence_codon_char(char aa,int codonpos)352 Sequence_codon_char (char aa, int codonpos) {
353   int index;
354   char *codon;
355 
356   if ((index = aa_index_table[(int) aa]) < 0) {
357     return 'N';
358   } else {
359     codon = iupac_table[index];
360     return codon[codonpos];
361   }
362 }
363 
364 
365 
366 static char *
instantiate_codons(char * aasequence,int ntlength)367 instantiate_codons (char *aasequence, int ntlength) {
368   char *newntsequence;
369   int aapos, ntpos;
370   int index, frame;
371   char *codon, aa;
372 
373   newntsequence = (char *) CALLOC(ntlength+1,sizeof(char));
374 
375   for (ntpos = 0; ntpos < ntlength; ntpos++) {
376     aapos = (/*offset+*/ntpos)/3;
377 
378     aa = aasequence[aapos];
379     index = aa_index_table[(int) aa];
380     if (index < 0) {
381       newntsequence[ntpos] = 'N';
382     } else {
383       codon = iupac_table[index];
384 
385       frame = (/*offset+*/ntpos) % 3;
386       newntsequence[ntpos] = codon[frame];
387     }
388   }
389 
390 #if 0
391   printf("New sequence: %s\n",newntsequence);
392 #endif
393   return newntsequence;
394 }
395 
396 
397 T
Sequence_convert_to_nucleotides(T this)398 Sequence_convert_to_nucleotides (T this) {
399   T new = (T) MALLOC_IN(sizeof(*new));
400   int i;
401 
402   new->acc = (char *) NULL;
403   new->restofheader = (char *) NULL;
404   new->fulllength = this->fulllength*3;
405   new->fulllength_given = this->fulllength_given*3;
406   new->contents = new->contents_alloc =
407     instantiate_codons(/*aasequence*/this->contents,/*ntlength*/this->fulllength*3);
408 #if 0
409   for (i = 0; i < new->fulllength; i++) {
410     new->contents[i] = '?';
411   }
412 #endif
413 
414   new->trimstart = 0;
415   new->trimend = new->fulllength_given;
416   /* new->free_contents_p = true; */
417   new->subseq_offset = 0;
418   new->skiplength = this->skiplength;
419   new->firstp = this->firstp;
420 
421   return new;
422 }
423 #endif
424 
425 
426 #if 0
427 int
428 Sequence_count_bad (T this, int pos, int max, int direction) {
429   int nbad = 0;
430 
431   if (direction > 0) {
432     while (--max >= 0 && pos < this->fulllength) {
433       if (this->contents[pos] == 'X') {
434 	nbad++;
435       }
436       pos++;
437     }
438   } else {
439     while (--max >= 0 && pos >= 0) {
440       if (this->contents[pos] == 'X') {
441 	nbad++;
442       }
443       pos--;
444     }
445   }
446 
447   return nbad;
448 }
449 #endif
450 
451 
452 #define HEADERLEN 512
453 #define DISCARDLEN 8192
454 
455 static char Header[HEADERLEN];
456 static char Discard[DISCARDLEN];
457 
458 static char Sequence[1+MAXSEQLEN+1]; /* Used by Sequence_read_unlimited */
459 static char Sequence1[HALFLEN+1]; /* 1 at end for '\0' */
460 static char Sequence2[HALFLEN+3]; /* 1 at end for '\0' and 2 extra in cyclic part for '\n' and '\0' */
461 
462 static char *Firsthalf;
463 static char *Secondhalf;
464 /* static int Initc = '\0'; */
465 
466 
467 /* The first element of Sequence is always the null character, to mark
468    the end of the string */
469 
470 /* Skipping of dashes might still be buggy */
471 /*
472 #define DASH '-'
473 */
474 
475 
476 /* Returns '@', '>', or '<' if FASTA file, first sequence char if not */
477 int
Sequence_input_init(FILE * fp)478 Sequence_input_init (FILE *fp) {
479   int c;
480   bool okayp = false;
481 
482   Header[0] = '\0';
483   Sequence[0] = '\0';
484   Firsthalf = &(Sequence1[0]);
485   Secondhalf = &(Sequence2[0]);
486 
487   while (okayp == false && (c = fgetc(fp)) != EOF) {
488     debug(printf("Read character %c\n",c));
489     if (iscntrl(c)) {
490 #ifdef DASH
491     } else if (c == DASH) {
492 #endif
493     } else if (isspace(c)) {
494     } else {
495       okayp = true;
496     }
497   }
498 
499   debug(printf("Returning initial character %c\n",c));
500   return c;
501 }
502 
503 #ifdef HAVE_ZLIB
504 /* Returns '@', '>', or '<' if FASTA file, first sequence char if not */
505 int
Sequence_input_init_gzip(gzFile fp)506 Sequence_input_init_gzip (gzFile fp) {
507   int c;
508   bool okayp = false;
509 
510   Header[0] = '\0';
511   Sequence[0] = '\0';
512   Firsthalf = &(Sequence1[0]);
513   Secondhalf = &(Sequence2[0]);
514 
515   while (okayp == false && (c = gzgetc(fp)) != EOF) {
516     debug(printf("Read character %c\n",c));
517     if (iscntrl(c)) {
518 #ifdef DASH
519     } else if (c == DASH) {
520 #endif
521     } else if (isspace(c)) {
522     } else {
523       okayp = true;
524     }
525   }
526 
527   debug(printf("Returning initial character %c\n",c));
528   return c;
529 }
530 #endif
531 
532 #ifdef HAVE_BZLIB
533 /* Returns '@', '>', or '<' if FASTA file, first sequence char if not */
534 int
Sequence_input_init_bzip2(Bzip2_T fp)535 Sequence_input_init_bzip2 (Bzip2_T fp) {
536   int c;
537   bool okayp = false;
538 
539   Header[0] = '\0';
540   Sequence[0] = '\0';
541   Firsthalf = &(Sequence1[0]);
542   Secondhalf = &(Sequence2[0]);
543 
544   while (okayp == false && (c = bzgetc(fp)) != EOF) {
545     debug(printf("Read character %c\n",c));
546     if (iscntrl(c)) {
547 #ifdef DASH
548     } else if (c == DASH) {
549 #endif
550     } else if (isspace(c)) {
551     } else {
552       okayp = true;
553     }
554   }
555 
556   debug(printf("Returning initial character %c\n",c));
557   return c;
558 }
559 #endif
560 
561 
562 static void
blank_header(T this)563 blank_header (T this) {
564   this->acc = (char *) CALLOC_IN(strlen("NO_HEADER")+1,sizeof(char));
565   strcpy(this->acc,"NO_HEADER");
566   this->restofheader = (char *) CALLOC_IN(1,sizeof(char));
567   this->restofheader[0] = '\0';
568   return;
569 }
570 
571 static char *
input_header(FILE * fp,T this)572 input_header (FILE *fp, T this) {
573   char *p;
574   size_t length;
575 
576   if (feof(fp)) {
577     return NULL;
578   } else if (fgets(&(Header[0]),HEADERLEN,fp) == NULL) {
579     /* File must terminate after > */
580     return NULL;
581   }
582 
583   if (Header[0] == '\n') {
584     Header[0] = '\0';
585   } else if ((p = rindex(&(Header[0]),'\n')) != NULL) {
586     if (p[-1] == '\r') {
587       p--;
588     }
589     *p = '\0';
590   } else {
591     /* Eliminate rest of header from input */
592     while (fgets(&(Discard[0]),DISCARDLEN,fp) != NULL &&
593 	   rindex(&(Discard[0]),'\n') == NULL) ;
594   }
595 
596   p = &(Header[0]);
597   while (*p != '\0' && !isspace((int) *p)) {
598     p++;
599   }
600   if (*p == '\0') {
601     /* Accession only */
602     length = (p - &(Header[0]))/sizeof(char);
603     this->acc = (char *) CALLOC_IN(length+1,sizeof(char));
604     strcpy(this->acc,Header);
605     this->restofheader = (char *) CALLOC_IN(1,sizeof(char));
606     this->restofheader[0] = '\0';
607   } else {
608     *p = '\0';
609     length = (p - &(Header[0]))/sizeof(char);
610     this->acc = (char *) CALLOC_IN(length+1,sizeof(char));
611     strcpy(this->acc,Header);
612     p++;
613     this->restofheader = (char *) CALLOC_IN(strlen(p)+1,sizeof(char));
614     strcpy(this->restofheader,p);
615   }
616 
617   return this->acc;
618 }
619 
620 static bool
skip_header(FILE * fp)621 skip_header (FILE *fp) {
622 
623   if (feof(fp)) {
624     return false;
625   } else if (fgets(&(Header[0]),HEADERLEN,fp) == NULL) {
626     /* File must terminate after > */
627     return false;
628   }
629 
630   if (rindex(&(Header[0]),'\n') == NULL) {
631     /* Eliminate rest of header from input */
632     while (fgets(&(Discard[0]),DISCARDLEN,fp) != NULL &&
633 	   rindex(&(Discard[0]),'\n') == NULL) ;
634   }
635 
636   return true;
637 }
638 
639 #ifdef DEBUG
640 static void
print_contents(char * p,int length)641 print_contents (char *p, int length) {
642   int i;
643   FILE *fp = stdout;
644 
645   fprintf(fp,"\"");
646   for (i = 0; i < length; i++) {
647     if (*p == '\0') {
648       fprintf(fp,"_");
649     } else {
650       fprintf(fp,"%c",*p);
651     }
652     p++;
653   }
654   fprintf(fp,"\"\n");
655   return;
656 }
657 #endif
658 
659 
660 #define CONTROLM 13		/* From PC */
661 #define SPACE 32
662 
663 #if 0
664 static char *
665 find_bad_char (char *line) {
666   char *first, *p1, *p2;
667 #ifdef DASH
668   char *p3;
669 #endif
670 
671   /* p1 = index(line,CONTROLM); */
672   p2 = index(line,SPACE);
673   /* p3 = index(line,DASH); */
674 
675   if (/* p1 == NULL && */ p2 == NULL /* && p3 == NULL*/) {
676     return NULL;
677   } else {
678 #if 0
679     if (p1) {
680       first = p1;
681     }
682     if (p2) {
683       first = p2;
684     }
685     /*
686     if (p3) {
687       first = p3;
688     }
689     */
690     if (p1 && p1 < first) {
691       first = p1;
692     }
693     if (p2 && p2 < first) {
694       first = p2;
695     }
696     /*
697     if (p3 && p3 < first) {
698       first = p3;
699     }
700     */
701     return first;
702 #else
703     return p2;
704 #endif
705   }
706 }
707 #endif
708 
709 
710 static int
read_first_half(int * nextchar,bool * eolnp,FILE * fp,bool possible_fasta_header_p)711 read_first_half (int *nextchar, bool *eolnp, FILE *fp, bool possible_fasta_header_p) {
712   int remainder, strlenp;
713   char *ptr, *p = NULL;
714   int c;
715   bool init_char_p = true;
716 
717   ptr = &(Firsthalf[0]);
718   if (possible_fasta_header_p == false || (*nextchar != '@' && *nextchar != '>' && *nextchar != '<' && *nextchar != '+')) {
719     *ptr++ = (char) *nextchar;
720   }
721   remainder = (&(Firsthalf[HALFLEN]) - ptr)/sizeof(char);
722 
723   while (1) {
724     if (remainder <= 0) {
725       debug(printf("remainder <= 0.  Returning false\n"));
726       *nextchar = EOF;
727       debug1(printf("read_first_half returning length1 of %d\n",(ptr - &(Firsthalf[0]))/sizeof(char)));
728       return (ptr - &(Firsthalf[0]))/sizeof(char);
729 
730     } else if (feof(fp)) {
731       /* EOF in middle of line */
732       debug(printf("EOF.  Returning true\n"));
733       *nextchar = EOF;
734       debug1(printf("read_first_half returning length1 of %d\n",(ptr - &(Firsthalf[0]))/sizeof(char)));
735       return (ptr - &(Firsthalf[0]))/sizeof(char);
736 
737     } else if (*eolnp == true) {
738       /* Peek at character after eoln */
739       if ((c = fgetc(fp)) == EOF || (init_char_p == false && (c == '@' || c == '>' || c == '<' || c == '+'))) {
740 	debug(printf("c == EOF or > or < or +.   Returning true\n"));
741 	*nextchar = c;
742 	return (ptr - &(Firsthalf[0]))/sizeof(char);
743       } else if (iscntrl(c)) {
744 	debug(printf("c == control char.  Continuing\n"));
745 #ifdef DASH
746       } else if (c == DASH) {
747 	debug(printf("c == dash.  Continuing\n"));
748 #endif
749       } else if (isspace(c)) {
750 	*eolnp = true;
751 	debug(printf("c == NULL.  Continuing\n"));
752       } else {
753 	*ptr++ = (char) c;
754 	remainder--;
755 	*eolnp = false;
756 	p = NULL;
757 	debug(printf("c == sth.  Continuing\n"));
758       }
759 
760     } else {
761       debug(printf("Trying to read remainder of %d\n",remainder));
762       if (p != NULL) {
763 	strlenp = strlen(p);
764 	memmove(ptr,p,strlenp);
765         ptr[strlenp] = '\0';
766 	debug(printf("Did memmove of %d chars at %p to %p\n",strlenp,p,ptr));
767       } else {
768 	p = fgets(ptr,remainder+1,fp);
769       }
770       if (p == NULL) {
771 	debug(printf("line == NULL.  Returning true\n"));
772 	*nextchar = EOF;
773 	debug1(printf("read_first_half returning length1 of %d\n",(ptr - &(Firsthalf[0]))/sizeof(char)));
774 	return (ptr - &(Firsthalf[0]))/sizeof(char);
775       } else {
776 	debug(printf("Read %s.\n",ptr));
777 	/* Was a call to find_bad_char */
778 	while ((p = index(ptr,SPACE)) != NULL) {
779 	  ptr = p++;
780 	  strlenp = strlen(p);
781 	  memmove(ptr,p,strlenp);
782 	  ptr[strlenp] = '\0';
783 	  debug(printf("Found space.  Did memmove of %d chars at %p to %p\n",strlenp,p,ptr));
784 	}
785 	if (*ptr == '\n') {
786 	  *eolnp = true;
787 	  debug(printf("line == EOLN.  Continuing\n"));
788 	} else if ((p = index(ptr,'\n')) != NULL) {
789 	  if (p[-1] == '\r') {
790 	    p--;
791 	  }
792 	  ptr = p;
793 	  *eolnp = true;
794 	  debug(printf("line == EOLN.  Continuing\n"));
795 	} else {
796 	  ptr += strlen(ptr);
797 	  *eolnp = false;
798 	  p = NULL;
799 	  debug(printf("line != EOLN.  Continuing\n"));
800 	}
801 	remainder = (&(Firsthalf[HALFLEN]) - ptr)/sizeof(char);
802       }
803     }
804 
805     debug(print_contents(&(Firsthalf[0]),HALFLEN+1));
806     init_char_p = false;
807   }
808 }
809 
810 /* returns skip length */
811 static int
read_second_half(int * nextchar,char ** pointer2a,int * length2a,char ** pointer2b,int * length2b,bool eolnp,FILE * fp)812 read_second_half (int *nextchar, char **pointer2a, int *length2a, char **pointer2b, int *length2b,
813 		  bool eolnp, FILE *fp) {
814   int skiplength, ncycles = 0, remainder, terminator, strlenp;
815   char *ptr;
816   char *p = NULL;
817   int c;
818   bool init_char_p = true;
819 
820   ptr = &(Secondhalf[0]);
821   remainder = (&(Secondhalf[HALFLEN+2]) - ptr)/sizeof(char);
822 
823   while (1) {
824     debug(printf("\nEnd: %d\n",remainder));
825 
826     if (feof(fp)) {
827       debug(printf("EOF.  Returning\n"));
828       *nextchar = EOF;
829       break;
830 
831     } else if (remainder <= 0) {
832       ptr = &(Secondhalf[0]);
833       remainder = (&(Secondhalf[HALFLEN+2]) - ptr)/sizeof(char);
834       ncycles++;
835       debug(printf("remainder <= 0.  Cycling\n"));
836 
837     } else if (eolnp == true) {
838       /* Peek at character after eoln */
839       if ((c = fgetc(fp)) == EOF || (init_char_p == false && (c == '@' || c == '>' || c == '<' || c == '+'))) {
840 	debug(printf("c == EOF or > or < or +.  Returning\n"));
841 	*nextchar = c;
842 	break;
843       } else if (iscntrl(c)) {
844 	debug(printf("c == control char.  Continuing\n"));
845 #ifdef DASH
846       } else if (c == DASH) {
847 	debug(printf("c == dash.  Continuing\n"));
848 #endif
849       } else if (isspace(c)) {
850 	debug(printf("c == NULL.  Continuing\n"));
851       } else {
852 	*ptr++ = (char) c;
853 	remainder--;
854 	eolnp = false;
855 	p = NULL;
856 	debug(printf("c == sth.  Continuing\n"));
857       }
858 
859     } else {
860       if (p != NULL) {
861         strlenp = strlen(p);
862 	memmove(ptr,p,strlenp);
863         ptr[strlenp] = '\0';
864 	debug(printf("Did memmove of %d chars at %p to %p\n",strlenp,p,ptr));
865       } else {
866 	p = fgets(ptr,remainder+1,fp);
867       }
868       if (p == NULL) {
869 	debug(printf("line == NULL.  Returning\n"));
870 	*nextchar = EOF;
871 	break;
872       } else {
873 	debug(printf("Read %s.\n",ptr));
874 	/* Was a call to find_bad_char */
875 	while ((p = index(ptr,SPACE)) != NULL) {
876 	  ptr = p++;
877 	  strlenp = strlen(p);
878 	  memmove(ptr,p,strlenp);
879 	  ptr[strlenp] = '\0';
880 	  debug(printf("Found space.  Did memmove of %d chars at %p to %p\n",strlenp,p,ptr));
881 	}
882 	if (*ptr == '\n') {
883 	  eolnp = true;
884 	  debug(printf("line == EOLN.  Continuing\n"));
885 	} else if ((p = index(ptr,'\n')) != NULL) {
886 	  if (p[-1] == '\r') {
887 	    p--;
888 	  }
889 	  ptr = p;
890 	  eolnp = true;
891 	  debug(printf("line == EOLN.  Continuing\n"));
892 	} else {
893 	  ptr += strlen(ptr);
894 	  eolnp = false;
895 	  p = NULL;
896 	  debug(printf("line != EOLN.  Continuing\n"));
897 	}
898 	remainder = (&(Secondhalf[HALFLEN+2]) - ptr)/sizeof(char);
899       }
900     }
901 
902     debug(print_contents(&(Secondhalf[0]),HALFLEN+3));
903     init_char_p = false;
904   }
905 
906   terminator = (ptr - &(Secondhalf[0]))/sizeof(char);
907   debug(printf("ncycles = %d, terminator is %d\n",ncycles,terminator));
908   if (ncycles == 0) {
909     *length2a = 0;
910     if (terminator < HALFLEN) {
911       skiplength = 0;
912     } else {
913       skiplength = terminator-HALFLEN;
914     }
915   } else {
916     *length2a = HALFLEN-terminator;
917     skiplength = ncycles*(HALFLEN+2) + terminator-HALFLEN;
918   }
919   if (*length2a <= 0) {
920     *length2a = 0;
921     *pointer2a = (char *) NULL;
922   } else {
923     *pointer2a = &(Secondhalf[HALFLEN+2-(*length2a)]);
924   }
925   if (terminator == 0) {
926     *length2b = 0;
927     *pointer2b = (char *) NULL;
928   } else if (terminator > HALFLEN) {
929     *length2b = HALFLEN;
930     *pointer2b = &(ptr[-(*length2b)]);
931   } else {
932     *length2b = terminator;
933     *pointer2b = &(Secondhalf[0]);
934   }
935 
936   return skiplength;
937 }
938 
939 
940 /* Returns sequence length */
941 static int
input_sequence(int * nextchar,char ** pointer1,int * length1,char ** pointer2a,int * length2a,char ** pointer2b,int * length2b,int * skiplength,FILE * fp,bool possible_fasta_header_p)942 input_sequence (int *nextchar, char **pointer1, int *length1, char **pointer2a, int *length2a,
943 		char **pointer2b, int *length2b, int *skiplength, FILE *fp, bool possible_fasta_header_p) {
944   bool eolnp = true;
945 
946   *pointer1 = &(Firsthalf[0]);
947   *pointer2a = (char *) NULL;
948   *length2a = 0;
949   *pointer2b = (char *) NULL;
950   *length2b = 0;
951 
952   /* printf("Beginning input_sequence with nextchar = %c\n",*nextchar); */
953   if ((*length1 = read_first_half(&(*nextchar),&eolnp,fp,possible_fasta_header_p)) == 0) {
954     *pointer1 = (char *) NULL;
955     *skiplength = 0;
956   } else if (*length1 < HALFLEN) {
957     *skiplength = 0;
958   } else {
959     *skiplength = read_second_half(&(*nextchar),&(*pointer2a),&(*length2a),
960 				   &(*pointer2b),&(*length2b),eolnp,fp);
961     debug1(printf("read_second_half returns skiplength of %d, length2a=%d, length2b=%d\n",
962 		  *skiplength,*length2a,*length2b));
963   }
964 
965   debug1(printf("length1 = %d, length2a = %d, length2b = %d\n",
966 		*length1,*length2a,*length2b));
967 
968   return (*length1) + (*length2a) + (*length2b);
969 }
970 
971 
972 /* Used only by extern procedures (outside of this file).  Internal
973    procedures have their own specialized creators. */
974 T
Sequence_genomic_new(char * contents,int length,bool copyp)975 Sequence_genomic_new (char *contents, int length, bool copyp) {
976   T new = (T) MALLOC_IN(sizeof(*new));
977   char *copy;
978 
979   new->acc = (char *) NULL;
980   new->restofheader = (char *) NULL;
981 
982   new->trimstart = 0;
983   new->trimend = new->fulllength = length;
984 #ifdef PMAP
985   new->fulllength_given = length;
986 #endif
987 
988   if (copyp == true) {
989     copy = (char *) MALLOC_IN((length+1)*sizeof(char));
990     strncpy(copy,contents,length);
991     copy[length] = '\0';
992 
993     new->contents = copy;
994     new->contents_alloc = copy;
995   } else {
996     /* new->free_contents_p = false; */
997     new->contents = contents;
998     new->contents_alloc = contents;
999     /* new->contents_uc_alloc = (char *) NULL; -- only for GSNAP */
1000   }
1001 
1002 #ifndef PMAP
1003   new->quality = new->quality_alloc = (char *) NULL;
1004 #endif
1005 
1006   new->subseq_offset = 0;
1007   new->skiplength = 0;
1008   new->firstp = true;
1009 
1010   return new;
1011 }
1012 
1013 
1014 static char complCode[128] = COMPLEMENT_LC;
1015 
1016 static char *
make_complement(char * sequence,unsigned int length)1017 make_complement (char *sequence, unsigned int length) {
1018   char *complement;
1019   int i, j;
1020 
1021   complement = (char *) MALLOC_IN((length+1)*sizeof(char));
1022   for (i = length-1, j = 0; i >= 0; i--, j++) {
1023     complement[j] = complCode[(int) sequence[i]];
1024   }
1025   complement[length] = '\0';
1026   return complement;
1027 }
1028 
1029 static char *
make_reverse(char * sequence,unsigned int length)1030 make_reverse (char *sequence, unsigned int length) {
1031   char *reverse;
1032   int i, j;
1033 
1034   if (sequence == NULL) {
1035     return (char *) NULL;
1036   } else {
1037     reverse = (char *) MALLOC_IN((length+1)*sizeof(char));
1038     for (i = length-1, j = 0; i >= 0; i--, j++) {
1039       reverse[j] = sequence[i];
1040     }
1041     reverse[length] = '\0';
1042     return reverse;
1043   }
1044 }
1045 
1046 
1047 #if 0
1048 static char *
1049 make_complement_uppercase (char *sequence, unsigned int length) {
1050   char *complement;
1051   char uppercaseCode[128] = UPPERCASE_U2T;
1052   int i, j;
1053 
1054   complement = (char *) CALLOC_IN(length+1,sizeof(char));
1055   for (i = length-1, j = 0; i >= 0; i--, j++) {
1056     complement[j] = uppercaseCode[(int) complCode[(int) sequence[i]]];
1057   }
1058   complement[length] = '\0';
1059   return complement;
1060 }
1061 #endif
1062 
1063 
1064 #if 0
1065 static void
1066 make_complement_buffered (char *complement, char *sequence, unsigned int length) {
1067   int i, j;
1068 
1069   /* complement = (char *) CALLOC_IN(length+1,sizeof(char)); */
1070   for (i = length-1, j = 0; i >= 0; i--, j++) {
1071     complement[j] = complCode[(int) sequence[i]];
1072   }
1073   complement[length] = '\0';
1074   return;
1075 }
1076 #endif
1077 
1078 
1079 static void
make_complement_inplace(char * sequence,unsigned int length)1080 make_complement_inplace (char *sequence, unsigned int length) {
1081   char temp;
1082   unsigned int i, j;
1083 
1084   for (i = 0, j = length-1; i < length/2; i++, j--) {
1085     temp = complCode[(int) sequence[i]];
1086     sequence[i] = complCode[(int) sequence[j]];
1087     sequence[j] = temp;
1088   }
1089   if (i == j) {
1090     sequence[i] = complCode[(int) sequence[i]];
1091   }
1092 
1093   return;
1094 }
1095 
1096 
1097 /************************************************************************
1098  *  Original:
1099  *   TTTTTT ACGT ...... ACGT AAAAAA
1100  *          ^trimstart     ^trimend
1101  *   ^contents
1102  ************************************************************************
1103  *  Subsequence:
1104  *       ^start                ^end
1105  *          ^trimstart     ^trimend
1106  *       ^contents
1107  ************************************************************************/
1108 
1109 T
Sequence_subsequence(T this,int start,int end)1110 Sequence_subsequence (T this, int start, int end) {
1111   T new;
1112 
1113 #ifdef PMAP
1114   start /= 3;
1115   end /= 3;
1116 #endif
1117 
1118   if (start < 0) {
1119     start = 0;
1120   }
1121   if (end > this->fulllength) {
1122     end = this->fulllength;
1123   }
1124 
1125   if (end <= start) {
1126     return NULL;
1127   } else {
1128     new = (T) MALLOC_IN(sizeof(*new));
1129 
1130     new->acc = (char *) NULL;
1131     new->restofheader = (char *) NULL;
1132     new->contents = &(this->contents[start]);
1133 
1134     new->fulllength = end - start;
1135 #ifdef PMAP
1136     new->fulllength_given = new->fulllength;
1137 #endif
1138     if ((new->trimstart = this->trimstart - start) < 0) {
1139       new->trimstart = 0;
1140     }
1141     if ((new->trimend = this->trimend - start) > new->fulllength) {
1142       new->trimend = new->fulllength;
1143     }
1144 
1145     /* new->free_contents_p = false; */
1146     new->contents_alloc = (char *) NULL;
1147     /* new->contents_uc_alloc = (char *) NULL; -- only for GSNAP */
1148 
1149 #ifndef PMAP
1150     if (this->quality == NULL) {
1151       new->quality = (char *) NULL;
1152     } else {
1153       new->quality = &(this->quality[start]);
1154     }
1155     new->quality_alloc = (char *) NULL;
1156 #endif
1157 
1158 #ifdef PMAP
1159     new->subseq_offset = 3*start;
1160 #else
1161     new->subseq_offset = start;
1162 #endif
1163     new->skiplength = this->skiplength;
1164 
1165     new->firstp = this->firstp;
1166 
1167     return new;
1168   }
1169 }
1170 
1171 
1172 T
Sequence_revcomp(T this)1173 Sequence_revcomp (T this) {
1174   T new = (T) MALLOC_IN(sizeof(*new));
1175 
1176   new->acc = (char *) NULL;
1177   new->restofheader = (char *) NULL;
1178   new->contents = new->contents_alloc = make_complement(this->contents,this->fulllength);
1179 
1180 #ifndef PMAP
1181   new->quality = new->quality_alloc = make_reverse(this->quality,this->fulllength);
1182 #endif
1183 
1184   new->fulllength = this->fulllength;
1185 #ifdef PMAP
1186   new->fulllength_given = this->fulllength_given;
1187 #endif
1188   new->trimstart = this->trimstart;
1189   new->trimend = this->trimend;
1190   /* new->free_contents_p = true; */
1191   new->subseq_offset = 0;	/* Not sure if this is right */
1192   new->skiplength = this->skiplength;
1193   new->firstp = this->firstp;
1194   return new;
1195 }
1196 
1197 
1198 static char *
make_uppercase(char * sequence,unsigned int length)1199 make_uppercase (char *sequence, unsigned int length) {
1200   char *uppercase;
1201 #ifdef PMAP
1202   char uppercaseCode[128] = UPPERCASE_STD;
1203 #else
1204   char uppercaseCode[128] = UPPERCASE_U2T;
1205 #endif
1206   unsigned int i;
1207 
1208   uppercase = (char *) MALLOC_IN((length+1)*sizeof(char));
1209   for (i = 0; i < length; i++) {
1210     uppercase[i] = uppercaseCode[(int) sequence[i]];
1211   }
1212   uppercase[length] = '\0';
1213   return uppercase;
1214 }
1215 
1216 
1217 T
Sequence_uppercase(T this)1218 Sequence_uppercase (T this) {
1219   T new = (T) MALLOC_IN(sizeof(*new));
1220 
1221   new->acc = (char *) NULL;
1222   new->restofheader = (char *) NULL;
1223   new->contents = new->contents_alloc = make_uppercase(this->contents,this->fulllength);
1224 
1225 #ifndef PMAP
1226   if (this->quality_alloc == NULL) {
1227     new->quality = new->quality_alloc = (char *) NULL;
1228   } else {
1229     new->quality = new->quality_alloc =(char *) MALLOC_IN((this->fulllength+1)*sizeof(char));
1230     strcpy(new->quality,this->quality);
1231   }
1232 #endif
1233 
1234   new->fulllength = this->fulllength;
1235 #ifdef PMAP
1236   new->fulllength_given = this->fulllength_given;
1237 #endif
1238   new->trimstart = this->trimstart;
1239   new->trimend = this->trimend;
1240   /* new->free_contents_p = true; */
1241   new->subseq_offset = this->subseq_offset;
1242   new->skiplength = this->skiplength;
1243   new->firstp = this->firstp;
1244   return new;
1245 }
1246 
1247 
1248 T
Sequence_alias(T this)1249 Sequence_alias (T this) {
1250   T new = (T) MALLOC_IN(sizeof(*new));
1251 
1252   new->acc = (char *) NULL;
1253   new->restofheader = (char *) NULL;
1254   new->contents = this->contents;
1255 
1256 #ifndef PMAP
1257   new->quality = this->quality;
1258   new->quality_alloc = (char *) NULL;
1259 #endif
1260 
1261   new->fulllength = this->fulllength;
1262 #ifdef PMAP
1263   new->fulllength_given = this->fulllength_given;
1264 #endif
1265   new->trimstart = this->trimstart;
1266   new->trimend = this->trimend;
1267 
1268   /* new->free_contents_p = false; */
1269   new->contents_alloc = (char *) NULL;
1270   /* new->contents_uc_alloc = (char *) NULL; -- only for GSNAP */
1271 
1272 
1273   new->subseq_offset = this->subseq_offset;
1274   new->firstp = this->firstp;
1275   return new;
1276 }
1277 
1278 
1279 /*
1280 void
1281 Sequence_endstream () {
1282   Initc = '\0';
1283   return;
1284 }
1285 */
1286 
1287 
1288 T
Sequence_read(int * nextchar,FILE * input)1289 Sequence_read (int *nextchar, FILE *input) {
1290   T new;
1291   int fulllength, skiplength;
1292   char *pointer1, *pointer2a, *pointer2b;
1293   int length1, length2a, length2b;
1294 #ifdef PMAP
1295   char lastchar = '*';
1296 #else
1297   int quality_length;
1298 #endif
1299 
1300   if (feof(input)) {
1301     *nextchar = EOF;
1302     return NULL;
1303   }
1304 
1305   if (*nextchar == '\0') {
1306 #if 0 && defined(HAVE_ZLIB) && defined(HAVE_BZLIB)
1307     if (*gzipped != NULL) {
1308       *nextchar = Sequence_input_init_gzip(gzipped);
1309     } else if (*bzipped != NULL) {
1310       *nextchar = Sequence_input_init_bzip2(bzipped);
1311     } else {
1312       *nextchar = Sequence_input_init(input);
1313     }
1314 #elif 0 && defined(HAVE_ZLIB)
1315     if (*gzipped != NULL) {
1316       *nextchar = Sequence_input_init_gzip(gzipped);
1317     } else {
1318       *nextchar = Sequence_input_init(input);
1319     }
1320 #elif 0 && defined(HAVE_BZLIB)
1321     if (*bzipped != NULL) {
1322       *nextchar = Sequence_input_init_bzip2(bzipped);
1323     } else {
1324       *nextchar = Sequence_input_init(input);
1325     }
1326 #else
1327     *nextchar = Sequence_input_init(input);
1328 #endif
1329 
1330     if (*nextchar == EOF) {
1331       return NULL;
1332     }
1333   }
1334 
1335   new = (T) MALLOC_IN(sizeof(*new));
1336 
1337   if (*nextchar != '@' && *nextchar != '>' && *nextchar != '<') {
1338     new->firstp = true;		/* by default */
1339     blank_header(new);
1340   } else if (input_header(input,new) == NULL) {
1341     /* File ends after >.  Don't process. */
1342     *nextchar = EOF;
1343     FREE_IN(new);
1344     return NULL;
1345   } else if (*nextchar == '@') {
1346     new->firstp = true;		/* by default */
1347   } else if (*nextchar == '>') {
1348     new->firstp = true;
1349   } else if (*nextchar == '<') {
1350     new->firstp = false;
1351   } else {
1352     abort();
1353   }
1354 
1355   if ((fulllength = input_sequence(&(*nextchar),&pointer1,&length1,&pointer2a,&length2a,
1356 				   &pointer2b,&length2b,&skiplength,input,/*possible_fasta_header_p*/true)) == 0) {
1357     /* File ends during header.  Continue with a sequence of length 0. */
1358     /* fprintf(stderr,"File ends after header\n"); */
1359   }
1360 
1361   if (skiplength > 0) {
1362     fprintf(stderr,"Warning: cDNA sequence length of %d exceeds maximum length of %d.  Truncating %d chars in middle.\n",
1363 	    fulllength+skiplength,MAXSEQLEN,skiplength);
1364     fprintf(stderr,"  (For long sequences, perhaps you want maponly mode, by providing the '-1' flag.)\n");
1365   }
1366 
1367 #ifdef PMAP
1368   if (length1 > 0) {
1369     lastchar = pointer1[length1-1];
1370     if (length2a > 0) {
1371       lastchar = pointer2a[length2a-1];
1372     }
1373     if (length2b > 0) {
1374       lastchar = pointer2b[length2b-1];
1375     }
1376   }
1377 
1378   new->fulllength_given = fulllength;
1379   if (lastchar != '*') {
1380     debug(printf("Sequence does not end with *, so adding it\n"));
1381     fulllength++;
1382   }
1383 #endif
1384 
1385   debug(printf("fulllength = %d\n",fulllength));
1386   new->fulllength = fulllength;
1387   new->skiplength = skiplength;
1388 
1389   new->trimstart = 0;
1390 #ifdef PMAP
1391   new->trimend = new->fulllength_given;
1392 #else
1393   new->trimend = fulllength;
1394 #endif
1395 
1396   new->contents = new->contents_alloc = (char *) MALLOC_IN((fulllength+1)*sizeof(char));
1397   if (length1 > 0) {
1398     strncpy(new->contents,pointer1,length1);
1399     new->contents[length1] = '\0';
1400 
1401     if (length2a > 0) {
1402       strncpy(&(new->contents[length1]),pointer2a,length2a);
1403       new->contents[length1+length2a] = '\0';
1404 
1405     }
1406     if (length2b > 0) {
1407       strncpy(&(new->contents[length1+length2a]),pointer2b,length2b);
1408       new->contents[length1+length2a+length2b] = '\0';
1409 
1410     }
1411   }
1412 
1413 #ifdef PMAP
1414   if (lastchar != '*') {
1415     new->contents[fulllength-1] = '*';
1416   }
1417 #endif
1418   /* new->free_contents_p = true; */
1419   new->subseq_offset = 0;
1420 
1421 #ifndef PMAP
1422   /* Quality string */
1423   new->quality = new->quality_alloc = (char *) NULL;
1424   if (*nextchar == '+') {
1425     skip_header(input);
1426     *nextchar = fgetc(input);
1427     quality_length = input_sequence(&(*nextchar),&pointer1,&length1,&pointer2a,&length2a,
1428 				    &pointer2b,&length2b,&skiplength,input,/*possible_fasta_header_p*/false);
1429     if (quality_length != fulllength) {
1430       fprintf(stderr,"Length %d of quality score differs from length %d of nucleotides in sequence %s\n",
1431 	      quality_length,fulllength,new->acc);
1432       exit(9);
1433     } else {
1434       new->quality = new->quality_alloc = (char *) MALLOC_IN((fulllength+1)*sizeof(char));
1435       if (length1 > 0) {
1436 	strncpy(new->quality,pointer1,length1);
1437 	new->quality[length1] = '\0';
1438 
1439 	if (length2a > 0) {
1440 	  strncpy(&(new->quality[length1]),pointer2a,length2a);
1441 	  new->quality[length1+length2a] = '\0';
1442 	}
1443 	if (length2b > 0) {
1444 	  strncpy(&(new->quality[length1+length2a]),pointer2b,length2b);
1445 	  new->quality[length1+length2a+length2b] = '\0';
1446 	}
1447       }
1448     }
1449   }
1450 #endif
1451 
1452   debug(printf("Final query sequence is:\n"));
1453   debug(Sequence_stdout(new,/*uppercasep*/false,/*wraplength*/60,/*trimmedp*/false));
1454   return new;
1455 }
1456 
1457 
1458 T
Sequence_read_multifile(int * nextchar,FILE ** input,char * read_files_command,char *** files,int * nfiles)1459 Sequence_read_multifile (int *nextchar, FILE **input,
1460 			 char *read_files_command, char ***files, int *nfiles) {
1461   T queryseq;
1462 
1463   while (1) {
1464     if (*input == NULL || *nextchar == EOF) { /* was feof(*input) */
1465       if (*input != NULL) {
1466 	if (read_files_command != NULL) {
1467 	  pclose(*input);
1468 	} else {
1469 	  fclose(*input);
1470 	}
1471 	*input = NULL;
1472       }
1473 
1474       if (*nfiles == 0) {
1475 	*nextchar = EOF;
1476 	return NULL;
1477       } else {
1478 	while (*nfiles > 0 && (*input = Fopen_read_text(read_files_command,(*files)[0])) == NULL) {
1479 	  (*files)++;
1480 	  (*nfiles)--;
1481 	}
1482 	if (*input == NULL) {
1483 	  *nextchar = EOF;
1484 	  return NULL;
1485 	} else {
1486 	  (*files)++;
1487 	  (*nfiles)--;
1488 	  *nextchar = '\0';
1489 	}
1490       }
1491     }
1492     if ((queryseq = Sequence_read(&(*nextchar),*input)) != NULL) {
1493       return queryseq;
1494     }
1495   }
1496 }
1497 
1498 
1499 #if 0 && defined(HAVE_ZLIB)
1500 T
1501 Sequence_read_multifile_gzip (int *nextchar, gzFile *input, char ***files, int *nfiles) {
1502   T queryseq;
1503 
1504   while (1) {
1505     if (*input == NULL || *nextchar == EOF) { /* was gzeof(*input) */
1506       if (*input != NULL) {
1507 	gzclose(*input);
1508 	*input = NULL;
1509       }
1510 
1511       if (*nfiles == 0) {
1512 	*nextchar = EOF;
1513 	return NULL;
1514       } else {
1515 	while (*nfiles > 0 && (*input = gzopen((*files)[0],"rb")) == NULL) {
1516 	  (*files)++;
1517 	  (*nfiles)--;
1518 	}
1519 	if (*input == NULL) {
1520 	  *nextchar = EOF;
1521 	  return NULL;
1522 	} else {
1523 #ifdef HAVE_ZLIB_GZBUFFER
1524 	  gzbuffer(*input,GZBUFFER_SIZE);
1525 #endif
1526 	  (*files)++;
1527 	  (*nfiles)--;
1528 	  *nextchar = '\0';
1529 	}
1530       }
1531     }
1532     if ((queryseq = Sequence_read_gzip(&(*nextchar),*input)) != NULL) {
1533       return queryseq;
1534     }
1535   }
1536 }
1537 #endif
1538 
1539 
1540 #if 0 && defined(HAVE_BZLIB)
1541 T
1542 Sequence_read_multifile_bzip2 (int *nextchar, Bzip2_T *input, char ***files, int *nfiles) {
1543   T queryseq;
1544 
1545   while (1) {
1546     if (*input == NULL || *nextchar == EOF) { /* was bzeof(*input) */
1547       if (*input != NULL) {
1548 	Bzip2_free(&(*input));
1549 	*input = NULL;
1550       }
1551 
1552       if (*nfiles == 0) {
1553 	*nextchar = EOF;
1554 	return NULL;
1555       } else {
1556 	while (*nfiles > 0 && (*input = Bzip2_new((*files)[0])) == NULL) {
1557 	  (*files)++;
1558 	  (*nfiles)--;
1559 	}
1560 	if (*input == NULL) {
1561 	  *nextchar = EOF;
1562 	  return NULL;
1563 	} else {
1564 	  (*files)++;
1565 	  (*nfiles)--;
1566 	  *nextchar = '\0';
1567 	}
1568       }
1569     }
1570     if ((queryseq = Sequence_read_bzip2(&(*nextchar),*input)) != NULL) {
1571       return queryseq;
1572     }
1573   }
1574 }
1575 #endif
1576 
1577 
1578 /* This is intended for user genomicseg, which will start with
1579    standard FASTA ">" */
1580 T
Sequence_read_unlimited(int * nextchar,FILE * input)1581 Sequence_read_unlimited (int *nextchar, FILE *input) {
1582   T new;
1583   Intlist_T intlist = NULL;
1584   char *p;
1585   int length, startpos = 1, maxseqlen = MAXSEQLEN;
1586   bool eolnp;
1587 
1588   if (feof(input)) {
1589     *nextchar = EOF;
1590     return NULL;
1591   }
1592 
1593   if (*nextchar == '\0') {
1594     if ((*nextchar = Sequence_input_init(input)) == EOF) {
1595       *nextchar = EOF;
1596       return NULL;
1597     }
1598   }
1599 
1600   new = (T) MALLOC_IN(sizeof(*new));
1601 
1602   if (*nextchar != '>' /* && *nextchar != '<' */) {
1603     new->firstp = true;		/* by default */
1604     blank_header(new);
1605     Sequence[startpos++] = (char) *nextchar;
1606     maxseqlen--;
1607   } else if (input_header(input,new) == NULL) {
1608     /* File ends after >.  Don't process. */
1609     FREE_IN(new);
1610     return NULL;
1611   } else if (*nextchar == '>') {
1612     new->firstp = true;
1613 #if 0
1614   } else if (*nextchar == '<') {
1615     new->firstp = false;
1616 #endif
1617   } else {
1618     abort();
1619   }
1620 
1621   /* Don't touch Sequence[0], because subsequent calls to
1622      Sequence_read depend on it being '\0'. */
1623   eolnp = true;
1624   while (fgets(&(Sequence[startpos]),maxseqlen,input) != NULL &&
1625 	 (eolnp == false || (Sequence[1] != '>' /* && Sequence[1] != '<' */))) {
1626     for (p = &(Sequence[1]); *p != '\r' && *p != '\n' && *p != '\0'; p++) {
1627       if (!iscntrl((int) *p)
1628 #ifdef DASH
1629 	  && *p != DASH
1630 #endif
1631 	  ) {
1632 	intlist = Intlist_push(intlist,(int) *p);
1633       }
1634     }
1635     if (*p == '\r' || *p == '\n') {
1636       eolnp = true;
1637     } else {
1638       eolnp = false;
1639     }
1640     startpos = 1;
1641     maxseqlen = MAXSEQLEN;
1642   }
1643   intlist = Intlist_reverse(intlist);
1644   new->contents = new->contents_alloc = Intlist_to_char_array_in(&length,intlist);
1645 
1646   Intlist_free(&intlist);
1647 
1648   if (length == 0) {
1649     return NULL;
1650   } else {
1651     new->fulllength = new->trimend = length;
1652 #ifdef PMAP
1653     new->fulllength_given = length;
1654 #endif
1655     new->trimstart = 0;
1656 
1657 #ifndef PMAP
1658     /* user genomic segment should not have quality */
1659     new->quality = new->quality_alloc = (char *) NULL;
1660 #endif
1661 
1662     /* new->free_contents_p = true; */
1663     new->subseq_offset = 0;
1664     new->skiplength = 0;
1665 
1666     /* Important to initialize for subsequent cDNA reads */
1667     *nextchar = '\0';
1668 
1669     return new;
1670   }
1671 }
1672 
1673 
1674 void
Sequence_print_digest(Filestring_T fp,T this)1675 Sequence_print_digest (Filestring_T fp, T this) {
1676   unsigned char *digest;
1677 
1678   digest = MD5_compute((unsigned char *) this->contents,this->fulllength);
1679   MD5_print(fp,digest);
1680   FREE(digest);
1681   return;
1682 }
1683 
1684 /* Calling procedure needs to print the initial ">", if desired */
1685 void
Sequence_print_header(Filestring_T fp,T this,bool checksump)1686 Sequence_print_header (Filestring_T fp, T this, bool checksump) {
1687 
1688   if (this->acc == NULL) {
1689     FPRINTF(fp,"NO_HEADER");
1690   } else {
1691     if (this->restofheader == NULL || this->restofheader[0] == '\0') {
1692       FPRINTF(fp,"%s",this->acc);
1693     } else {
1694       FPRINTF(fp,"%s %s",this->acc,this->restofheader);
1695     }
1696 
1697     if (checksump == true) {
1698       FPRINTF(fp," md5:");
1699       Sequence_print_digest(fp,this);
1700     }
1701   }
1702 
1703   FPRINTF(fp,"\n");
1704 
1705   return;
1706 }
1707 
1708 void
Sequence_stdout_header(T this)1709 Sequence_stdout_header (T this) {
1710   if (this->acc == NULL) {
1711     printf("NO_HEADER");
1712   } else {
1713     if (this->restofheader == NULL || this->restofheader[0] == '\0') {
1714       printf("%s",this->acc);
1715     } else {
1716       printf("%s %s",this->acc,this->restofheader);
1717     }
1718   }
1719 
1720   printf("\n");
1721 
1722   return;
1723 }
1724 
1725 
1726 #if 0
1727 /* Used by revcomp.c */
1728 void
1729 Sequence_stdout_header_revcomp (T this) {
1730   if (this->restofheader == NULL || this->restofheader[0] == '\0') {
1731     printf(">%s",this->acc);
1732   } else {
1733     printf(">%s %s",this->acc,this->restofheader);
1734   }
1735   printf(" REVCOMP");
1736   printf("\n");
1737   return;
1738 }
1739 #endif
1740 
1741 
1742 void
Sequence_print(Filestring_T fp,T this,bool uppercasep,int wraplength,bool trimmedp)1743 Sequence_print (Filestring_T fp, T this, bool uppercasep, int wraplength, bool trimmedp) {
1744   int i = 0, pos, start, end;
1745   char uppercaseCode[128] = UPPERCASE_STD;
1746 
1747   if (trimmedp == true) {
1748     start = this->trimstart;
1749     end = this->trimend;
1750   } else {
1751     start = 0;
1752     end = this->fulllength;
1753   }
1754 
1755   if (uppercasep == true) {
1756     for (pos = start; pos < end; pos++, i++) {
1757       PUTC(uppercaseCode[(int) this->contents[i]],fp);
1758       if ((i+1) % wraplength == 0) {
1759 	PUTC('\n',fp);
1760       }
1761     }
1762   } else {
1763     for (pos = start; pos < end; pos++, i++) {
1764       PUTC(this->contents[i],fp);
1765       if ((i+1) % wraplength == 0) {
1766 	PUTC('\n',fp);
1767       }
1768     }
1769   }
1770   if (i % wraplength != 0) {
1771     PUTC('\n',fp);
1772   }
1773 
1774   return;
1775 }
1776 
1777 
1778 void
Sequence_stdout(T this,bool uppercasep,int wraplength,bool trimmedp)1779 Sequence_stdout (T this, bool uppercasep, int wraplength, bool trimmedp) {
1780   int i = 0, pos, start, end;
1781   char uppercaseCode[128] = UPPERCASE_STD;
1782 
1783   if (trimmedp == true) {
1784     start = this->trimstart;
1785     end = this->trimend;
1786   } else {
1787     start = 0;
1788     end = this->fulllength;
1789   }
1790 
1791   if (uppercasep == true) {
1792     for (pos = start; pos < end; pos++, i++) {
1793       putchar(uppercaseCode[(int) this->contents[i]]);
1794       if ((i+1) % wraplength == 0) {
1795 	putchar('\n');
1796       }
1797     }
1798   } else {
1799     for (pos = start; pos < end; pos++, i++) {
1800       putchar(this->contents[i]);
1801       if ((i+1) % wraplength == 0) {
1802 	putchar('\n');
1803       }
1804     }
1805   }
1806   if (i % wraplength != 0) {
1807     putchar('\n');
1808   }
1809 
1810   return;
1811 }
1812 
1813 
1814 void
Sequence_stdout_alt(T ref,T alt,T snp,bool uppercasep,int wraplength)1815 Sequence_stdout_alt (T ref, T alt, T snp, bool uppercasep, int wraplength) {
1816   int i = 0, pos, start, end;
1817   char uppercaseCode[128] = UPPERCASE_STD;
1818 
1819   start = 0;
1820   end = alt->fulllength;
1821 
1822   if (uppercasep == true) {
1823     for (pos = start; pos < end; pos++, i++) {
1824       if (snp->contents[i] == ' ') {
1825 	/* Not a SNP, so print reference */
1826 	printf("%c",uppercaseCode[(int) ref->contents[i]]);
1827       } else if (uppercaseCode[(int) alt->contents[i]] == uppercaseCode[(int) ref->contents[i]]) {
1828 	/* Wildcard SNP */
1829 	printf("N");
1830       } else {
1831 	printf("%c",uppercaseCode[(int) alt->contents[i]]);
1832       }
1833       if ((i+1) % wraplength == 0) {
1834 	printf("\n");
1835       }
1836     }
1837   } else {
1838     for (pos = start; pos < end; pos++, i++) {
1839       if (snp->contents[i] == ' ') {
1840 	/* Not a SNP, so print reference */
1841 	printf("%c",ref->contents[i]);
1842       } else if (uppercaseCode[(int) alt->contents[i]] == uppercaseCode[(int) ref->contents[i]]) {
1843 	/* Wildcard SNP */
1844 	printf("N");
1845       } else {
1846 	printf("%c",alt->contents[i]);
1847       }
1848       if ((i+1) % wraplength == 0) {
1849 	printf("\n");
1850       }
1851     }
1852   }
1853   if (i % wraplength != 0) {
1854     printf("\n");
1855   }
1856   return;
1857 }
1858 
1859 
1860 void
Sequence_stdout_two(T ref,T alt,bool uppercasep,int wraplength)1861 Sequence_stdout_two (T ref, T alt, bool uppercasep, int wraplength) {
1862   int i = 0, pos, pos2, startpos, end;
1863   char uppercaseCode[128] = UPPERCASE_STD;
1864 
1865   end = ref->fulllength;
1866 
1867   pos = 0;
1868   i = 0;
1869   if (uppercasep == true) {
1870     printf("ref\t");
1871     startpos = pos;
1872     while (pos < end) {
1873       printf("%c",uppercaseCode[(int) ref->contents[pos]]);
1874       if (++i % wraplength == 0) {
1875 	printf("\n");
1876 	printf("alt\t");
1877 	for (pos2 = startpos, i = 0; i < wraplength; pos2++, i++) {
1878 	  if (uppercaseCode[(int) alt->contents[pos2]] == uppercaseCode[(int) ref->contents[pos2]]) {
1879 	    /* Wildcard SNP */
1880 	    printf("N");
1881 	  } else {
1882 	    printf("%c",uppercaseCode[(int) alt->contents[pos2]]);
1883 	  }
1884 	}
1885 	printf("\n\n");
1886 	if (pos+1 < end) {
1887 	  printf("ref\t");
1888 	}
1889 	startpos = pos+1;
1890       }
1891       pos++;
1892     }
1893     if (i % wraplength != 0) {
1894       printf("\n");
1895       printf("alt\t");
1896       for (pos2 = startpos; pos2 < end; pos2++) {
1897 	if (uppercaseCode[(int) alt->contents[pos2]] == uppercaseCode[(int) ref->contents[pos2]]) {
1898 	  /* Wildcard SNP */
1899 	  printf("N");
1900 	} else {
1901 	  printf("%c",uppercaseCode[(int) alt->contents[pos2]]);
1902 	}
1903       }
1904       printf("\n\n");
1905     }
1906 
1907   } else {
1908     printf("ref\t");
1909     startpos = pos;
1910     while (pos < end) {
1911       printf("%c",ref->contents[pos]);
1912       if (++i % wraplength == 0) {
1913 	printf("\n");
1914 	printf("alt\t");
1915 	for (pos2 = startpos, i = 0; i < wraplength; pos2++, i++) {
1916 	  if (uppercaseCode[(int) alt->contents[pos2]] == uppercaseCode[(int) ref->contents[pos2]]) {
1917 	    /* Wildcard SNP */
1918 	    printf("N");
1919 	  } else {
1920 	    printf("%c",alt->contents[pos2]);
1921 	  }
1922 	}
1923 	printf("\n\n");
1924 	if (pos+1 < end) {
1925 	  printf("ref\t");
1926 	}
1927 	startpos = pos+1;
1928       }
1929       pos++;
1930     }
1931     if (i % wraplength != 0) {
1932       printf("\n");
1933       printf("alt\t");
1934       for (pos2 = startpos; pos2 < end; pos2++) {
1935 	if (uppercaseCode[(int) alt->contents[pos2]] == uppercaseCode[(int) ref->contents[pos2]]) {
1936 	  /* Wildcard SNP */
1937 	  printf("N");
1938 	} else {
1939 	  printf("%c",alt->contents[pos2]);
1940 	}
1941       }
1942       printf("\n\n");
1943     }
1944   }
1945 
1946   return;
1947 }
1948 
1949 
1950 void
Sequence_stdout_raw(T this)1951 Sequence_stdout_raw (T this) {
1952   int i = 0, pos, start, end;
1953 
1954   start = 0;
1955   end = this->fulllength;
1956 
1957   for (pos = start; pos < end; pos++, i++) {
1958     printf("%d\n",(int) this->contents[i]);
1959   }
1960   return;
1961 }
1962 
1963 void
Sequence_stdout_stream_chars(T this)1964 Sequence_stdout_stream_chars (T this) {
1965   int i = 0, pos, start, end;
1966 
1967   start = 0;
1968   end = this->fulllength;
1969 
1970   for (pos = start; pos < end; pos++, i++) {
1971     switch (toupper(this->contents[i])) {
1972     case 'A': putchar('A'); break;
1973     case 'C': putchar('C'); break;
1974     case 'G': putchar('G'); break;
1975     case 'T': putchar('T'); break;
1976     default: putchar('X');
1977     }
1978   }
1979   return;
1980 }
1981 
1982 void
Sequence_stdout_stream_ints(T this)1983 Sequence_stdout_stream_ints (T this) {
1984   int i = 0, pos, start, end;
1985 
1986   start = 0;
1987   end = this->fulllength;
1988 
1989   for (pos = start; pos < end; pos++, i++) {
1990     switch (toupper(this->contents[i])) {
1991     case 'A': putchar(0); break;
1992     case 'C': putchar(1); break;
1993     case 'G': putchar(2); break;
1994     case 'T': putchar(3); break;
1995     default: putchar(4);
1996     }
1997   }
1998   return;
1999 }
2000 
2001 
2002 T
Sequence_substring(T usersegment,unsigned int left,unsigned int length,bool revcomp)2003 Sequence_substring (T usersegment, unsigned int left, unsigned int length,
2004 		    bool revcomp) {
2005   char *gbuffer;
2006 
2007   gbuffer = (char *) CALLOC(length+1,sizeof(char));
2008 
2009   memcpy(gbuffer,&(usersegment->contents[left]),length*sizeof(char));
2010   gbuffer[length] = '\0';
2011 
2012   if (revcomp == true) {
2013     /* make_complement_buffered(gbuffer2,gbuffer1,length); */
2014     make_complement_inplace(gbuffer,length);
2015     debug(fprintf(stderr,"Got sequence at %u with length %u, revcomp\n",left,length));
2016     return Sequence_genomic_new(gbuffer,length,/*copyp*/false);
2017   } else {
2018     debug(fprintf(stderr,"Got sequence at %u with length %u, forward\n",left,length));
2019     return Sequence_genomic_new(gbuffer,length,/*copyp*/false);
2020   }
2021 }
2022 
2023 
2024