1 static char rcsid[] = "$Id: sequence.c 221673 2020-02-11 16:17:58Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 # define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 # define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11
12 #include "sequence.h"
13
14 #include <stddef.h>
15 #include <stdlib.h>
16 #include <string.h>
17 #include <strings.h> /* For rindex */
18 #include <ctype.h> /* For iscntrl, isspace, and toupper */
19
20 #ifdef HAVE_ZLIB
21 #include <zlib.h>
22 #define GZBUFFER_SIZE 131072
23 #endif
24
25
26 #include "assert.h"
27 #include "mem.h"
28 #include "complement.h"
29 #include "intlist.h"
30 #include "fopen.h"
31 #include "md5.h"
32 #ifdef PMAP
33 #include "dynprog.h"
34 #endif
35
36 /* Before setting DEBUG, may want to reduce MAXSEQLEN in sequence.h */
37 #ifdef DEBUG
38 #define debug(x) x
39 #else
40 #define debug(x)
41 #endif
42
43 /* Pointers for first half and second half */
44 #ifdef DEBUG1
45 #define debug1(x) x
46 #else
47 #define debug1(x)
48 #endif
49
50
51
52 /***********************************************************************
53 * Definitions:
54 *
55 * TTTTTT ACGT ...... ACGT AAAAAA
56 * <- trimlength ->
57 * <-------- fulllength -------->
58 * ^trimstart
59 * ^contents
60 *
61 ************************************************************************/
62
63
64 #define T Sequence_T
65 struct T {
66 char *acc; /* Accession */
67 char *restofheader; /* Rest of header */
68 char *contents; /* Original sequence, ends with '\0' */
69 char *contents_alloc; /* Allocation */
70 int fulllength; /* Full length (not including chopped sequence) */
71
72 int trimstart; /* Start of trim */
73 int trimend; /* End of trim */
74 #ifdef PMAP
75 int fulllength_given; /* Full length minus implicit stop codon at end */
76 #endif
77 int subseq_offset; /* Used only for subsequences */
78 int skiplength; /* Used only for sequences longer than MAXSEQLEN */
79 /* bool free_contents_p; */
80
81 #ifndef PMAP
82 char *quality; /* For Illumina short reads read via extended FASTA */
83 char *quality_alloc; /* Allocation */
84 #endif
85
86 bool firstp; /* First end indicated by '>', second end by '<' in extended FASTA file */
87 };
88
89 bool
Sequence_firstp(T this)90 Sequence_firstp (T this) {
91 return this->firstp;
92 }
93
94 char *
Sequence_accession(T this)95 Sequence_accession (T this) {
96 return this->acc;
97 }
98
99
100 static char *
skip_whitespace(char * string)101 skip_whitespace (char *string) {
102 int c;
103
104 while ((c = *string) != '\0' && isspace(c)) {
105 string++;
106 }
107
108 return string;
109 }
110
111 char *
Sequence_restofheader(T this)112 Sequence_restofheader (T this) {
113 return skip_whitespace(this->restofheader);
114 }
115
116 char *
Sequence_restofheader_wannot(T this)117 Sequence_restofheader_wannot (T this) {
118 char *header, *p;
119
120 p = skip_whitespace(this->restofheader);
121 if (*p == '\0') {
122 return (char *) NULL;
123 } else {
124 header = MALLOC(strlen("Annot=\"")+(strlen(p)+strlen("\"")+1)*sizeof(char));
125 sprintf(header,"Annot=\"%s\"",p);
126 return header;
127 }
128 }
129
130 char *
Sequence_restofheader_keyvalue(T this)131 Sequence_restofheader_keyvalue (T this) {
132 char *header, *p, *q, lastc, c;
133 int bracket_level = 0;
134
135 p = skip_whitespace(this->restofheader);
136 if (*p == '\0') {
137 return (char *) NULL;
138 } else {
139 q = header = MALLOC((strlen(p)+1)*sizeof(char));
140
141 lastc = '\0';
142 while ((c = *p++) != '\0') {
143 if (c == '{') {
144 if (bracket_level == 0) {
145 *q++ = '"';
146 } else {
147 *q++ = c;
148 }
149 bracket_level++;
150
151 } else if (c == '}') {
152 bracket_level--;
153 if (bracket_level == 0) {
154 *q++ = '"';
155 } else {
156 *q++ = c;
157 }
158
159 } else if (c == ' ') {
160 if (bracket_level > 0) {
161 *q++ = ' ';
162 } else if (lastc == ' ') {
163 /* Skip */
164 } else {
165 *q++ = ';';
166 }
167
168 } else {
169 *q++ = c;
170 }
171
172 lastc = c;
173 }
174
175 *q = '\0';
176 return header;
177 }
178 }
179
180 char *
Sequence_fullpointer(T this)181 Sequence_fullpointer (T this) {
182 return this->contents;
183 }
184
185 char *
Sequence_trimpointer(T this)186 Sequence_trimpointer (T this) {
187 return &(this->contents[this->trimstart]);
188 }
189
190 #ifndef PMAP
191 char *
Sequence_quality_string(T this)192 Sequence_quality_string (T this) {
193 return this->quality;
194 }
195 #endif
196
197
198 int
Sequence_ntlength(T this)199 Sequence_ntlength (T this) {
200 #ifdef PMAP
201 return 3*this->fulllength;
202 #else
203 return this->fulllength;
204 #endif
205 }
206
207 int
Sequence_fulllength(T this)208 Sequence_fulllength (T this) {
209 return this->fulllength;
210 }
211
212 int
Sequence_fulllength_given(T this)213 Sequence_fulllength_given (T this) {
214 #ifdef PMAP
215 return this->fulllength_given;
216 #else
217 return this->fulllength;
218 #endif
219 }
220
221 char *
Sequence_subseq_pointer(T this,int querystart)222 Sequence_subseq_pointer (T this, int querystart) {
223 return &(this->contents[querystart]);
224 }
225
226 int
Sequence_subseq_length(T this,int querystart)227 Sequence_subseq_length (T this, int querystart) {
228 return this->fulllength - querystart;
229 }
230
231
232
233 int
Sequence_trimlength(T this)234 Sequence_trimlength (T this) {
235 return this->trimend - this->trimstart;
236 }
237
238 void
Sequence_trim(T this,int trim_start,int trim_end)239 Sequence_trim (T this, int trim_start, int trim_end) {
240 this->trimstart = trim_start;
241 this->trimend = trim_end;
242 return;
243 }
244
245 int
Sequence_trim_start(T this)246 Sequence_trim_start (T this) {
247 return this->trimstart;
248 }
249
250 int
Sequence_trim_end(T this)251 Sequence_trim_end (T this) {
252 return this->trimend;
253 }
254
255 int
Sequence_subseq_offset(T this)256 Sequence_subseq_offset (T this) {
257 return this->subseq_offset;
258 }
259
260 int
Sequence_skiplength(T this)261 Sequence_skiplength (T this) {
262 return this->skiplength;
263 }
264
265 void
Sequence_free(T * old)266 Sequence_free (T *old) {
267 if (*old) {
268 if ((*old)->restofheader != NULL) {
269 FREE_IN((*old)->restofheader);
270 }
271 if ((*old)->acc != NULL) {
272 FREE_IN((*old)->acc);
273 }
274
275 #ifndef PMAP
276 if ((*old)->quality_alloc != NULL) {
277 FREE_IN((*old)->quality_alloc);
278 }
279 #endif
280
281 FREE_IN((*old)->contents_alloc);
282
283 FREE_IN(*old);
284 }
285 return;
286 }
287
288 #ifdef PMAP
289 static int aa_index_table[128] =
290 { -1,
291 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
292 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
293 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
294 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
295
296 /* * */
297 -1, 20, -1, -1, -1, -1, -1, -1, -1, -1,
298 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
299 -1, -1, -1, -1,
300
301 /* A, B, C, D, E, F, G, H, I, J, */
302 0, -1, 1, 2, 3, 4, 5, 6, 7, -1,
303
304 /* K, L, M, N, O, P, Q, R, S, T */
305 8, 9, 10, 11, -1, 12, 13, 14, 15, 16,
306
307 /* U, V, W, X, Y, Z */
308 21, 17, 18, -1, 19, -1,
309
310 -1, -1, -1, -1, -1, -1,
311
312 /* a, b, c, d, e, f, g, h, i, j, */
313 0, -1, 1, 2, 3, 4, 5, 6, 7, -1,
314
315 /* k, l, m, n, o, p, q, r, s, t */
316 8, 9, 10, 11, -1, 12, 13, 14, 15, 16,
317
318 /* u, v, w, x, y, z */
319 21, 17, 18, -1, 19, -1,
320
321 -1, -1, -1, -1, -1};
322
323
324 static char *iupac_table[21+1] =
325 {"GCN", /* A */
326 "TGY", /* C */
327 "GAY", /* D */
328 "GAR", /* E */
329 "TTY", /* F */
330 "GGN", /* G */
331 "CAY", /* H */
332 "ATH", /* I */
333 "AAR", /* K */
334 "YTN", /* L */
335 "ATG", /* M */
336 "AAY", /* N */
337 "CCN", /* P */
338 "CAR", /* Q */
339 "MGN", /* R */
340 "WSN", /* S */
341 "ACN", /* T */
342 "GTN", /* V */
343 "TGG", /* W */
344 "TAY", /* Y */
345 "TRR", /* STOP */
346 "TGA"}; /* U */
347
348
349 static char aa_table[21+1] = "ACDEFGHIKLMNPQRSTVWY*U";
350
351 char
Sequence_codon_char(char aa,int codonpos)352 Sequence_codon_char (char aa, int codonpos) {
353 int index;
354 char *codon;
355
356 if ((index = aa_index_table[(int) aa]) < 0) {
357 return 'N';
358 } else {
359 codon = iupac_table[index];
360 return codon[codonpos];
361 }
362 }
363
364
365
366 static char *
instantiate_codons(char * aasequence,int ntlength)367 instantiate_codons (char *aasequence, int ntlength) {
368 char *newntsequence;
369 int aapos, ntpos;
370 int index, frame;
371 char *codon, aa;
372
373 newntsequence = (char *) CALLOC(ntlength+1,sizeof(char));
374
375 for (ntpos = 0; ntpos < ntlength; ntpos++) {
376 aapos = (/*offset+*/ntpos)/3;
377
378 aa = aasequence[aapos];
379 index = aa_index_table[(int) aa];
380 if (index < 0) {
381 newntsequence[ntpos] = 'N';
382 } else {
383 codon = iupac_table[index];
384
385 frame = (/*offset+*/ntpos) % 3;
386 newntsequence[ntpos] = codon[frame];
387 }
388 }
389
390 #if 0
391 printf("New sequence: %s\n",newntsequence);
392 #endif
393 return newntsequence;
394 }
395
396
397 T
Sequence_convert_to_nucleotides(T this)398 Sequence_convert_to_nucleotides (T this) {
399 T new = (T) MALLOC_IN(sizeof(*new));
400 int i;
401
402 new->acc = (char *) NULL;
403 new->restofheader = (char *) NULL;
404 new->fulllength = this->fulllength*3;
405 new->fulllength_given = this->fulllength_given*3;
406 new->contents = new->contents_alloc =
407 instantiate_codons(/*aasequence*/this->contents,/*ntlength*/this->fulllength*3);
408 #if 0
409 for (i = 0; i < new->fulllength; i++) {
410 new->contents[i] = '?';
411 }
412 #endif
413
414 new->trimstart = 0;
415 new->trimend = new->fulllength_given;
416 /* new->free_contents_p = true; */
417 new->subseq_offset = 0;
418 new->skiplength = this->skiplength;
419 new->firstp = this->firstp;
420
421 return new;
422 }
423 #endif
424
425
426 #if 0
427 int
428 Sequence_count_bad (T this, int pos, int max, int direction) {
429 int nbad = 0;
430
431 if (direction > 0) {
432 while (--max >= 0 && pos < this->fulllength) {
433 if (this->contents[pos] == 'X') {
434 nbad++;
435 }
436 pos++;
437 }
438 } else {
439 while (--max >= 0 && pos >= 0) {
440 if (this->contents[pos] == 'X') {
441 nbad++;
442 }
443 pos--;
444 }
445 }
446
447 return nbad;
448 }
449 #endif
450
451
452 #define HEADERLEN 512
453 #define DISCARDLEN 8192
454
455 static char Header[HEADERLEN];
456 static char Discard[DISCARDLEN];
457
458 static char Sequence[1+MAXSEQLEN+1]; /* Used by Sequence_read_unlimited */
459 static char Sequence1[HALFLEN+1]; /* 1 at end for '\0' */
460 static char Sequence2[HALFLEN+3]; /* 1 at end for '\0' and 2 extra in cyclic part for '\n' and '\0' */
461
462 static char *Firsthalf;
463 static char *Secondhalf;
464 /* static int Initc = '\0'; */
465
466
467 /* The first element of Sequence is always the null character, to mark
468 the end of the string */
469
470 /* Skipping of dashes might still be buggy */
471 /*
472 #define DASH '-'
473 */
474
475
476 /* Returns '@', '>', or '<' if FASTA file, first sequence char if not */
477 int
Sequence_input_init(FILE * fp)478 Sequence_input_init (FILE *fp) {
479 int c;
480 bool okayp = false;
481
482 Header[0] = '\0';
483 Sequence[0] = '\0';
484 Firsthalf = &(Sequence1[0]);
485 Secondhalf = &(Sequence2[0]);
486
487 while (okayp == false && (c = fgetc(fp)) != EOF) {
488 debug(printf("Read character %c\n",c));
489 if (iscntrl(c)) {
490 #ifdef DASH
491 } else if (c == DASH) {
492 #endif
493 } else if (isspace(c)) {
494 } else {
495 okayp = true;
496 }
497 }
498
499 debug(printf("Returning initial character %c\n",c));
500 return c;
501 }
502
503 #ifdef HAVE_ZLIB
504 /* Returns '@', '>', or '<' if FASTA file, first sequence char if not */
505 int
Sequence_input_init_gzip(gzFile fp)506 Sequence_input_init_gzip (gzFile fp) {
507 int c;
508 bool okayp = false;
509
510 Header[0] = '\0';
511 Sequence[0] = '\0';
512 Firsthalf = &(Sequence1[0]);
513 Secondhalf = &(Sequence2[0]);
514
515 while (okayp == false && (c = gzgetc(fp)) != EOF) {
516 debug(printf("Read character %c\n",c));
517 if (iscntrl(c)) {
518 #ifdef DASH
519 } else if (c == DASH) {
520 #endif
521 } else if (isspace(c)) {
522 } else {
523 okayp = true;
524 }
525 }
526
527 debug(printf("Returning initial character %c\n",c));
528 return c;
529 }
530 #endif
531
532 #ifdef HAVE_BZLIB
533 /* Returns '@', '>', or '<' if FASTA file, first sequence char if not */
534 int
Sequence_input_init_bzip2(Bzip2_T fp)535 Sequence_input_init_bzip2 (Bzip2_T fp) {
536 int c;
537 bool okayp = false;
538
539 Header[0] = '\0';
540 Sequence[0] = '\0';
541 Firsthalf = &(Sequence1[0]);
542 Secondhalf = &(Sequence2[0]);
543
544 while (okayp == false && (c = bzgetc(fp)) != EOF) {
545 debug(printf("Read character %c\n",c));
546 if (iscntrl(c)) {
547 #ifdef DASH
548 } else if (c == DASH) {
549 #endif
550 } else if (isspace(c)) {
551 } else {
552 okayp = true;
553 }
554 }
555
556 debug(printf("Returning initial character %c\n",c));
557 return c;
558 }
559 #endif
560
561
562 static void
blank_header(T this)563 blank_header (T this) {
564 this->acc = (char *) CALLOC_IN(strlen("NO_HEADER")+1,sizeof(char));
565 strcpy(this->acc,"NO_HEADER");
566 this->restofheader = (char *) CALLOC_IN(1,sizeof(char));
567 this->restofheader[0] = '\0';
568 return;
569 }
570
571 static char *
input_header(FILE * fp,T this)572 input_header (FILE *fp, T this) {
573 char *p;
574 size_t length;
575
576 if (feof(fp)) {
577 return NULL;
578 } else if (fgets(&(Header[0]),HEADERLEN,fp) == NULL) {
579 /* File must terminate after > */
580 return NULL;
581 }
582
583 if (Header[0] == '\n') {
584 Header[0] = '\0';
585 } else if ((p = rindex(&(Header[0]),'\n')) != NULL) {
586 if (p[-1] == '\r') {
587 p--;
588 }
589 *p = '\0';
590 } else {
591 /* Eliminate rest of header from input */
592 while (fgets(&(Discard[0]),DISCARDLEN,fp) != NULL &&
593 rindex(&(Discard[0]),'\n') == NULL) ;
594 }
595
596 p = &(Header[0]);
597 while (*p != '\0' && !isspace((int) *p)) {
598 p++;
599 }
600 if (*p == '\0') {
601 /* Accession only */
602 length = (p - &(Header[0]))/sizeof(char);
603 this->acc = (char *) CALLOC_IN(length+1,sizeof(char));
604 strcpy(this->acc,Header);
605 this->restofheader = (char *) CALLOC_IN(1,sizeof(char));
606 this->restofheader[0] = '\0';
607 } else {
608 *p = '\0';
609 length = (p - &(Header[0]))/sizeof(char);
610 this->acc = (char *) CALLOC_IN(length+1,sizeof(char));
611 strcpy(this->acc,Header);
612 p++;
613 this->restofheader = (char *) CALLOC_IN(strlen(p)+1,sizeof(char));
614 strcpy(this->restofheader,p);
615 }
616
617 return this->acc;
618 }
619
620 static bool
skip_header(FILE * fp)621 skip_header (FILE *fp) {
622
623 if (feof(fp)) {
624 return false;
625 } else if (fgets(&(Header[0]),HEADERLEN,fp) == NULL) {
626 /* File must terminate after > */
627 return false;
628 }
629
630 if (rindex(&(Header[0]),'\n') == NULL) {
631 /* Eliminate rest of header from input */
632 while (fgets(&(Discard[0]),DISCARDLEN,fp) != NULL &&
633 rindex(&(Discard[0]),'\n') == NULL) ;
634 }
635
636 return true;
637 }
638
639 #ifdef DEBUG
640 static void
print_contents(char * p,int length)641 print_contents (char *p, int length) {
642 int i;
643 FILE *fp = stdout;
644
645 fprintf(fp,"\"");
646 for (i = 0; i < length; i++) {
647 if (*p == '\0') {
648 fprintf(fp,"_");
649 } else {
650 fprintf(fp,"%c",*p);
651 }
652 p++;
653 }
654 fprintf(fp,"\"\n");
655 return;
656 }
657 #endif
658
659
660 #define CONTROLM 13 /* From PC */
661 #define SPACE 32
662
663 #if 0
664 static char *
665 find_bad_char (char *line) {
666 char *first, *p1, *p2;
667 #ifdef DASH
668 char *p3;
669 #endif
670
671 /* p1 = index(line,CONTROLM); */
672 p2 = index(line,SPACE);
673 /* p3 = index(line,DASH); */
674
675 if (/* p1 == NULL && */ p2 == NULL /* && p3 == NULL*/) {
676 return NULL;
677 } else {
678 #if 0
679 if (p1) {
680 first = p1;
681 }
682 if (p2) {
683 first = p2;
684 }
685 /*
686 if (p3) {
687 first = p3;
688 }
689 */
690 if (p1 && p1 < first) {
691 first = p1;
692 }
693 if (p2 && p2 < first) {
694 first = p2;
695 }
696 /*
697 if (p3 && p3 < first) {
698 first = p3;
699 }
700 */
701 return first;
702 #else
703 return p2;
704 #endif
705 }
706 }
707 #endif
708
709
710 static int
read_first_half(int * nextchar,bool * eolnp,FILE * fp,bool possible_fasta_header_p)711 read_first_half (int *nextchar, bool *eolnp, FILE *fp, bool possible_fasta_header_p) {
712 int remainder, strlenp;
713 char *ptr, *p = NULL;
714 int c;
715 bool init_char_p = true;
716
717 ptr = &(Firsthalf[0]);
718 if (possible_fasta_header_p == false || (*nextchar != '@' && *nextchar != '>' && *nextchar != '<' && *nextchar != '+')) {
719 *ptr++ = (char) *nextchar;
720 }
721 remainder = (&(Firsthalf[HALFLEN]) - ptr)/sizeof(char);
722
723 while (1) {
724 if (remainder <= 0) {
725 debug(printf("remainder <= 0. Returning false\n"));
726 *nextchar = EOF;
727 debug1(printf("read_first_half returning length1 of %d\n",(ptr - &(Firsthalf[0]))/sizeof(char)));
728 return (ptr - &(Firsthalf[0]))/sizeof(char);
729
730 } else if (feof(fp)) {
731 /* EOF in middle of line */
732 debug(printf("EOF. Returning true\n"));
733 *nextchar = EOF;
734 debug1(printf("read_first_half returning length1 of %d\n",(ptr - &(Firsthalf[0]))/sizeof(char)));
735 return (ptr - &(Firsthalf[0]))/sizeof(char);
736
737 } else if (*eolnp == true) {
738 /* Peek at character after eoln */
739 if ((c = fgetc(fp)) == EOF || (init_char_p == false && (c == '@' || c == '>' || c == '<' || c == '+'))) {
740 debug(printf("c == EOF or > or < or +. Returning true\n"));
741 *nextchar = c;
742 return (ptr - &(Firsthalf[0]))/sizeof(char);
743 } else if (iscntrl(c)) {
744 debug(printf("c == control char. Continuing\n"));
745 #ifdef DASH
746 } else if (c == DASH) {
747 debug(printf("c == dash. Continuing\n"));
748 #endif
749 } else if (isspace(c)) {
750 *eolnp = true;
751 debug(printf("c == NULL. Continuing\n"));
752 } else {
753 *ptr++ = (char) c;
754 remainder--;
755 *eolnp = false;
756 p = NULL;
757 debug(printf("c == sth. Continuing\n"));
758 }
759
760 } else {
761 debug(printf("Trying to read remainder of %d\n",remainder));
762 if (p != NULL) {
763 strlenp = strlen(p);
764 memmove(ptr,p,strlenp);
765 ptr[strlenp] = '\0';
766 debug(printf("Did memmove of %d chars at %p to %p\n",strlenp,p,ptr));
767 } else {
768 p = fgets(ptr,remainder+1,fp);
769 }
770 if (p == NULL) {
771 debug(printf("line == NULL. Returning true\n"));
772 *nextchar = EOF;
773 debug1(printf("read_first_half returning length1 of %d\n",(ptr - &(Firsthalf[0]))/sizeof(char)));
774 return (ptr - &(Firsthalf[0]))/sizeof(char);
775 } else {
776 debug(printf("Read %s.\n",ptr));
777 /* Was a call to find_bad_char */
778 while ((p = index(ptr,SPACE)) != NULL) {
779 ptr = p++;
780 strlenp = strlen(p);
781 memmove(ptr,p,strlenp);
782 ptr[strlenp] = '\0';
783 debug(printf("Found space. Did memmove of %d chars at %p to %p\n",strlenp,p,ptr));
784 }
785 if (*ptr == '\n') {
786 *eolnp = true;
787 debug(printf("line == EOLN. Continuing\n"));
788 } else if ((p = index(ptr,'\n')) != NULL) {
789 if (p[-1] == '\r') {
790 p--;
791 }
792 ptr = p;
793 *eolnp = true;
794 debug(printf("line == EOLN. Continuing\n"));
795 } else {
796 ptr += strlen(ptr);
797 *eolnp = false;
798 p = NULL;
799 debug(printf("line != EOLN. Continuing\n"));
800 }
801 remainder = (&(Firsthalf[HALFLEN]) - ptr)/sizeof(char);
802 }
803 }
804
805 debug(print_contents(&(Firsthalf[0]),HALFLEN+1));
806 init_char_p = false;
807 }
808 }
809
810 /* returns skip length */
811 static int
read_second_half(int * nextchar,char ** pointer2a,int * length2a,char ** pointer2b,int * length2b,bool eolnp,FILE * fp)812 read_second_half (int *nextchar, char **pointer2a, int *length2a, char **pointer2b, int *length2b,
813 bool eolnp, FILE *fp) {
814 int skiplength, ncycles = 0, remainder, terminator, strlenp;
815 char *ptr;
816 char *p = NULL;
817 int c;
818 bool init_char_p = true;
819
820 ptr = &(Secondhalf[0]);
821 remainder = (&(Secondhalf[HALFLEN+2]) - ptr)/sizeof(char);
822
823 while (1) {
824 debug(printf("\nEnd: %d\n",remainder));
825
826 if (feof(fp)) {
827 debug(printf("EOF. Returning\n"));
828 *nextchar = EOF;
829 break;
830
831 } else if (remainder <= 0) {
832 ptr = &(Secondhalf[0]);
833 remainder = (&(Secondhalf[HALFLEN+2]) - ptr)/sizeof(char);
834 ncycles++;
835 debug(printf("remainder <= 0. Cycling\n"));
836
837 } else if (eolnp == true) {
838 /* Peek at character after eoln */
839 if ((c = fgetc(fp)) == EOF || (init_char_p == false && (c == '@' || c == '>' || c == '<' || c == '+'))) {
840 debug(printf("c == EOF or > or < or +. Returning\n"));
841 *nextchar = c;
842 break;
843 } else if (iscntrl(c)) {
844 debug(printf("c == control char. Continuing\n"));
845 #ifdef DASH
846 } else if (c == DASH) {
847 debug(printf("c == dash. Continuing\n"));
848 #endif
849 } else if (isspace(c)) {
850 debug(printf("c == NULL. Continuing\n"));
851 } else {
852 *ptr++ = (char) c;
853 remainder--;
854 eolnp = false;
855 p = NULL;
856 debug(printf("c == sth. Continuing\n"));
857 }
858
859 } else {
860 if (p != NULL) {
861 strlenp = strlen(p);
862 memmove(ptr,p,strlenp);
863 ptr[strlenp] = '\0';
864 debug(printf("Did memmove of %d chars at %p to %p\n",strlenp,p,ptr));
865 } else {
866 p = fgets(ptr,remainder+1,fp);
867 }
868 if (p == NULL) {
869 debug(printf("line == NULL. Returning\n"));
870 *nextchar = EOF;
871 break;
872 } else {
873 debug(printf("Read %s.\n",ptr));
874 /* Was a call to find_bad_char */
875 while ((p = index(ptr,SPACE)) != NULL) {
876 ptr = p++;
877 strlenp = strlen(p);
878 memmove(ptr,p,strlenp);
879 ptr[strlenp] = '\0';
880 debug(printf("Found space. Did memmove of %d chars at %p to %p\n",strlenp,p,ptr));
881 }
882 if (*ptr == '\n') {
883 eolnp = true;
884 debug(printf("line == EOLN. Continuing\n"));
885 } else if ((p = index(ptr,'\n')) != NULL) {
886 if (p[-1] == '\r') {
887 p--;
888 }
889 ptr = p;
890 eolnp = true;
891 debug(printf("line == EOLN. Continuing\n"));
892 } else {
893 ptr += strlen(ptr);
894 eolnp = false;
895 p = NULL;
896 debug(printf("line != EOLN. Continuing\n"));
897 }
898 remainder = (&(Secondhalf[HALFLEN+2]) - ptr)/sizeof(char);
899 }
900 }
901
902 debug(print_contents(&(Secondhalf[0]),HALFLEN+3));
903 init_char_p = false;
904 }
905
906 terminator = (ptr - &(Secondhalf[0]))/sizeof(char);
907 debug(printf("ncycles = %d, terminator is %d\n",ncycles,terminator));
908 if (ncycles == 0) {
909 *length2a = 0;
910 if (terminator < HALFLEN) {
911 skiplength = 0;
912 } else {
913 skiplength = terminator-HALFLEN;
914 }
915 } else {
916 *length2a = HALFLEN-terminator;
917 skiplength = ncycles*(HALFLEN+2) + terminator-HALFLEN;
918 }
919 if (*length2a <= 0) {
920 *length2a = 0;
921 *pointer2a = (char *) NULL;
922 } else {
923 *pointer2a = &(Secondhalf[HALFLEN+2-(*length2a)]);
924 }
925 if (terminator == 0) {
926 *length2b = 0;
927 *pointer2b = (char *) NULL;
928 } else if (terminator > HALFLEN) {
929 *length2b = HALFLEN;
930 *pointer2b = &(ptr[-(*length2b)]);
931 } else {
932 *length2b = terminator;
933 *pointer2b = &(Secondhalf[0]);
934 }
935
936 return skiplength;
937 }
938
939
940 /* Returns sequence length */
941 static int
input_sequence(int * nextchar,char ** pointer1,int * length1,char ** pointer2a,int * length2a,char ** pointer2b,int * length2b,int * skiplength,FILE * fp,bool possible_fasta_header_p)942 input_sequence (int *nextchar, char **pointer1, int *length1, char **pointer2a, int *length2a,
943 char **pointer2b, int *length2b, int *skiplength, FILE *fp, bool possible_fasta_header_p) {
944 bool eolnp = true;
945
946 *pointer1 = &(Firsthalf[0]);
947 *pointer2a = (char *) NULL;
948 *length2a = 0;
949 *pointer2b = (char *) NULL;
950 *length2b = 0;
951
952 /* printf("Beginning input_sequence with nextchar = %c\n",*nextchar); */
953 if ((*length1 = read_first_half(&(*nextchar),&eolnp,fp,possible_fasta_header_p)) == 0) {
954 *pointer1 = (char *) NULL;
955 *skiplength = 0;
956 } else if (*length1 < HALFLEN) {
957 *skiplength = 0;
958 } else {
959 *skiplength = read_second_half(&(*nextchar),&(*pointer2a),&(*length2a),
960 &(*pointer2b),&(*length2b),eolnp,fp);
961 debug1(printf("read_second_half returns skiplength of %d, length2a=%d, length2b=%d\n",
962 *skiplength,*length2a,*length2b));
963 }
964
965 debug1(printf("length1 = %d, length2a = %d, length2b = %d\n",
966 *length1,*length2a,*length2b));
967
968 return (*length1) + (*length2a) + (*length2b);
969 }
970
971
972 /* Used only by extern procedures (outside of this file). Internal
973 procedures have their own specialized creators. */
974 T
Sequence_genomic_new(char * contents,int length,bool copyp)975 Sequence_genomic_new (char *contents, int length, bool copyp) {
976 T new = (T) MALLOC_IN(sizeof(*new));
977 char *copy;
978
979 new->acc = (char *) NULL;
980 new->restofheader = (char *) NULL;
981
982 new->trimstart = 0;
983 new->trimend = new->fulllength = length;
984 #ifdef PMAP
985 new->fulllength_given = length;
986 #endif
987
988 if (copyp == true) {
989 copy = (char *) MALLOC_IN((length+1)*sizeof(char));
990 strncpy(copy,contents,length);
991 copy[length] = '\0';
992
993 new->contents = copy;
994 new->contents_alloc = copy;
995 } else {
996 /* new->free_contents_p = false; */
997 new->contents = contents;
998 new->contents_alloc = contents;
999 /* new->contents_uc_alloc = (char *) NULL; -- only for GSNAP */
1000 }
1001
1002 #ifndef PMAP
1003 new->quality = new->quality_alloc = (char *) NULL;
1004 #endif
1005
1006 new->subseq_offset = 0;
1007 new->skiplength = 0;
1008 new->firstp = true;
1009
1010 return new;
1011 }
1012
1013
1014 static char complCode[128] = COMPLEMENT_LC;
1015
1016 static char *
make_complement(char * sequence,unsigned int length)1017 make_complement (char *sequence, unsigned int length) {
1018 char *complement;
1019 int i, j;
1020
1021 complement = (char *) MALLOC_IN((length+1)*sizeof(char));
1022 for (i = length-1, j = 0; i >= 0; i--, j++) {
1023 complement[j] = complCode[(int) sequence[i]];
1024 }
1025 complement[length] = '\0';
1026 return complement;
1027 }
1028
1029 static char *
make_reverse(char * sequence,unsigned int length)1030 make_reverse (char *sequence, unsigned int length) {
1031 char *reverse;
1032 int i, j;
1033
1034 if (sequence == NULL) {
1035 return (char *) NULL;
1036 } else {
1037 reverse = (char *) MALLOC_IN((length+1)*sizeof(char));
1038 for (i = length-1, j = 0; i >= 0; i--, j++) {
1039 reverse[j] = sequence[i];
1040 }
1041 reverse[length] = '\0';
1042 return reverse;
1043 }
1044 }
1045
1046
1047 #if 0
1048 static char *
1049 make_complement_uppercase (char *sequence, unsigned int length) {
1050 char *complement;
1051 char uppercaseCode[128] = UPPERCASE_U2T;
1052 int i, j;
1053
1054 complement = (char *) CALLOC_IN(length+1,sizeof(char));
1055 for (i = length-1, j = 0; i >= 0; i--, j++) {
1056 complement[j] = uppercaseCode[(int) complCode[(int) sequence[i]]];
1057 }
1058 complement[length] = '\0';
1059 return complement;
1060 }
1061 #endif
1062
1063
1064 #if 0
1065 static void
1066 make_complement_buffered (char *complement, char *sequence, unsigned int length) {
1067 int i, j;
1068
1069 /* complement = (char *) CALLOC_IN(length+1,sizeof(char)); */
1070 for (i = length-1, j = 0; i >= 0; i--, j++) {
1071 complement[j] = complCode[(int) sequence[i]];
1072 }
1073 complement[length] = '\0';
1074 return;
1075 }
1076 #endif
1077
1078
1079 static void
make_complement_inplace(char * sequence,unsigned int length)1080 make_complement_inplace (char *sequence, unsigned int length) {
1081 char temp;
1082 unsigned int i, j;
1083
1084 for (i = 0, j = length-1; i < length/2; i++, j--) {
1085 temp = complCode[(int) sequence[i]];
1086 sequence[i] = complCode[(int) sequence[j]];
1087 sequence[j] = temp;
1088 }
1089 if (i == j) {
1090 sequence[i] = complCode[(int) sequence[i]];
1091 }
1092
1093 return;
1094 }
1095
1096
1097 /************************************************************************
1098 * Original:
1099 * TTTTTT ACGT ...... ACGT AAAAAA
1100 * ^trimstart ^trimend
1101 * ^contents
1102 ************************************************************************
1103 * Subsequence:
1104 * ^start ^end
1105 * ^trimstart ^trimend
1106 * ^contents
1107 ************************************************************************/
1108
1109 T
Sequence_subsequence(T this,int start,int end)1110 Sequence_subsequence (T this, int start, int end) {
1111 T new;
1112
1113 #ifdef PMAP
1114 start /= 3;
1115 end /= 3;
1116 #endif
1117
1118 if (start < 0) {
1119 start = 0;
1120 }
1121 if (end > this->fulllength) {
1122 end = this->fulllength;
1123 }
1124
1125 if (end <= start) {
1126 return NULL;
1127 } else {
1128 new = (T) MALLOC_IN(sizeof(*new));
1129
1130 new->acc = (char *) NULL;
1131 new->restofheader = (char *) NULL;
1132 new->contents = &(this->contents[start]);
1133
1134 new->fulllength = end - start;
1135 #ifdef PMAP
1136 new->fulllength_given = new->fulllength;
1137 #endif
1138 if ((new->trimstart = this->trimstart - start) < 0) {
1139 new->trimstart = 0;
1140 }
1141 if ((new->trimend = this->trimend - start) > new->fulllength) {
1142 new->trimend = new->fulllength;
1143 }
1144
1145 /* new->free_contents_p = false; */
1146 new->contents_alloc = (char *) NULL;
1147 /* new->contents_uc_alloc = (char *) NULL; -- only for GSNAP */
1148
1149 #ifndef PMAP
1150 if (this->quality == NULL) {
1151 new->quality = (char *) NULL;
1152 } else {
1153 new->quality = &(this->quality[start]);
1154 }
1155 new->quality_alloc = (char *) NULL;
1156 #endif
1157
1158 #ifdef PMAP
1159 new->subseq_offset = 3*start;
1160 #else
1161 new->subseq_offset = start;
1162 #endif
1163 new->skiplength = this->skiplength;
1164
1165 new->firstp = this->firstp;
1166
1167 return new;
1168 }
1169 }
1170
1171
1172 T
Sequence_revcomp(T this)1173 Sequence_revcomp (T this) {
1174 T new = (T) MALLOC_IN(sizeof(*new));
1175
1176 new->acc = (char *) NULL;
1177 new->restofheader = (char *) NULL;
1178 new->contents = new->contents_alloc = make_complement(this->contents,this->fulllength);
1179
1180 #ifndef PMAP
1181 new->quality = new->quality_alloc = make_reverse(this->quality,this->fulllength);
1182 #endif
1183
1184 new->fulllength = this->fulllength;
1185 #ifdef PMAP
1186 new->fulllength_given = this->fulllength_given;
1187 #endif
1188 new->trimstart = this->trimstart;
1189 new->trimend = this->trimend;
1190 /* new->free_contents_p = true; */
1191 new->subseq_offset = 0; /* Not sure if this is right */
1192 new->skiplength = this->skiplength;
1193 new->firstp = this->firstp;
1194 return new;
1195 }
1196
1197
1198 static char *
make_uppercase(char * sequence,unsigned int length)1199 make_uppercase (char *sequence, unsigned int length) {
1200 char *uppercase;
1201 #ifdef PMAP
1202 char uppercaseCode[128] = UPPERCASE_STD;
1203 #else
1204 char uppercaseCode[128] = UPPERCASE_U2T;
1205 #endif
1206 unsigned int i;
1207
1208 uppercase = (char *) MALLOC_IN((length+1)*sizeof(char));
1209 for (i = 0; i < length; i++) {
1210 uppercase[i] = uppercaseCode[(int) sequence[i]];
1211 }
1212 uppercase[length] = '\0';
1213 return uppercase;
1214 }
1215
1216
1217 T
Sequence_uppercase(T this)1218 Sequence_uppercase (T this) {
1219 T new = (T) MALLOC_IN(sizeof(*new));
1220
1221 new->acc = (char *) NULL;
1222 new->restofheader = (char *) NULL;
1223 new->contents = new->contents_alloc = make_uppercase(this->contents,this->fulllength);
1224
1225 #ifndef PMAP
1226 if (this->quality_alloc == NULL) {
1227 new->quality = new->quality_alloc = (char *) NULL;
1228 } else {
1229 new->quality = new->quality_alloc =(char *) MALLOC_IN((this->fulllength+1)*sizeof(char));
1230 strcpy(new->quality,this->quality);
1231 }
1232 #endif
1233
1234 new->fulllength = this->fulllength;
1235 #ifdef PMAP
1236 new->fulllength_given = this->fulllength_given;
1237 #endif
1238 new->trimstart = this->trimstart;
1239 new->trimend = this->trimend;
1240 /* new->free_contents_p = true; */
1241 new->subseq_offset = this->subseq_offset;
1242 new->skiplength = this->skiplength;
1243 new->firstp = this->firstp;
1244 return new;
1245 }
1246
1247
1248 T
Sequence_alias(T this)1249 Sequence_alias (T this) {
1250 T new = (T) MALLOC_IN(sizeof(*new));
1251
1252 new->acc = (char *) NULL;
1253 new->restofheader = (char *) NULL;
1254 new->contents = this->contents;
1255
1256 #ifndef PMAP
1257 new->quality = this->quality;
1258 new->quality_alloc = (char *) NULL;
1259 #endif
1260
1261 new->fulllength = this->fulllength;
1262 #ifdef PMAP
1263 new->fulllength_given = this->fulllength_given;
1264 #endif
1265 new->trimstart = this->trimstart;
1266 new->trimend = this->trimend;
1267
1268 /* new->free_contents_p = false; */
1269 new->contents_alloc = (char *) NULL;
1270 /* new->contents_uc_alloc = (char *) NULL; -- only for GSNAP */
1271
1272
1273 new->subseq_offset = this->subseq_offset;
1274 new->firstp = this->firstp;
1275 return new;
1276 }
1277
1278
1279 /*
1280 void
1281 Sequence_endstream () {
1282 Initc = '\0';
1283 return;
1284 }
1285 */
1286
1287
1288 T
Sequence_read(int * nextchar,FILE * input)1289 Sequence_read (int *nextchar, FILE *input) {
1290 T new;
1291 int fulllength, skiplength;
1292 char *pointer1, *pointer2a, *pointer2b;
1293 int length1, length2a, length2b;
1294 #ifdef PMAP
1295 char lastchar = '*';
1296 #else
1297 int quality_length;
1298 #endif
1299
1300 if (feof(input)) {
1301 *nextchar = EOF;
1302 return NULL;
1303 }
1304
1305 if (*nextchar == '\0') {
1306 #if 0 && defined(HAVE_ZLIB) && defined(HAVE_BZLIB)
1307 if (*gzipped != NULL) {
1308 *nextchar = Sequence_input_init_gzip(gzipped);
1309 } else if (*bzipped != NULL) {
1310 *nextchar = Sequence_input_init_bzip2(bzipped);
1311 } else {
1312 *nextchar = Sequence_input_init(input);
1313 }
1314 #elif 0 && defined(HAVE_ZLIB)
1315 if (*gzipped != NULL) {
1316 *nextchar = Sequence_input_init_gzip(gzipped);
1317 } else {
1318 *nextchar = Sequence_input_init(input);
1319 }
1320 #elif 0 && defined(HAVE_BZLIB)
1321 if (*bzipped != NULL) {
1322 *nextchar = Sequence_input_init_bzip2(bzipped);
1323 } else {
1324 *nextchar = Sequence_input_init(input);
1325 }
1326 #else
1327 *nextchar = Sequence_input_init(input);
1328 #endif
1329
1330 if (*nextchar == EOF) {
1331 return NULL;
1332 }
1333 }
1334
1335 new = (T) MALLOC_IN(sizeof(*new));
1336
1337 if (*nextchar != '@' && *nextchar != '>' && *nextchar != '<') {
1338 new->firstp = true; /* by default */
1339 blank_header(new);
1340 } else if (input_header(input,new) == NULL) {
1341 /* File ends after >. Don't process. */
1342 *nextchar = EOF;
1343 FREE_IN(new);
1344 return NULL;
1345 } else if (*nextchar == '@') {
1346 new->firstp = true; /* by default */
1347 } else if (*nextchar == '>') {
1348 new->firstp = true;
1349 } else if (*nextchar == '<') {
1350 new->firstp = false;
1351 } else {
1352 abort();
1353 }
1354
1355 if ((fulllength = input_sequence(&(*nextchar),&pointer1,&length1,&pointer2a,&length2a,
1356 &pointer2b,&length2b,&skiplength,input,/*possible_fasta_header_p*/true)) == 0) {
1357 /* File ends during header. Continue with a sequence of length 0. */
1358 /* fprintf(stderr,"File ends after header\n"); */
1359 }
1360
1361 if (skiplength > 0) {
1362 fprintf(stderr,"Warning: cDNA sequence length of %d exceeds maximum length of %d. Truncating %d chars in middle.\n",
1363 fulllength+skiplength,MAXSEQLEN,skiplength);
1364 fprintf(stderr," (For long sequences, perhaps you want maponly mode, by providing the '-1' flag.)\n");
1365 }
1366
1367 #ifdef PMAP
1368 if (length1 > 0) {
1369 lastchar = pointer1[length1-1];
1370 if (length2a > 0) {
1371 lastchar = pointer2a[length2a-1];
1372 }
1373 if (length2b > 0) {
1374 lastchar = pointer2b[length2b-1];
1375 }
1376 }
1377
1378 new->fulllength_given = fulllength;
1379 if (lastchar != '*') {
1380 debug(printf("Sequence does not end with *, so adding it\n"));
1381 fulllength++;
1382 }
1383 #endif
1384
1385 debug(printf("fulllength = %d\n",fulllength));
1386 new->fulllength = fulllength;
1387 new->skiplength = skiplength;
1388
1389 new->trimstart = 0;
1390 #ifdef PMAP
1391 new->trimend = new->fulllength_given;
1392 #else
1393 new->trimend = fulllength;
1394 #endif
1395
1396 new->contents = new->contents_alloc = (char *) MALLOC_IN((fulllength+1)*sizeof(char));
1397 if (length1 > 0) {
1398 strncpy(new->contents,pointer1,length1);
1399 new->contents[length1] = '\0';
1400
1401 if (length2a > 0) {
1402 strncpy(&(new->contents[length1]),pointer2a,length2a);
1403 new->contents[length1+length2a] = '\0';
1404
1405 }
1406 if (length2b > 0) {
1407 strncpy(&(new->contents[length1+length2a]),pointer2b,length2b);
1408 new->contents[length1+length2a+length2b] = '\0';
1409
1410 }
1411 }
1412
1413 #ifdef PMAP
1414 if (lastchar != '*') {
1415 new->contents[fulllength-1] = '*';
1416 }
1417 #endif
1418 /* new->free_contents_p = true; */
1419 new->subseq_offset = 0;
1420
1421 #ifndef PMAP
1422 /* Quality string */
1423 new->quality = new->quality_alloc = (char *) NULL;
1424 if (*nextchar == '+') {
1425 skip_header(input);
1426 *nextchar = fgetc(input);
1427 quality_length = input_sequence(&(*nextchar),&pointer1,&length1,&pointer2a,&length2a,
1428 &pointer2b,&length2b,&skiplength,input,/*possible_fasta_header_p*/false);
1429 if (quality_length != fulllength) {
1430 fprintf(stderr,"Length %d of quality score differs from length %d of nucleotides in sequence %s\n",
1431 quality_length,fulllength,new->acc);
1432 exit(9);
1433 } else {
1434 new->quality = new->quality_alloc = (char *) MALLOC_IN((fulllength+1)*sizeof(char));
1435 if (length1 > 0) {
1436 strncpy(new->quality,pointer1,length1);
1437 new->quality[length1] = '\0';
1438
1439 if (length2a > 0) {
1440 strncpy(&(new->quality[length1]),pointer2a,length2a);
1441 new->quality[length1+length2a] = '\0';
1442 }
1443 if (length2b > 0) {
1444 strncpy(&(new->quality[length1+length2a]),pointer2b,length2b);
1445 new->quality[length1+length2a+length2b] = '\0';
1446 }
1447 }
1448 }
1449 }
1450 #endif
1451
1452 debug(printf("Final query sequence is:\n"));
1453 debug(Sequence_stdout(new,/*uppercasep*/false,/*wraplength*/60,/*trimmedp*/false));
1454 return new;
1455 }
1456
1457
1458 T
Sequence_read_multifile(int * nextchar,FILE ** input,char * read_files_command,char *** files,int * nfiles)1459 Sequence_read_multifile (int *nextchar, FILE **input,
1460 char *read_files_command, char ***files, int *nfiles) {
1461 T queryseq;
1462
1463 while (1) {
1464 if (*input == NULL || *nextchar == EOF) { /* was feof(*input) */
1465 if (*input != NULL) {
1466 if (read_files_command != NULL) {
1467 pclose(*input);
1468 } else {
1469 fclose(*input);
1470 }
1471 *input = NULL;
1472 }
1473
1474 if (*nfiles == 0) {
1475 *nextchar = EOF;
1476 return NULL;
1477 } else {
1478 while (*nfiles > 0 && (*input = Fopen_read_text(read_files_command,(*files)[0])) == NULL) {
1479 (*files)++;
1480 (*nfiles)--;
1481 }
1482 if (*input == NULL) {
1483 *nextchar = EOF;
1484 return NULL;
1485 } else {
1486 (*files)++;
1487 (*nfiles)--;
1488 *nextchar = '\0';
1489 }
1490 }
1491 }
1492 if ((queryseq = Sequence_read(&(*nextchar),*input)) != NULL) {
1493 return queryseq;
1494 }
1495 }
1496 }
1497
1498
1499 #if 0 && defined(HAVE_ZLIB)
1500 T
1501 Sequence_read_multifile_gzip (int *nextchar, gzFile *input, char ***files, int *nfiles) {
1502 T queryseq;
1503
1504 while (1) {
1505 if (*input == NULL || *nextchar == EOF) { /* was gzeof(*input) */
1506 if (*input != NULL) {
1507 gzclose(*input);
1508 *input = NULL;
1509 }
1510
1511 if (*nfiles == 0) {
1512 *nextchar = EOF;
1513 return NULL;
1514 } else {
1515 while (*nfiles > 0 && (*input = gzopen((*files)[0],"rb")) == NULL) {
1516 (*files)++;
1517 (*nfiles)--;
1518 }
1519 if (*input == NULL) {
1520 *nextchar = EOF;
1521 return NULL;
1522 } else {
1523 #ifdef HAVE_ZLIB_GZBUFFER
1524 gzbuffer(*input,GZBUFFER_SIZE);
1525 #endif
1526 (*files)++;
1527 (*nfiles)--;
1528 *nextchar = '\0';
1529 }
1530 }
1531 }
1532 if ((queryseq = Sequence_read_gzip(&(*nextchar),*input)) != NULL) {
1533 return queryseq;
1534 }
1535 }
1536 }
1537 #endif
1538
1539
1540 #if 0 && defined(HAVE_BZLIB)
1541 T
1542 Sequence_read_multifile_bzip2 (int *nextchar, Bzip2_T *input, char ***files, int *nfiles) {
1543 T queryseq;
1544
1545 while (1) {
1546 if (*input == NULL || *nextchar == EOF) { /* was bzeof(*input) */
1547 if (*input != NULL) {
1548 Bzip2_free(&(*input));
1549 *input = NULL;
1550 }
1551
1552 if (*nfiles == 0) {
1553 *nextchar = EOF;
1554 return NULL;
1555 } else {
1556 while (*nfiles > 0 && (*input = Bzip2_new((*files)[0])) == NULL) {
1557 (*files)++;
1558 (*nfiles)--;
1559 }
1560 if (*input == NULL) {
1561 *nextchar = EOF;
1562 return NULL;
1563 } else {
1564 (*files)++;
1565 (*nfiles)--;
1566 *nextchar = '\0';
1567 }
1568 }
1569 }
1570 if ((queryseq = Sequence_read_bzip2(&(*nextchar),*input)) != NULL) {
1571 return queryseq;
1572 }
1573 }
1574 }
1575 #endif
1576
1577
1578 /* This is intended for user genomicseg, which will start with
1579 standard FASTA ">" */
1580 T
Sequence_read_unlimited(int * nextchar,FILE * input)1581 Sequence_read_unlimited (int *nextchar, FILE *input) {
1582 T new;
1583 Intlist_T intlist = NULL;
1584 char *p;
1585 int length, startpos = 1, maxseqlen = MAXSEQLEN;
1586 bool eolnp;
1587
1588 if (feof(input)) {
1589 *nextchar = EOF;
1590 return NULL;
1591 }
1592
1593 if (*nextchar == '\0') {
1594 if ((*nextchar = Sequence_input_init(input)) == EOF) {
1595 *nextchar = EOF;
1596 return NULL;
1597 }
1598 }
1599
1600 new = (T) MALLOC_IN(sizeof(*new));
1601
1602 if (*nextchar != '>' /* && *nextchar != '<' */) {
1603 new->firstp = true; /* by default */
1604 blank_header(new);
1605 Sequence[startpos++] = (char) *nextchar;
1606 maxseqlen--;
1607 } else if (input_header(input,new) == NULL) {
1608 /* File ends after >. Don't process. */
1609 FREE_IN(new);
1610 return NULL;
1611 } else if (*nextchar == '>') {
1612 new->firstp = true;
1613 #if 0
1614 } else if (*nextchar == '<') {
1615 new->firstp = false;
1616 #endif
1617 } else {
1618 abort();
1619 }
1620
1621 /* Don't touch Sequence[0], because subsequent calls to
1622 Sequence_read depend on it being '\0'. */
1623 eolnp = true;
1624 while (fgets(&(Sequence[startpos]),maxseqlen,input) != NULL &&
1625 (eolnp == false || (Sequence[1] != '>' /* && Sequence[1] != '<' */))) {
1626 for (p = &(Sequence[1]); *p != '\r' && *p != '\n' && *p != '\0'; p++) {
1627 if (!iscntrl((int) *p)
1628 #ifdef DASH
1629 && *p != DASH
1630 #endif
1631 ) {
1632 intlist = Intlist_push(intlist,(int) *p);
1633 }
1634 }
1635 if (*p == '\r' || *p == '\n') {
1636 eolnp = true;
1637 } else {
1638 eolnp = false;
1639 }
1640 startpos = 1;
1641 maxseqlen = MAXSEQLEN;
1642 }
1643 intlist = Intlist_reverse(intlist);
1644 new->contents = new->contents_alloc = Intlist_to_char_array_in(&length,intlist);
1645
1646 Intlist_free(&intlist);
1647
1648 if (length == 0) {
1649 return NULL;
1650 } else {
1651 new->fulllength = new->trimend = length;
1652 #ifdef PMAP
1653 new->fulllength_given = length;
1654 #endif
1655 new->trimstart = 0;
1656
1657 #ifndef PMAP
1658 /* user genomic segment should not have quality */
1659 new->quality = new->quality_alloc = (char *) NULL;
1660 #endif
1661
1662 /* new->free_contents_p = true; */
1663 new->subseq_offset = 0;
1664 new->skiplength = 0;
1665
1666 /* Important to initialize for subsequent cDNA reads */
1667 *nextchar = '\0';
1668
1669 return new;
1670 }
1671 }
1672
1673
1674 void
Sequence_print_digest(Filestring_T fp,T this)1675 Sequence_print_digest (Filestring_T fp, T this) {
1676 unsigned char *digest;
1677
1678 digest = MD5_compute((unsigned char *) this->contents,this->fulllength);
1679 MD5_print(fp,digest);
1680 FREE(digest);
1681 return;
1682 }
1683
1684 /* Calling procedure needs to print the initial ">", if desired */
1685 void
Sequence_print_header(Filestring_T fp,T this,bool checksump)1686 Sequence_print_header (Filestring_T fp, T this, bool checksump) {
1687
1688 if (this->acc == NULL) {
1689 FPRINTF(fp,"NO_HEADER");
1690 } else {
1691 if (this->restofheader == NULL || this->restofheader[0] == '\0') {
1692 FPRINTF(fp,"%s",this->acc);
1693 } else {
1694 FPRINTF(fp,"%s %s",this->acc,this->restofheader);
1695 }
1696
1697 if (checksump == true) {
1698 FPRINTF(fp," md5:");
1699 Sequence_print_digest(fp,this);
1700 }
1701 }
1702
1703 FPRINTF(fp,"\n");
1704
1705 return;
1706 }
1707
1708 void
Sequence_stdout_header(T this)1709 Sequence_stdout_header (T this) {
1710 if (this->acc == NULL) {
1711 printf("NO_HEADER");
1712 } else {
1713 if (this->restofheader == NULL || this->restofheader[0] == '\0') {
1714 printf("%s",this->acc);
1715 } else {
1716 printf("%s %s",this->acc,this->restofheader);
1717 }
1718 }
1719
1720 printf("\n");
1721
1722 return;
1723 }
1724
1725
1726 #if 0
1727 /* Used by revcomp.c */
1728 void
1729 Sequence_stdout_header_revcomp (T this) {
1730 if (this->restofheader == NULL || this->restofheader[0] == '\0') {
1731 printf(">%s",this->acc);
1732 } else {
1733 printf(">%s %s",this->acc,this->restofheader);
1734 }
1735 printf(" REVCOMP");
1736 printf("\n");
1737 return;
1738 }
1739 #endif
1740
1741
1742 void
Sequence_print(Filestring_T fp,T this,bool uppercasep,int wraplength,bool trimmedp)1743 Sequence_print (Filestring_T fp, T this, bool uppercasep, int wraplength, bool trimmedp) {
1744 int i = 0, pos, start, end;
1745 char uppercaseCode[128] = UPPERCASE_STD;
1746
1747 if (trimmedp == true) {
1748 start = this->trimstart;
1749 end = this->trimend;
1750 } else {
1751 start = 0;
1752 end = this->fulllength;
1753 }
1754
1755 if (uppercasep == true) {
1756 for (pos = start; pos < end; pos++, i++) {
1757 PUTC(uppercaseCode[(int) this->contents[i]],fp);
1758 if ((i+1) % wraplength == 0) {
1759 PUTC('\n',fp);
1760 }
1761 }
1762 } else {
1763 for (pos = start; pos < end; pos++, i++) {
1764 PUTC(this->contents[i],fp);
1765 if ((i+1) % wraplength == 0) {
1766 PUTC('\n',fp);
1767 }
1768 }
1769 }
1770 if (i % wraplength != 0) {
1771 PUTC('\n',fp);
1772 }
1773
1774 return;
1775 }
1776
1777
1778 void
Sequence_stdout(T this,bool uppercasep,int wraplength,bool trimmedp)1779 Sequence_stdout (T this, bool uppercasep, int wraplength, bool trimmedp) {
1780 int i = 0, pos, start, end;
1781 char uppercaseCode[128] = UPPERCASE_STD;
1782
1783 if (trimmedp == true) {
1784 start = this->trimstart;
1785 end = this->trimend;
1786 } else {
1787 start = 0;
1788 end = this->fulllength;
1789 }
1790
1791 if (uppercasep == true) {
1792 for (pos = start; pos < end; pos++, i++) {
1793 putchar(uppercaseCode[(int) this->contents[i]]);
1794 if ((i+1) % wraplength == 0) {
1795 putchar('\n');
1796 }
1797 }
1798 } else {
1799 for (pos = start; pos < end; pos++, i++) {
1800 putchar(this->contents[i]);
1801 if ((i+1) % wraplength == 0) {
1802 putchar('\n');
1803 }
1804 }
1805 }
1806 if (i % wraplength != 0) {
1807 putchar('\n');
1808 }
1809
1810 return;
1811 }
1812
1813
1814 void
Sequence_stdout_alt(T ref,T alt,T snp,bool uppercasep,int wraplength)1815 Sequence_stdout_alt (T ref, T alt, T snp, bool uppercasep, int wraplength) {
1816 int i = 0, pos, start, end;
1817 char uppercaseCode[128] = UPPERCASE_STD;
1818
1819 start = 0;
1820 end = alt->fulllength;
1821
1822 if (uppercasep == true) {
1823 for (pos = start; pos < end; pos++, i++) {
1824 if (snp->contents[i] == ' ') {
1825 /* Not a SNP, so print reference */
1826 printf("%c",uppercaseCode[(int) ref->contents[i]]);
1827 } else if (uppercaseCode[(int) alt->contents[i]] == uppercaseCode[(int) ref->contents[i]]) {
1828 /* Wildcard SNP */
1829 printf("N");
1830 } else {
1831 printf("%c",uppercaseCode[(int) alt->contents[i]]);
1832 }
1833 if ((i+1) % wraplength == 0) {
1834 printf("\n");
1835 }
1836 }
1837 } else {
1838 for (pos = start; pos < end; pos++, i++) {
1839 if (snp->contents[i] == ' ') {
1840 /* Not a SNP, so print reference */
1841 printf("%c",ref->contents[i]);
1842 } else if (uppercaseCode[(int) alt->contents[i]] == uppercaseCode[(int) ref->contents[i]]) {
1843 /* Wildcard SNP */
1844 printf("N");
1845 } else {
1846 printf("%c",alt->contents[i]);
1847 }
1848 if ((i+1) % wraplength == 0) {
1849 printf("\n");
1850 }
1851 }
1852 }
1853 if (i % wraplength != 0) {
1854 printf("\n");
1855 }
1856 return;
1857 }
1858
1859
1860 void
Sequence_stdout_two(T ref,T alt,bool uppercasep,int wraplength)1861 Sequence_stdout_two (T ref, T alt, bool uppercasep, int wraplength) {
1862 int i = 0, pos, pos2, startpos, end;
1863 char uppercaseCode[128] = UPPERCASE_STD;
1864
1865 end = ref->fulllength;
1866
1867 pos = 0;
1868 i = 0;
1869 if (uppercasep == true) {
1870 printf("ref\t");
1871 startpos = pos;
1872 while (pos < end) {
1873 printf("%c",uppercaseCode[(int) ref->contents[pos]]);
1874 if (++i % wraplength == 0) {
1875 printf("\n");
1876 printf("alt\t");
1877 for (pos2 = startpos, i = 0; i < wraplength; pos2++, i++) {
1878 if (uppercaseCode[(int) alt->contents[pos2]] == uppercaseCode[(int) ref->contents[pos2]]) {
1879 /* Wildcard SNP */
1880 printf("N");
1881 } else {
1882 printf("%c",uppercaseCode[(int) alt->contents[pos2]]);
1883 }
1884 }
1885 printf("\n\n");
1886 if (pos+1 < end) {
1887 printf("ref\t");
1888 }
1889 startpos = pos+1;
1890 }
1891 pos++;
1892 }
1893 if (i % wraplength != 0) {
1894 printf("\n");
1895 printf("alt\t");
1896 for (pos2 = startpos; pos2 < end; pos2++) {
1897 if (uppercaseCode[(int) alt->contents[pos2]] == uppercaseCode[(int) ref->contents[pos2]]) {
1898 /* Wildcard SNP */
1899 printf("N");
1900 } else {
1901 printf("%c",uppercaseCode[(int) alt->contents[pos2]]);
1902 }
1903 }
1904 printf("\n\n");
1905 }
1906
1907 } else {
1908 printf("ref\t");
1909 startpos = pos;
1910 while (pos < end) {
1911 printf("%c",ref->contents[pos]);
1912 if (++i % wraplength == 0) {
1913 printf("\n");
1914 printf("alt\t");
1915 for (pos2 = startpos, i = 0; i < wraplength; pos2++, i++) {
1916 if (uppercaseCode[(int) alt->contents[pos2]] == uppercaseCode[(int) ref->contents[pos2]]) {
1917 /* Wildcard SNP */
1918 printf("N");
1919 } else {
1920 printf("%c",alt->contents[pos2]);
1921 }
1922 }
1923 printf("\n\n");
1924 if (pos+1 < end) {
1925 printf("ref\t");
1926 }
1927 startpos = pos+1;
1928 }
1929 pos++;
1930 }
1931 if (i % wraplength != 0) {
1932 printf("\n");
1933 printf("alt\t");
1934 for (pos2 = startpos; pos2 < end; pos2++) {
1935 if (uppercaseCode[(int) alt->contents[pos2]] == uppercaseCode[(int) ref->contents[pos2]]) {
1936 /* Wildcard SNP */
1937 printf("N");
1938 } else {
1939 printf("%c",alt->contents[pos2]);
1940 }
1941 }
1942 printf("\n\n");
1943 }
1944 }
1945
1946 return;
1947 }
1948
1949
1950 void
Sequence_stdout_raw(T this)1951 Sequence_stdout_raw (T this) {
1952 int i = 0, pos, start, end;
1953
1954 start = 0;
1955 end = this->fulllength;
1956
1957 for (pos = start; pos < end; pos++, i++) {
1958 printf("%d\n",(int) this->contents[i]);
1959 }
1960 return;
1961 }
1962
1963 void
Sequence_stdout_stream_chars(T this)1964 Sequence_stdout_stream_chars (T this) {
1965 int i = 0, pos, start, end;
1966
1967 start = 0;
1968 end = this->fulllength;
1969
1970 for (pos = start; pos < end; pos++, i++) {
1971 switch (toupper(this->contents[i])) {
1972 case 'A': putchar('A'); break;
1973 case 'C': putchar('C'); break;
1974 case 'G': putchar('G'); break;
1975 case 'T': putchar('T'); break;
1976 default: putchar('X');
1977 }
1978 }
1979 return;
1980 }
1981
1982 void
Sequence_stdout_stream_ints(T this)1983 Sequence_stdout_stream_ints (T this) {
1984 int i = 0, pos, start, end;
1985
1986 start = 0;
1987 end = this->fulllength;
1988
1989 for (pos = start; pos < end; pos++, i++) {
1990 switch (toupper(this->contents[i])) {
1991 case 'A': putchar(0); break;
1992 case 'C': putchar(1); break;
1993 case 'G': putchar(2); break;
1994 case 'T': putchar(3); break;
1995 default: putchar(4);
1996 }
1997 }
1998 return;
1999 }
2000
2001
2002 T
Sequence_substring(T usersegment,unsigned int left,unsigned int length,bool revcomp)2003 Sequence_substring (T usersegment, unsigned int left, unsigned int length,
2004 bool revcomp) {
2005 char *gbuffer;
2006
2007 gbuffer = (char *) CALLOC(length+1,sizeof(char));
2008
2009 memcpy(gbuffer,&(usersegment->contents[left]),length*sizeof(char));
2010 gbuffer[length] = '\0';
2011
2012 if (revcomp == true) {
2013 /* make_complement_buffered(gbuffer2,gbuffer1,length); */
2014 make_complement_inplace(gbuffer,length);
2015 debug(fprintf(stderr,"Got sequence at %u with length %u, revcomp\n",left,length));
2016 return Sequence_genomic_new(gbuffer,length,/*copyp*/false);
2017 } else {
2018 debug(fprintf(stderr,"Got sequence at %u with length %u, forward\n",left,length));
2019 return Sequence_genomic_new(gbuffer,length,/*copyp*/false);
2020 }
2021 }
2022
2023
2024