1 /*
2 * $Id: aceread.c,v 1.25 2012/04/02 22:11:35 kans Exp $
3 *
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * Authors: Colleen Bollin
29 *
30 */
31
32
33 #include <stdlib.h>
34 #include <stdio.h>
35 #include <string.h>
36 #include <ctype.h>
37 #include <util/creaders/alnread.h>
38 #include <aceread.h>
39
40
41 typedef enum {
42 eJustRight = 0,
43 eNone,
44 eTooMany,
45 eTooFew,
46 eUnexpected
47 } EFound;
48
49
PrintACEFormatErrorXMLStart(char * id,char * has_errors)50 extern void PrintACEFormatErrorXMLStart (char *id, char *has_errors)
51 {
52 if (has_errors != NULL) {
53 if (*has_errors == 0) {
54 printf ("<aceread>\n");
55 *has_errors = 1;
56 }
57 }
58 printf ("<message severity=\"ERROR\" seq-id=\"%s\" code=\"bad_format\">", id == NULL ? "No ID" : id);
59 }
60
61
PrintACEFormatErrorXMLEnd(void)62 extern void PrintACEFormatErrorXMLEnd (void)
63 {
64 printf ("</message>\n");
65 }
66
67
PrintACEFormatErrorXML(char * msg,char * id,char * has_errors)68 extern void PrintACEFormatErrorXML (char *msg, char *id, char *has_errors)
69 {
70 if (has_errors != NULL) {
71 if (*has_errors == 0) {
72 printf ("<aceread>\n");
73 *has_errors = 1;
74 }
75 }
76 printf ("<message severity=\"ERROR\" seq-id=\"%s\" code=\"bad_format\">%s</message>\n", id == NULL ? "No ID" : id, msg);
77 }
78
79
s_ReportFound(EFound val,char * label,char * id,char * has_errors)80 static void s_ReportFound (EFound val, char *label, char *id, char *has_errors)
81 {
82 switch (val) {
83 case eNone:
84 PrintACEFormatErrorXMLStart (id, has_errors);
85 printf ("Found no %s", label);
86 PrintACEFormatErrorXMLEnd ();
87 break;
88 case eTooMany:
89 PrintACEFormatErrorXMLStart (id, has_errors);
90 printf ("Too many %s", label);
91 PrintACEFormatErrorXMLEnd ();
92 break;
93 case eTooFew:
94 PrintACEFormatErrorXMLStart (id, has_errors);
95 printf ("Too few %s", label);
96 PrintACEFormatErrorXMLEnd ();
97 break;
98 case eUnexpected:
99 PrintACEFormatErrorXMLStart (id, has_errors);
100 printf ("Unexpected character while reading %s", label);
101 PrintACEFormatErrorXMLEnd ();
102 break;
103 case eJustRight:
104 break;
105 default:
106 PrintACEFormatErrorXML ("Unknown error", id, has_errors);
107 break;
108 }
109 }
110
111
GapInfoNew(void)112 extern TGapInfoPtr GapInfoNew (void)
113 {
114 TGapInfoPtr g;
115
116 g = (TGapInfoPtr) malloc (sizeof (SGapInfo));
117 if (g != NULL) {
118 g->num_gaps = 0;
119 g->gap_offsets = NULL;
120 }
121 return g;
122 }
123
124
GapInfoFree(TGapInfoPtr g)125 extern void GapInfoFree (TGapInfoPtr g)
126 {
127 if (g != NULL) {
128 free (g->gap_offsets);
129 free (g);
130 }
131 }
132
133
s_IsGapChar(char ch,char * gap_chars)134 static int s_IsGapChar (char ch, char *gap_chars)
135 {
136 if (ch == 0 || gap_chars == NULL) {
137 return 0;
138 }
139 while (*gap_chars != 0 && *gap_chars != ch) {
140 gap_chars ++;
141 }
142 if (*gap_chars == ch) {
143 return 1;
144 } else {
145 return 0;
146 }
147 }
148
149
150 /* The Trace Archive Gap String is a list of the number of nucleotides to skip before adding the next gap */
GapInfoFromSequenceString(char * seq_str,char gap_char)151 extern TGapInfoPtr GapInfoFromSequenceString (char *seq_str, char gap_char)
152 {
153 char * cp;
154 int num_gaps = 0, pos, gap_num = 0;
155 TGapInfoPtr g = NULL;
156
157 if (seq_str == NULL) return NULL;
158
159 /* first determine number of gaps */
160 cp = seq_str;
161 while (*cp != 0) {
162 if (*cp == gap_char) {
163 num_gaps++;
164 }
165 cp++;
166 }
167
168 g = GapInfoNew ();
169 if (num_gaps > 0) {
170 g->num_gaps = num_gaps;
171 g->gap_offsets = malloc (g->num_gaps * sizeof (int));
172 cp = seq_str;
173 pos = 0;
174 while (*cp != 0) {
175 if (*cp == gap_char) {
176 g->gap_offsets[gap_num] = pos;
177 gap_num++;
178 pos = 0;
179 } else {
180 pos++;
181 }
182 cp++;
183 }
184 }
185 return g;
186 }
187
RemoveGapCharsFromSequenceString(char * seq_str,char gap_char)188 extern void RemoveGapCharsFromSequenceString (char *seq_str, char gap_char)
189 {
190 char *cp_src, *cp_dst;
191
192 if (seq_str == NULL) {
193 return;
194 }
195
196 cp_src = seq_str;
197 cp_dst = seq_str;
198 while (*cp_src != 0) {
199 if (*cp_src != gap_char) {
200 *cp_dst = *cp_src;
201 cp_dst++;
202 }
203 cp_src++;
204 }
205 }
206
207
208 /* calculate sequence position from tiling position (both values zero-based) given gap_info */
SeqPosFromTilingPos(int tiling_pos,TGapInfoPtr gap_info)209 extern int SeqPosFromTilingPos (int tiling_pos, TGapInfoPtr gap_info)
210 {
211 int pos = 0, seq_pos = 0, gap_num = 0;
212
213 if (tiling_pos < 0 || gap_info == NULL || gap_info->num_gaps == 0) {
214 return tiling_pos;
215 }
216
217 while (gap_num < gap_info->num_gaps && pos + gap_info->gap_offsets[gap_num] <= tiling_pos) {
218 seq_pos += gap_info->gap_offsets[gap_num];
219 pos += gap_info->gap_offsets[gap_num] + 1;
220 gap_num++;
221 }
222 seq_pos += tiling_pos - pos;
223 return seq_pos;
224 }
225
226
227 /* calculate sequence position from tiling position (both values zero-based) given gap_info */
TilingPosFromSeqPos(int seq_pos,TGapInfoPtr gap_info)228 extern int TilingPosFromSeqPos (int seq_pos, TGapInfoPtr gap_info)
229 {
230 int pos = 0, tiling_pos = 0, gap_num = 0;
231
232 if (seq_pos < 0 || gap_info == NULL || gap_info->num_gaps == 0) {
233 return seq_pos;
234 }
235
236 while (gap_num < gap_info->num_gaps && pos + gap_info->gap_offsets[gap_num] <= seq_pos) {
237 pos += gap_info->gap_offsets[gap_num];
238 tiling_pos += gap_info->gap_offsets[gap_num] + 1;
239 gap_num++;
240 }
241 tiling_pos += seq_pos - pos;
242 return tiling_pos;
243 }
244
245
246 /* adjust gap info when sequence is trimmed */
AdjustGapInfoFor5Trim(TGapInfoPtr gap_info,int trim)247 static void AdjustGapInfoFor5Trim (TGapInfoPtr gap_info, int trim)
248 {
249 int pos = 0;
250 int num_gaps = 0;
251 int i;
252
253 if (gap_info == NULL || gap_info->num_gaps < 1 || trim < 1) {
254 return;
255 }
256
257 while (num_gaps < gap_info->num_gaps && pos + gap_info->gap_offsets[num_gaps] < trim) {
258 pos += gap_info->gap_offsets[num_gaps];
259 num_gaps++;
260 }
261 if (num_gaps < gap_info->num_gaps) {
262 gap_info->gap_offsets[num_gaps] -= trim - pos;
263 for (i = num_gaps; i < gap_info->num_gaps; i++) {
264 gap_info->gap_offsets[i - num_gaps] = gap_info->gap_offsets[i];
265 }
266 gap_info->num_gaps -= num_gaps;
267 } else {
268 free (gap_info->gap_offsets);
269 gap_info->gap_offsets = NULL;
270 gap_info->num_gaps = 0;
271 }
272
273 }
274
275
AdjustGapInfoFor3Trim(TGapInfoPtr gap_info,int new_len)276 static void AdjustGapInfoFor3Trim (TGapInfoPtr gap_info, int new_len)
277 {
278 int pos = 0;
279 int num_gaps = 0;
280
281 if (gap_info == NULL || gap_info->num_gaps < 1) {
282 return;
283 }
284
285 while (num_gaps < gap_info->num_gaps && pos + gap_info->gap_offsets[num_gaps] < new_len) {
286 pos += gap_info->gap_offsets[num_gaps];
287 num_gaps++;
288 }
289 if (num_gaps < gap_info->num_gaps) {
290 gap_info->num_gaps = num_gaps;
291 }
292 }
293
294 /* TODO: NEED TO write function for truncating on right, test function for truncating on left */
295
ContigReadNew(void)296 extern TContigReadPtr ContigReadNew (void)
297 {
298 TContigReadPtr r;
299
300 r = (TContigReadPtr) malloc (sizeof (SContigRead));
301 if (r == NULL) {
302 return NULL;
303 }
304 r->read_id = NULL;
305 r->ti = 0;
306 r->srr = NULL;
307 r->read_seq = NULL;
308 r->is_complement = 0;
309 r->cons_start = 0;
310 r->cons_stop = 0;
311 r->gaps = NULL;
312 r->local = 1;
313 r->valid = 0;
314 r->qual_scores = NULL;
315 r->num_qual_scores = 0;
316 r->tag = NULL;
317 return r;
318 }
319
320
ContigReadFree(TContigReadPtr r)321 extern void ContigReadFree (TContigReadPtr r)
322 {
323 if (r != NULL) {
324 if (r->read_id != NULL) {
325 free (r->read_id);
326 }
327 if (r->srr != NULL) {
328 free (r->srr);
329 }
330 if (r->read_seq != NULL) {
331 free (r->read_seq);
332 }
333 if (r->gaps != NULL) {
334 GapInfoFree (r->gaps);
335 }
336 if (r->qual_scores != NULL) {
337 free (r->qual_scores);
338 }
339 if (r->tag != NULL) {
340 free (r->tag);
341 }
342 free (r);
343 }
344 }
345
346
BaseSegNew(void)347 extern TBaseSegPtr BaseSegNew (void)
348 {
349 TBaseSegPtr b;
350
351 b = (TBaseSegPtr) malloc (sizeof (SBaseSeg));
352 if (b == NULL) {
353 return NULL;
354 }
355 b->read_id = NULL;
356 b->cons_start = 0;
357 b->cons_stop = 0;
358 return b;
359 }
360
361
BaseSegFree(TBaseSegPtr b)362 extern void BaseSegFree (TBaseSegPtr b)
363 {
364 if (b != NULL) {
365 if (b->read_id != NULL) {
366 free (b->read_id);
367 }
368 free (b);
369 }
370 }
371
372
373 /* reads a correctly formatted line and creates a base seg.
374 */
s_ReadBaseSeg(char * line)375 static TBaseSegPtr s_ReadBaseSeg (char *line)
376 {
377 TBaseSegPtr base_seg = NULL;
378 char *cp;
379 int start, stop, len;
380
381 if (line == NULL || *line != 'B' || *(line + 1) != 'S') {
382 return NULL;
383 }
384
385
386 cp = line + 2;
387 while (isspace (*cp)) {
388 cp++;
389 }
390 if (!isdigit (*cp)) {
391 return NULL;
392 }
393 start = atoi (cp);
394 while (isdigit (*cp)) {
395 cp++;
396 }
397 while (isspace (*cp)) {
398 cp++;
399 }
400 if (!isdigit (*cp)) {
401 return NULL;
402 }
403 stop = atoi (cp);
404 while (isdigit (*cp)) {
405 cp++;
406 }
407 while (isspace (*cp)) {
408 cp++;
409 }
410 if (*cp == 0) {
411 return NULL;
412 }
413
414 len = strlen (cp);
415
416 base_seg = BaseSegNew ();
417 base_seg->cons_start = start;
418 base_seg->cons_stop = stop;
419 base_seg->read_id = malloc (sizeof (char) * len + 1);
420 strcpy (base_seg->read_id, cp);
421
422 return base_seg;
423 }
424
425
ConsensusReadAlnNew(int numseg)426 extern TConsensusReadAlnPtr ConsensusReadAlnNew (int numseg)
427 {
428 TConsensusReadAlnPtr a;
429 int i;
430
431 a = (TConsensusReadAlnPtr) malloc (sizeof (SConsensusReadAln));
432 a->is_complement = 0;
433 if (numseg < 1) {
434 a->lens = NULL;
435 a->cons_starts = NULL;
436 a->read_starts = NULL;
437 a->numseg = 0;
438 } else {
439 a->lens = (int *) malloc (sizeof (int) * numseg);
440 a->cons_starts = (int *) malloc (sizeof (int) * numseg);
441 a->read_starts = (int *) malloc (sizeof (int) * numseg);
442 for (i = 0; i < numseg; i++) {
443 a->lens[i] = 0;
444 a->cons_starts[i] = 0;
445 a->read_starts[0] = 0;
446 }
447 a->numseg = numseg;
448 }
449 return a;
450 }
451
452
ConsensusReadAlnFree(TConsensusReadAlnPtr a)453 extern TConsensusReadAlnPtr ConsensusReadAlnFree (TConsensusReadAlnPtr a)
454 {
455 if (a != NULL) {
456 if (a->lens != NULL) {
457 free (a->lens);
458 a->lens = NULL;
459 }
460 if (a->cons_starts != NULL) {
461 free (a->cons_starts);
462 a->cons_starts = NULL;
463 }
464 if (a->read_starts != NULL) {
465 free (a->read_starts);
466 a->read_starts = NULL;
467 }
468 free (a);
469 a = NULL;
470 }
471 return a;
472 }
473
474
GetConsensusReadAln(char * consensus_seq,TContigReadPtr read)475 extern TConsensusReadAlnPtr GetConsensusReadAln (char *consensus_seq, TContigReadPtr read)
476 {
477 TConsensusReadAlnPtr aln = NULL;
478 char *c;
479 char *c_start;
480 char *r;
481 char *r_start;
482 int numseg = 0, aln_len, pos, seg, con_offset = 0, read_offset = 0;
483 char con_gap_open = 0, read_gap_open = 0, gap_change;
484
485 if (consensus_seq == NULL || read == NULL) {
486 return NULL;
487 }
488
489 if (read->cons_start > 0) {
490 c_start = consensus_seq + read->cons_start;
491 r_start = read->read_seq + read->read_assem_start - 1;
492 } else {
493 c_start = consensus_seq;
494 r_start = read->read_seq + read->read_assem_start - 1;
495 }
496
497 aln_len = read->cons_stop - read->cons_start + 1;
498 while (*c_start == '*' && *r_start == '*') {
499 c_start++;
500 r_start++;
501 aln_len--;
502 }
503
504 /* first, count number of segments needed */
505 c = c_start;
506 r = r_start;
507 if (*c != '*' && *r != '*') {
508 numseg++;
509 }
510 pos = 0;
511 while (*c != 0 && *r != 0 && pos < aln_len) {
512 if (*c == '*' && *r == '*') {
513 /* both in gap - ignore */
514 } else {
515 gap_change = 0;
516 if (*c == '*') {
517 if (!con_gap_open) {
518 gap_change = 1;
519 con_gap_open = 1;
520 }
521 } else {
522 if (con_gap_open) {
523 gap_change = 1;
524 con_gap_open = 0;
525 }
526 }
527 if (*r == '*') {
528 if (!read_gap_open) {
529 gap_change = 1;
530 read_gap_open = 1;
531 }
532 } else {
533 if (read_gap_open) {
534 gap_change = 1;
535 read_gap_open = 0;
536 }
537 }
538 if (gap_change) {
539 numseg++;
540 }
541 }
542 c++;
543 r++;
544 pos++;
545 }
546
547 /* create alignment */
548 aln = ConsensusReadAlnNew (numseg);
549 pos = 0;
550 seg = 0;
551
552
553 c = consensus_seq;
554 while (c < c_start) {
555 if (*c != '*') {
556 con_offset ++;
557 }
558 c++;
559 }
560
561 r = read->read_seq;
562 while (r < r_start) {
563 if (*r != '*') {
564 read_offset ++;
565 }
566 r++;
567 }
568
569
570 if (*c_start == '*') {
571 aln->cons_starts[0] = -1;
572 con_gap_open = 1;
573 } else {
574 aln->cons_starts[0] = con_offset;
575 con_gap_open = 0;
576 }
577
578 if (*r_start == '*') {
579 aln->read_starts[0] = -1;
580 read_gap_open = 1;
581 } else {
582 aln->read_starts[0] = read_offset;
583 read_gap_open = 0;
584 }
585
586 c = c_start + 1;
587 r = r_start + 1;
588 aln->lens[0] = 1;
589 pos = 1;
590
591 while (*c != 0 && *r != 0 && pos < aln_len) {
592 if (*c == '*' && *r == '*') {
593 /* both in gap - ignore */
594 } else {
595 gap_change = 0;
596 if (*c == '*') {
597 if (!con_gap_open) {
598 gap_change = 1;
599 con_gap_open = 1;
600 }
601 } else {
602 if (con_gap_open) {
603 gap_change = 1;
604 con_gap_open = 0;
605 }
606 }
607 if (*r == '*') {
608 if (!read_gap_open) {
609 gap_change = 1;
610 read_gap_open = 1;
611 }
612 } else {
613 if (read_gap_open) {
614 gap_change = 1;
615 read_gap_open = 0;
616 }
617 }
618 if (gap_change) {
619 seg++;
620 if (con_gap_open) {
621 aln->cons_starts[seg] = -1;
622 } else if (aln->cons_starts[seg - 1] > -1) {
623 aln->cons_starts[seg] = aln->cons_starts[seg - 1] + aln->lens[seg - 1];
624 } else if (seg > 1 && aln->cons_starts[seg - 2] > -1) {
625 aln->cons_starts[seg] = aln->cons_starts[seg - 2] + aln->lens[seg - 2];
626 } else {
627 aln->cons_starts[seg] = con_offset;
628 }
629 if (read_gap_open) {
630 aln->read_starts[seg] = -1;
631 } else if (aln->read_starts[seg - 1] > -1) {
632 aln->read_starts[seg] = aln->read_starts[seg - 1] + aln->lens[seg - 1];
633 } else if (seg > 1 && aln->read_starts[seg - 2] > -1) {
634 aln->read_starts[seg] = aln->read_starts[seg - 2] + aln->lens[seg - 2];
635 } else {
636 aln->read_starts[seg] = read_offset;
637 }
638 }
639 aln->lens[seg]++;
640 }
641 c++;
642 r++;
643 pos++;
644 }
645
646 /* todo - adjust starts for complement */
647 if (read->is_complement) {
648 for (seg = 0; seg < aln->numseg; seg++) {
649 if (aln->read_starts[seg] > -1) {
650 aln->read_starts[seg] = read->read_len - aln->read_starts[seg] - aln->lens[seg];
651 }
652 }
653 aln->is_complement = 1;
654 }
655
656 return aln;
657 }
658
659
ContigNew(void)660 extern TContigPtr ContigNew (void)
661 {
662 TContigPtr c;
663
664 c = (TContigPtr) malloc (sizeof (SContig));
665 if (c == NULL) {
666 return NULL;
667 }
668 c->consensus_id = NULL;
669 c->consensus_seq = NULL;
670 c->consensus_assem_len = 0;
671 c->consensus_seq_len = 0;
672 c->is_complement = 0;
673 c->num_qual_scores = 0;
674 c->qual_scores = NULL;
675 c->num_reads = 0;
676 c->reads = NULL;
677 c->gaps = NULL;
678 c->num_reads = 0;
679 c->reads = NULL;
680 c->num_base_segs = 0;
681 c->base_segs = NULL;
682 c->tag = NULL;
683
684 return c;
685 }
686
687
ContigFree(TContigPtr c)688 extern void ContigFree (TContigPtr c)
689 {
690 int i;
691
692 if (c != NULL) {
693 if (c->consensus_id != NULL) free (c->consensus_id);
694 if (c->consensus_seq != NULL) free (c->consensus_seq);
695 if (c->qual_scores != NULL) free (c->qual_scores);
696
697 if (c->reads != NULL) {
698 for (i = 0; i < c->num_reads; i++) {
699 if (c->reads[i] != NULL) {
700 ContigReadFree (c->reads[i]);
701 }
702 }
703 free (c->reads);
704 }
705 if (c->base_segs != NULL) {
706 for (i = 0; i < c->num_base_segs; i++) {
707 if (c->base_segs[i] != NULL) {
708 BaseSegFree (c->base_segs[i]);
709 }
710 }
711 free (c->base_segs);
712 }
713 if (c->tag != NULL) {
714 free (c->tag);
715 }
716 free (c);
717 }
718 }
719
720
ACEFileNew()721 extern TACEFilePtr ACEFileNew ()
722 {
723 TACEFilePtr afp;
724
725 afp = (TACEFilePtr) malloc (sizeof (SACEFile));
726 if (afp == NULL) {
727 return NULL;
728 }
729 afp->num_contigs = 0;
730 afp->contigs = NULL;
731
732 return afp;
733 }
734
735
ACEFileFree(TACEFilePtr afp)736 extern void ACEFileFree (TACEFilePtr afp)
737 {
738 int i;
739
740 if (afp != NULL) {
741 for (i = 0; i < afp->num_contigs; i++) {
742 ContigFree (afp->contigs[i]);
743 }
744 free (afp->contigs);
745 free (afp);
746 }
747 }
748
749
s_IsSeqChar(char ch)750 static char s_IsSeqChar (char ch)
751 {
752 switch (ch) {
753 case '*':
754 case '-':
755 case 'A':
756 case 'B':
757 case 'C':
758 case 'D':
759 case 'E':
760 case 'F':
761 case 'G':
762 case 'H':
763 case 'I':
764 case 'J':
765 case 'K':
766 case 'L':
767 case 'M':
768 case 'N':
769 case 'O':
770 case 'P':
771 case 'Q':
772 case 'R':
773 case 'S':
774 case 'T':
775 case 'U':
776 case 'V':
777 case 'W':
778 case 'X':
779 case 'Y':
780 case 'Z':
781 case 'a':
782 case 'b':
783 case 'c':
784 case 'd':
785 case 'e':
786 case 'f':
787 case 'g':
788 case 'h':
789 case 'i':
790 case 'j':
791 case 'k':
792 case 'l':
793 case 'm':
794 case 'n':
795 case 'o':
796 case 'p':
797 case 'q':
798 case 'r':
799 case 's':
800 case 't':
801 case 'u':
802 case 'v':
803 case 'w':
804 case 'x':
805 case 'y':
806 case 'z':
807 return 1;
808 break;
809 default:
810 return 0;
811 break;
812 }
813 }
814
815
s_IsEOF(char * linestring)816 static char s_IsEOF (char *linestring)
817 {
818 if (linestring == NULL || linestring [0] == EOF) {
819 return 1;
820 } else {
821 return 0;
822 }
823 }
824
825
826 static char *
s_ReadSequenceFromFile(int len,FReadLineFunction readfunc,void * userdata,char * id,char * has_errors)827 s_ReadSequenceFromFile
828 (int len,
829 FReadLineFunction readfunc,
830 void * userdata,
831 char * id,
832 char * has_errors)
833 {
834 char *seq;
835 char *linestring;
836 char *cp;
837 int pos = 0;
838
839 /* copy in sequence data */
840 seq = malloc (len + 1);
841 linestring = readfunc (userdata);
842 while (!s_IsEOF (linestring) && s_IsSeqChar (linestring [0])) {
843 /* append to consensus */
844 cp = linestring;
845 while (s_IsSeqChar (*cp) && pos < len) {
846 if (isalpha (*cp)) {
847 seq [pos] = toupper (*cp);
848 } else if (*cp == '-') {
849 seq [pos] = '*';
850 } else {
851 seq [pos] = *cp;
852 }
853 pos++;
854 cp++;
855 }
856 if (s_IsSeqChar (*cp)) {
857 PrintACEFormatErrorXML ("Too many sequence characters!", id, has_errors);
858 free (seq);
859 return NULL;
860 }
861 free (linestring);
862 linestring = readfunc (userdata);
863 }
864 free (linestring);
865 if (pos < len) {
866 PrintACEFormatErrorXML ("Too few sequence characters!", id, has_errors);
867 free (seq);
868 seq = NULL;
869 } else {
870 seq[pos] = 0;
871 }
872 return seq;
873 }
874
875
s_LineIsEmptyButNotEof(char * linestring)876 static char s_LineIsEmptyButNotEof (char *linestring)
877 {
878 char *cp;
879 if (s_IsEOF (linestring)) {
880 return 0;
881 }
882
883 cp = linestring;
884 while (*cp != 0 && isspace (*cp)) {
885 cp++;
886 }
887 if (*cp == 0) {
888 return 1;
889 } else {
890 return 0;
891 }
892 }
893
894
s_SkipQualScores(FReadLineFunction readfunc,void * userdata)895 static void s_SkipQualScores
896 (FReadLineFunction readfunc,
897 void * userdata)
898 {
899 char * linestring;
900 char * cp;
901 if (readfunc == NULL) return;
902
903 linestring = readfunc (userdata);
904 while (s_LineIsEmptyButNotEof (linestring)) {
905 free (linestring);
906 linestring = readfunc (userdata);
907 }
908 if (linestring == NULL || linestring [0] == EOF || strcmp (linestring, "BQ") != 0) {
909 return;
910 }
911 linestring = readfunc (userdata);
912 while (!s_IsEOF (linestring)
913 && isdigit (*(cp = linestring + strspn (linestring, " \t")))) {
914 free (linestring);
915 linestring = readfunc (userdata);
916 }
917 free (linestring);
918 }
919
920
s_ReadQualScores(TContigPtr contig,FReadLineFunction readfunc,void * userdata)921 static EFound s_ReadQualScores
922 (TContigPtr contig,
923 FReadLineFunction readfunc,
924 void * userdata)
925 {
926 char * linestring;
927 char * cp;
928 int pos;
929
930 if (contig == NULL || readfunc == NULL || contig->consensus_assem_len == 0) {
931 return eNone;
932 }
933
934 linestring = readfunc (userdata);
935 while (s_LineIsEmptyButNotEof (linestring)) {
936 free (linestring);
937 linestring = readfunc (userdata);
938 }
939 if (linestring == NULL || linestring [0] == EOF || strcmp (linestring, "BQ") != 0) {
940 return eNone;
941 }
942
943 /* read quality scores */
944 contig->num_qual_scores = contig->consensus_assem_len;
945 /* no score for * in consensus seq */
946 for (pos = 0; pos < contig->consensus_assem_len; pos++) {
947 if (contig->consensus_seq[pos] == '*') {
948 contig->num_qual_scores --;
949 }
950 }
951 contig->qual_scores = malloc (sizeof (int) * contig->num_qual_scores);
952 pos = 0;
953 linestring = readfunc (userdata);
954 while (!s_IsEOF (linestring)
955 && isdigit (*(cp = linestring + strspn (linestring, " \t")))) {
956 while (isdigit (*cp) && pos < contig->num_qual_scores) {
957 contig->qual_scores [pos] = atoi (cp);
958 pos++;
959 while (isdigit (*cp)) {
960 cp++;
961 }
962 while (isspace (*cp)) {
963 cp++;
964 }
965 }
966 if (isdigit (*cp)) {
967 return eTooMany;
968 }
969 free (linestring);
970 linestring = readfunc (userdata);
971 }
972 if (pos < contig->num_qual_scores) {
973 return eTooFew;
974 } else {
975 return eJustRight;
976 }
977 }
978
979
s_ReadAFLines(TContigPtr contig,FReadLineFunction readfunc,void * userdata,char ** next_line)980 static EFound s_ReadAFLines
981 (TContigPtr contig,
982 FReadLineFunction readfunc,
983 void * userdata,
984 char ** next_line)
985 {
986 char * linestring;
987 char * cp;
988 int read_num, len;
989 EFound rval = eJustRight;
990
991 if (contig == NULL || readfunc == NULL || contig->num_reads == 0) return eNone;
992
993 /* get AF lines */
994 contig->reads = malloc (contig->num_reads * sizeof (TContigReadPtr));
995 linestring = readfunc (userdata);
996 while (s_LineIsEmptyButNotEof (linestring)) {
997 free (linestring);
998 linestring = readfunc (userdata);
999 }
1000 if (linestring == NULL || linestring [0] == EOF || strncmp (linestring, "AF", 2) != 0) {
1001 *next_line = linestring;
1002 return eNone;
1003 }
1004
1005 read_num = 0;
1006 while (!s_IsEOF(linestring) && read_num < contig->num_reads
1007 && linestring [0] == 'A' && linestring [1] == 'F' && isspace (linestring [2])) {
1008 contig->reads[read_num] = ContigReadNew ();
1009 len = strlen (linestring + 3);
1010 contig->reads[read_num]->read_id = malloc (len + 1);
1011 strcpy (contig->reads[read_num]->read_id, linestring + 3);
1012 cp = contig->reads[read_num]->read_id;
1013 while (*cp != 0 && !isspace (*cp)) {
1014 cp++;
1015 }
1016 if (isspace (*cp)) {
1017 *cp = 0;
1018 cp++;
1019 }
1020 if (*cp == 'C') {
1021 contig->reads[read_num]->is_complement = 1;
1022 } else if (*cp != 'U') {
1023 *next_line = linestring;
1024 return eUnexpected;
1025 }
1026 cp++;
1027 if (isspace (*cp)) {
1028 cp++;
1029 }
1030 contig->reads[read_num]->cons_start = atoi (cp) - 1;
1031 read_num++;
1032 free (linestring);
1033 linestring = readfunc (userdata);
1034 }
1035 if (read_num < contig->num_reads) {
1036 rval = eTooFew;
1037 } else if (!s_IsEOF(linestring) && strncmp (linestring, "AF ", 3) == 0) {
1038 rval = eTooMany;
1039 } else {
1040 rval = eJustRight;
1041 }
1042 *next_line = linestring;
1043 return rval;
1044 }
1045
1046
s_ReadBaseSegs(TContigPtr contig,int num_base_segs,char ** firstline,FReadLineFunction readfunc,void * userdata)1047 static EFound s_ReadBaseSegs
1048 (TContigPtr contig,
1049 int num_base_segs,
1050 char ** firstline,
1051 FReadLineFunction readfunc,
1052 void * userdata)
1053 {
1054 char * linestring;
1055
1056 if (contig == NULL || readfunc == NULL || num_base_segs == 0 || firstline == NULL) return eNone;
1057
1058 contig->base_segs = malloc (sizeof (TBaseSegPtr) * num_base_segs);
1059 contig->num_base_segs = 0;
1060
1061 /* get BS lines */
1062 linestring = *firstline;
1063 while (s_LineIsEmptyButNotEof (linestring)) {
1064 free (linestring);
1065 linestring = readfunc (userdata);
1066 }
1067 if (linestring == NULL || linestring [0] == EOF || strncmp (linestring, "BS", 2) != 0) {
1068 return eNone;
1069 }
1070
1071 while (linestring != NULL && linestring [0] != EOF && contig->num_base_segs < num_base_segs
1072 && linestring [0] == 'B' && linestring [1] == 'S' && isspace (linestring [2])) {
1073 contig->base_segs[contig->num_base_segs++] = s_ReadBaseSeg (linestring);
1074 free (linestring);
1075 linestring = readfunc (userdata);
1076 }
1077 *firstline = linestring;
1078 if (contig->num_base_segs < num_base_segs) {
1079 return eTooFew;
1080 } else if (linestring != NULL && linestring [0] == 'B' && linestring[1] == 'S') {
1081 return eTooMany;
1082 } else {
1083 return eJustRight;
1084 }
1085 }
1086
1087
s_IsEquivN(char ch)1088 static char s_IsEquivN (char ch)
1089 {
1090 if (ch == 'N' || ch == 'X') {
1091 return 1;
1092 } else {
1093 return 0;
1094 }
1095 }
1096
1097
1098 /* Terminal Ns will always be trimmed in the GenBank records */
s_AdjustContigReadForTerminalNs(TContigReadPtr read)1099 static void s_AdjustContigReadForTerminalNs (TContigReadPtr read)
1100 {
1101 char * cp_src;
1102 char * cp_dst;
1103 int len = 0;
1104
1105 if (read == NULL || read->read_seq == NULL) return;
1106 cp_src = read->read_seq;
1107 while (s_IsEquivN(*cp_src)) {
1108 len++;
1109 cp_src++;
1110 }
1111 if (len > 0) {
1112 read->cons_start += len;
1113 cp_dst = read->read_seq;
1114 while (*cp_src != 0) {
1115 *cp_dst = *cp_src;
1116 cp_dst++;
1117 cp_src++;
1118 }
1119 *cp_dst = 0;
1120 }
1121 len = strlen (read->read_seq);
1122 cp_src = read->read_seq + len - 1;
1123 while (cp_src >= read->read_seq && s_IsEquivN(*cp_src)) {
1124 *cp_src = 0;
1125 cp_src--;
1126 }
1127 }
1128
1129
1130 /* Terminal Ns will always be trimmed by the GenBank record */
s_AdjustContigForTerminalNs(TContigPtr contig)1131 static void s_AdjustContigForTerminalNs (TContigPtr contig)
1132 {
1133 char * cp_src;
1134 char * cp_dst;
1135 int len = 0, i;
1136
1137 if (contig == NULL || contig->consensus_seq == NULL) return;
1138 cp_src = contig->consensus_seq;
1139 while (s_IsEquivN(*cp_src)) {
1140 len++;
1141 cp_src++;
1142 }
1143 if (len > 0) {
1144 /* adjust quality scores */
1145 if (contig->qual_scores != NULL) {
1146 contig->num_qual_scores -= len;
1147 for (i = 0; i < contig->num_qual_scores; i++) {
1148 contig->qual_scores[i] = contig->qual_scores [i + len];
1149 }
1150 }
1151 /* adjust reads */
1152 if (contig->reads != NULL) {
1153 for (i = 0; i < contig->num_reads; i++) {
1154 if (contig->reads[i] != NULL) {
1155 contig->reads[i]->cons_start -= len;
1156 }
1157 }
1158 }
1159 /* adjust consensus sequence */
1160 cp_dst = contig->consensus_seq;
1161 while (*cp_src != 0) {
1162 *cp_dst = *cp_src;
1163 cp_dst++;
1164 cp_src++;
1165 }
1166 *cp_dst = 0;
1167 contig->consensus_assem_len -= len;
1168 }
1169 /* trim 3' Ns */
1170 len = 0;
1171 cp_src = contig->consensus_seq + contig->consensus_assem_len - 1;
1172 while (cp_src >= contig->consensus_seq && s_IsEquivN(*cp_src)) {
1173 *cp_src = 0;
1174 cp_src--;
1175 contig->consensus_assem_len--;
1176 len++;
1177 }
1178 /* truncate quality scores if 3' Ns trimmed */
1179 if (len > 0 && contig->qual_scores != NULL) {
1180 contig->num_qual_scores -= len;
1181 }
1182 }
1183
1184
1185 /* Clips the sequence read in according to the QA clipping.
1186 * The real coordinates will be recovered when an alignment is generated between
1187 * the sequence in the structure and the sequence downloaded from the Trace Archive.
1188 */
ApplyQALineToRead(TContigReadPtr read,char * linestring,char * id,char * has_errors)1189 static char ApplyQALineToRead (TContigReadPtr read, char *linestring, char *id, char *has_errors)
1190 {
1191 char *cp;
1192 int values[4];
1193 int i = 0;
1194
1195 if (read == NULL || linestring == NULL) {
1196 PrintACEFormatErrorXML ("File end where QA line should be", id, has_errors);
1197 return 0;
1198 }
1199
1200 cp = linestring;
1201 if (*cp != 'Q') {
1202 PrintACEFormatErrorXMLStart (id, has_errors);
1203 printf ("Expected QA line, found %s", linestring);
1204 PrintACEFormatErrorXMLEnd ();
1205 return 0;
1206 }
1207 cp++;
1208 if (*cp != 'A') {
1209 PrintACEFormatErrorXMLStart (id, has_errors);
1210 printf ("Expected QA line, found %s", linestring);
1211 PrintACEFormatErrorXMLEnd ();
1212 return 0;
1213 }
1214 cp++;
1215 while (*cp != 0 && i < 4) {
1216 while (isspace (*cp)) {
1217 cp++;
1218 }
1219 if (*cp != '-' && !isdigit (*cp)) {
1220 PrintACEFormatErrorXML ("Found non-number on QA line", id, has_errors);
1221 return 0;
1222 }
1223 values[i] = atoi (cp);
1224 i++;
1225 while (*cp == '-' || isdigit (*cp)) {
1226 cp++;
1227 }
1228 }
1229 if (*cp != 0 || i < 4) {
1230 PrintACEFormatErrorXML ("Fewer than four numbers on line", id, has_errors);
1231 return 0;
1232 }
1233 if (values[0] > 0 || values[2] > 0) {
1234 if (values[0] > values[2]) {
1235 read->read_assem_start = values[0];
1236 } else {
1237 read->read_assem_start = values[2];
1238 }
1239 }
1240
1241 if (values[1] > 0 && values[3] > 0) {
1242 if (values[1] < values[3]) {
1243 read->read_assem_stop = values[1];
1244 } else {
1245 read->read_assem_stop = values[3];
1246 }
1247 } else if (values[1] > 0) {
1248 read->read_assem_stop = values[1];
1249 } else if (values[3] > 0) {
1250 read->read_assem_stop = values[3];
1251 }
1252
1253 /* adjust first gap position for start */
1254 if (read->read_assem_start > 1 && read->gaps != NULL && read->gaps->num_gaps > 0 && read->gaps->gap_offsets != NULL) {
1255 read->gaps->gap_offsets[0] -= read->read_assem_start - 1;
1256 }
1257
1258 return 1;
1259 }
1260
1261
1262 /* calculate gap info for consensus sequence */
1263 /* calculate cons_stop positions and tiling positions for each read */
s_CalculateContigOffsets(TContigPtr contig)1264 static void s_CalculateContigOffsets (TContigPtr contig)
1265 {
1266 int i;
1267
1268 if (contig == NULL) return;
1269
1270 for (i = 0; i < contig->num_reads; i++) {
1271 contig->reads[i]->tiling_start = contig->reads[i]->read_assem_start + contig->reads[i]->cons_start;
1272 contig->reads[i]->tiling_stop = contig->reads[i]->read_assem_stop + contig->reads[i]->cons_start;
1273 contig->reads[i]->cons_stop = SeqPosFromTilingPos (contig->reads[i]->tiling_stop - 1, contig->gaps) + 1;
1274 contig->reads[i]->read_start = SeqPosFromTilingPos(contig->reads[i]->read_assem_start - 1, contig->reads[i]->gaps) + 1;
1275 contig->reads[i]->read_stop = SeqPosFromTilingPos(contig->reads[i]->read_assem_stop - 1, contig->reads[i]->gaps) + 1;
1276 }
1277
1278 }
1279
1280
s_GetUngappedSeqLen(char * str,char gap_char)1281 static int s_GetUngappedSeqLen (char *str, char gap_char)
1282 {
1283 int len = 0;
1284
1285 if (str == NULL) return 0;
1286 while (*str != 0) {
1287 if (*str != gap_char) {
1288 len++;
1289 }
1290 str++;
1291 }
1292 return len;
1293 }
1294
1295
s_AddToTagComment(char * orig,char * extra)1296 static char * s_AddToTagComment (char *orig, char *extra)
1297 {
1298 char * tag = NULL;
1299 int tag_len;
1300
1301 if (orig == NULL) {
1302 tag = extra;
1303 } else if (extra == NULL) {
1304 tag = orig;
1305 } else {
1306 tag_len = strlen (orig) + strlen (extra) + 1;
1307 tag = malloc (sizeof (char) * (tag_len + 1));
1308 strcpy (tag, orig);
1309 strcat (tag, "\n");
1310 strcat (tag, extra);
1311 free (orig);
1312 free (extra);
1313 }
1314 return tag;
1315 }
1316
1317
1318 typedef enum {
1319 eTagCommentStatus_ok = 0,
1320 eTagCommentStatus_runon
1321 } ETagCommentStatus;
1322
s_ReadTagComment(FReadLineFunction readfunc,void * userdata,ETagCommentStatus * comment_status)1323 static char * s_ReadTagComment
1324 (FReadLineFunction readfunc,
1325 void * userdata,
1326 ETagCommentStatus *comment_status)
1327 {
1328 char *linestring;
1329 char *tag = NULL;
1330 char *cp = NULL;
1331 char *tmp;
1332 int tag_len = 0, end_len;
1333
1334 linestring = readfunc (userdata);
1335 while (linestring != NULL && linestring [0] != EOF && (cp = strchr (linestring, '}')) == NULL) {
1336 if (tag == NULL) {
1337 tag_len = strlen (linestring);
1338 tag = malloc (sizeof (char) * (tag_len + 1));
1339 strcpy (tag, linestring);
1340 } else {
1341 tag_len = tag_len + strlen (linestring) + 1;
1342 tmp = malloc (sizeof (char) * (tag_len + 1));
1343 strcpy (tmp, tag);
1344 strcat (tmp, "\n");
1345 strcat (tmp, linestring);
1346 free (tag);
1347 tag = tmp;
1348 }
1349 free (linestring);
1350 if (tag_len > 1000) {
1351 *comment_status = eTagCommentStatus_runon;
1352 free (tag);
1353 return NULL;
1354 }
1355 linestring = readfunc (userdata);
1356 }
1357 if (cp != NULL && cp > linestring) {
1358 end_len = cp - linestring;
1359 tag_len = tag_len + end_len + 1;
1360 tmp = malloc (sizeof (char) * (tag_len + 1));
1361 strcpy (tmp, tag);
1362 strcat (tmp, "\n");
1363 strncat (tmp, linestring, end_len);
1364 tmp[tag_len] = 0;
1365 free (tag);
1366 tag = tmp;
1367 }
1368 if (linestring != NULL) {
1369 free (linestring);
1370 }
1371
1372 return tag;
1373 }
1374
1375
1376 /* Reads the portion of and ACE file for a single contig, including the reads */
s_ReadContig(char ** initline,FReadLineFunction readfunc,void * userdata,char make_qual_scores,char * has_errors)1377 static TContigPtr s_ReadContig
1378 (char ** initline,
1379 FReadLineFunction readfunc,
1380 void * userdata,
1381 char make_qual_scores,
1382 char * has_errors)
1383 {
1384 char *linestring;
1385 char *firstline;
1386 char *cp;
1387 int len = 0, read_num = 0, num_base_segs = 0, report_read_num = 0;
1388 EFound val;
1389 char found_comp_char = 0;
1390 TContigPtr contig = NULL;
1391 ETagCommentStatus comment_status = eTagCommentStatus_ok;
1392
1393 if (initline == NULL) return NULL;
1394 firstline = *initline;
1395 if (firstline == NULL || readfunc == NULL) return NULL;
1396
1397 if (firstline [0] != 'C' || firstline [1] != 'O' || ! isspace (firstline [2])) {
1398 return NULL;
1399 }
1400
1401 contig = ContigNew ();
1402 len = strlen (firstline + 3);
1403 contig->consensus_id = malloc (len + 1);
1404 strcpy (contig->consensus_id, firstline + 3);
1405
1406 cp = contig->consensus_id;
1407 while (*cp != 0 && !isspace (*cp)) {
1408 cp++;
1409 }
1410 if (isspace (*cp)) {
1411 *cp = 0;
1412 cp++;
1413 contig->consensus_assem_len = atoi (cp);
1414 while (isdigit (*cp)) {
1415 cp++;
1416 }
1417 if (isspace (*cp)) {
1418 cp++;
1419 contig->num_reads = atoi (cp);
1420 while (isdigit (*cp)) {
1421 cp++;
1422 }
1423 if (isspace (*cp)) {
1424 cp++;
1425 num_base_segs = atoi (cp);
1426 while (isdigit (*cp)) {
1427 cp++;
1428 }
1429 if (isspace (*cp)) {
1430 cp++;
1431 found_comp_char = 1;
1432 if (*cp == 'C') {
1433 contig->is_complement = 1;
1434 } else {
1435 contig->is_complement = 0;
1436 }
1437 }
1438 }
1439 }
1440 }
1441 if (contig->consensus_assem_len == 0 || contig->num_reads == 0 || !found_comp_char) {
1442 PrintACEFormatErrorXML ("Error in consensus line", contig->consensus_id, has_errors);
1443 ContigFree (contig);
1444 return NULL;
1445 }
1446
1447 /* now copy in sequence data */
1448 contig->consensus_seq = s_ReadSequenceFromFile (contig->consensus_assem_len, readfunc, userdata, contig->consensus_id, has_errors);
1449 if (contig->consensus_seq == NULL) {
1450 ContigFree (contig);
1451 return NULL;
1452 }
1453
1454 /* record actual length of consensus seq */
1455 contig->consensus_seq_len = s_GetUngappedSeqLen (contig->consensus_seq, '*');
1456
1457 /* calculate gap info */
1458 contig->gaps = GapInfoFromSequenceString (contig->consensus_seq, '*');
1459
1460 /* read quality scores */
1461 if (make_qual_scores) {
1462 val = s_ReadQualScores (contig, readfunc, userdata);
1463 if (val != eNone && val != eJustRight) {
1464 s_ReportFound (val, "quality scores", contig->consensus_id, has_errors);
1465 ContigFree (contig);
1466 return NULL;
1467 }
1468 } else {
1469 s_SkipQualScores (readfunc, userdata);
1470 }
1471
1472 /* collect reads */
1473 val = s_ReadAFLines (contig, readfunc, userdata, &linestring);
1474 if (val != eJustRight) {
1475 s_ReportFound (val, "AF lines", contig->consensus_id, has_errors);
1476 ContigFree (contig);
1477 if (linestring != NULL) free (linestring);
1478 return NULL;
1479 }
1480
1481 if (num_base_segs > 0) {
1482 val = s_ReadBaseSegs (contig, num_base_segs, &linestring, readfunc, userdata);
1483 if (val != eJustRight) {
1484 s_ReportFound (val, "base segments", contig->consensus_id, has_errors);
1485 ContigFree (contig);
1486 return NULL;
1487 }
1488 }
1489
1490
1491 read_num = 0;
1492 while (linestring != NULL && linestring [0] != EOF) {
1493 if (linestring [0] == 'R' && linestring[1] == 'D' && isspace (linestring [2])) {
1494 len = strlen (contig->reads[read_num]->read_id);
1495 if (strncmp (linestring + 3, contig->reads[read_num]->read_id, len) != 0
1496 || !isspace (linestring [3 + len])) {
1497 PrintACEFormatErrorXML ("Read IDs out of order!", contig->consensus_id, has_errors);
1498 ContigFree (contig);
1499 return NULL;
1500 }
1501 len = atoi (linestring + 3 + len);
1502 contig->reads[read_num]->read_seq = s_ReadSequenceFromFile (len, readfunc, userdata, contig->reads[read_num]->read_id, has_errors);
1503 if (contig->reads[read_num]->read_seq == NULL) {
1504 ContigFree (contig);
1505 return NULL;
1506 }
1507 s_AdjustContigReadForTerminalNs (contig->reads[read_num]);
1508 contig->reads[read_num]->read_len = s_GetUngappedSeqLen (contig->reads[read_num]->read_seq, '*');
1509 contig->reads[read_num]->gaps = GapInfoFromSequenceString (contig->reads[read_num]->read_seq, '*');
1510 read_num++;
1511 report_read_num = read_num - 1;
1512 } else if (linestring [0] == 'Q' && linestring[1] == 'A' && isspace (linestring[2])) {
1513 if (read_num < 1) {
1514 PrintACEFormatErrorXML ("Found QA line before RD!", contig->consensus_id, has_errors);
1515 ContigFree (contig);
1516 return NULL;
1517 } else if (!ApplyQALineToRead (contig->reads[report_read_num], linestring, contig->reads[report_read_num]->read_id, has_errors)) {
1518 PrintACEFormatErrorXML ("Error in QA line format!", contig->reads[report_read_num]->read_id, has_errors);
1519 ContigFree (contig);
1520 return NULL;
1521 }
1522 } else if (linestring[0] == 'D' && linestring[1] == 'S' && (isspace (linestring[2]) || linestring[2] == 0)) {
1523 /* skip DS lines */
1524 } else if (strncmp (linestring, "RT{", 3) == 0) {
1525 contig->reads[read_num - 1]->tag = s_AddToTagComment (contig->reads[report_read_num]->tag, s_ReadTagComment (readfunc, userdata, &comment_status));
1526 } else if (strncmp (linestring, "WR{", 3) == 0) {
1527 contig->reads[read_num - 1]->tag = s_AddToTagComment (contig->reads[report_read_num]->tag, s_ReadTagComment (readfunc, userdata, &comment_status));
1528 } else if (strncmp (linestring, "CT{", 3) == 0) {
1529 contig->tag = s_AddToTagComment (contig->tag, s_ReadTagComment (readfunc, userdata, &comment_status));
1530 } else if (strncmp (linestring, "WA{", 3) == 0) {
1531 contig->tag = s_AddToTagComment (contig->tag, s_ReadTagComment (readfunc, userdata, &comment_status));
1532 } else if (linestring[0] != 0) {
1533 /* found next line */
1534 *initline = linestring;
1535 s_AdjustContigForTerminalNs (contig);
1536 s_CalculateContigOffsets (contig);
1537 return contig;
1538 }
1539 free (linestring);
1540 if (comment_status == eTagCommentStatus_runon) {
1541 PrintACEFormatErrorXML ("Error in CT line - comment tag is unusally long, suspect missing terminating }", contig->reads[report_read_num]->read_id, has_errors);
1542 ContigFree (contig);
1543 return NULL;
1544 }
1545
1546 linestring = readfunc (userdata);
1547 }
1548 *initline = NULL;
1549 s_AdjustContigForTerminalNs (contig);
1550 s_CalculateContigOffsets (contig);
1551 return contig;
1552 }
1553
1554
1555 /* Used to detect errors in ACE file formatting */
s_UnexpectedLineBetweenContigs(char * linestring)1556 static char s_UnexpectedLineBetweenContigs (char *linestring)
1557 {
1558 if (linestring == NULL) {
1559 return 0;
1560 } else if (linestring [0] == 'A' && linestring [1] == 'F') {
1561 return 1;
1562 } else if (linestring [0] == 'R' && linestring [1] == 'D') {
1563 return 1;
1564 } else {
1565 return 0;
1566 }
1567 }
1568
1569
1570 /* This is the main function for reading in an ACE file */
1571 extern TACEFilePtr
ReadACEFile(FReadLineFunction readfunc,void * userdata,char make_qual_scores,char * has_errors)1572 ReadACEFile
1573 (FReadLineFunction readfunc,
1574 void * userdata,
1575 char make_qual_scores,
1576 char * has_errors)
1577 {
1578 char * linestring;
1579 TACEFilePtr afp;
1580 char * cp;
1581 int contig_num = 0, read_num = 0;
1582 int num_reads_expected = 0;
1583
1584 if (readfunc == NULL) {
1585 return NULL;
1586 }
1587
1588 afp = ACEFileNew ();
1589 if (afp == NULL) {
1590 return NULL;
1591 }
1592
1593 linestring = readfunc (userdata);
1594
1595 while (linestring != NULL && linestring [0] != EOF) {
1596 if (linestring [0] == 'A' && linestring [1] == 'S' && isspace (linestring [2])) {
1597 if (num_reads_expected > 0) {
1598 PrintACEFormatErrorXML ("Two file header lines!", NULL, has_errors);
1599 ACEFileFree (afp);
1600 free (linestring);
1601 return NULL;
1602 }
1603 /* first line in file, number of contigs */
1604 cp = linestring + 3;
1605 afp->num_contigs = atoi (cp);
1606 afp->contigs = malloc (afp->num_contigs * sizeof (TContigPtr));
1607 if (afp->contigs == NULL) {
1608 PrintACEFormatErrorXML ("Memory allocation failed!", NULL, has_errors);
1609 free (linestring);
1610 ACEFileFree (afp);
1611 return NULL;
1612 }
1613 while (isdigit (*cp)) {
1614 cp++;
1615 }
1616 num_reads_expected = atoi (cp);
1617 free (linestring);
1618 linestring = readfunc (userdata);
1619 } else if (linestring [0] == 'C' && linestring [1] == 'O' && isspace (linestring [2])) {
1620 if (contig_num >= afp->num_contigs) {
1621 PrintACEFormatErrorXML ("Too many contigs!", NULL, has_errors);
1622 free (linestring);
1623 ACEFileFree (afp);
1624 return NULL;
1625 }
1626 afp->contigs[contig_num] = s_ReadContig (&linestring, readfunc, userdata, make_qual_scores, has_errors);
1627 if (afp->contigs[contig_num] == NULL) {
1628 PrintACEFormatErrorXMLStart (NULL, has_errors);
1629 printf ("Unable to read contig (%d)", contig_num);
1630 PrintACEFormatErrorXMLEnd ();
1631 ACEFileFree (afp);
1632 return NULL;
1633 }
1634 read_num += afp->contigs[contig_num]->num_reads;
1635 contig_num++;
1636 } else if (s_UnexpectedLineBetweenContigs (linestring)) {
1637 PrintACEFormatErrorXMLStart (NULL, has_errors);
1638 printf ("Unexpected line after contig %d:%s", read_num, linestring);
1639 PrintACEFormatErrorXMLEnd ();
1640 free (linestring);
1641 ACEFileFree (afp);
1642 return NULL;
1643 } else {
1644 free (linestring);
1645 linestring = readfunc (userdata);
1646 }
1647 }
1648 if (contig_num < afp->num_contigs) {
1649 PrintACEFormatErrorXML ("Not enough contigs!", NULL, has_errors);
1650 ACEFileFree (afp);
1651 afp = NULL;
1652 } else if (read_num < num_reads_expected) {
1653 PrintACEFormatErrorXML ("Not enough reads!", NULL, has_errors);
1654 ACEFileFree (afp);
1655 afp = NULL;
1656 }
1657
1658 return afp;
1659 }
1660
1661
1662 /* This function writes out sequence characters, 60 per line. */
s_WriteSeq(FILE * fp,char * seq)1663 static void s_WriteSeq (FILE *fp, char *seq)
1664 {
1665 int i;
1666 char * cp;
1667
1668 if (fp == NULL || seq == NULL) return;
1669 cp = seq;
1670 while (*cp != 0) {
1671 for (i = 0; i < 60 && *cp != 0; i++, cp++) {
1672 fprintf (fp, "%c", *cp);
1673 }
1674 fprintf (fp, "\n");
1675 }
1676 }
1677
1678
1679 /* This function writes out quality scores in the ACE file format. */
s_WriteQualScores(FILE * fp,TContigPtr contig)1680 static void s_WriteQualScores (FILE *fp, TContigPtr contig)
1681 {
1682 int q_pos, line_pos;
1683
1684 if (fp == NULL || contig == NULL || contig->num_qual_scores == 0) return;
1685
1686 fprintf (fp, "BQ\n");
1687 q_pos = 0;
1688 while (q_pos < contig->num_qual_scores) {
1689 line_pos = 0;
1690 while (line_pos < 60 && q_pos < contig->num_qual_scores) {
1691 if (contig->consensus_seq[q_pos] != '*') {
1692 fprintf (fp, "%d ", contig->qual_scores[q_pos]);
1693 line_pos++;
1694 }
1695 q_pos++;
1696 }
1697 fprintf (fp, "\n");
1698 }
1699 }
1700
1701
1702 /* NOTE - this file does not provide all of the information required for an ACE file. */
s_WriteContig(FILE * fp,TContigPtr contig)1703 static void s_WriteContig (FILE *fp, TContigPtr contig)
1704 {
1705 int i;
1706
1707 if (contig == NULL) return;
1708
1709 fprintf (fp, "CO %s %d %d\n\n", contig->consensus_id, contig->consensus_assem_len, contig->num_reads);
1710 s_WriteSeq (fp, contig->consensus_seq);
1711 fprintf (fp, "\n");
1712
1713 s_WriteQualScores (fp, contig);
1714 fprintf (fp, "\n");
1715
1716 for (i = 0; i < contig->num_reads; i++) {
1717 fprintf (fp, "AF %s %c %d\n", contig->reads[i]->read_id,
1718 contig->reads[i]->is_complement ? 'C' : 'U',
1719 contig->reads[i]->cons_start + 1);
1720 }
1721 fprintf (fp, "\n");
1722 for (i = 0; i < contig->num_reads; i++) {
1723 fprintf (fp, "RD %s %d\n", contig->reads[i]->read_id, (int) strlen (contig->reads[i]->read_seq));
1724 s_WriteSeq (fp, contig->reads[i]->read_seq);
1725 fprintf (fp, "\n");
1726 }
1727
1728 }
1729
1730
1731 /* NOTE - This generates an incomplete ACE file - the data structure we currently use
1732 * does not provide enough data to create a complete ACE file.
1733 */
WriteACEFile(FILE * fp,TACEFilePtr afp)1734 extern void WriteACEFile (FILE *fp, TACEFilePtr afp)
1735 {
1736 int i, tot_reads = 0;
1737 if (fp == NULL || afp == NULL) return;
1738
1739 for (i = 0; i < afp->num_contigs; i++) {
1740 tot_reads += afp->contigs[i]->num_reads;
1741 }
1742 fprintf (fp, "AS %d %d\n\n", afp->num_contigs, tot_reads);
1743
1744 for (i = 0; i < afp->num_contigs; i++) {
1745 s_WriteContig (fp, afp->contigs[i]);
1746 }
1747 }
1748
1749
1750 /* This function generates a string that uses the FASTA+GAP method for specifying gaps
1751 * (dashes instead of asterisks)
1752 */
1753 static char *
s_AlignmentSeqFromContigSeq(char * contig_seq,int cons_start,int aln_len,int read_start,int read_stop)1754 s_AlignmentSeqFromContigSeq
1755 (char *contig_seq,
1756 int cons_start,
1757 int aln_len,
1758 int read_start,
1759 int read_stop)
1760 {
1761 char * aln_seq;
1762 char * cp;
1763 int pos = 0, i;
1764
1765 aln_seq = malloc (sizeof (char) * (aln_len + 1));
1766 /* pad start */
1767 for (i = 0; i < cons_start; i++) {
1768 aln_seq[pos] = '-';
1769 pos++;
1770 }
1771 cp = contig_seq;
1772 if (read_start > 1) {
1773 i = 1;
1774 while (*cp != 0 && i < read_start) {
1775 aln_seq[pos] = '-';
1776 pos++;
1777 cp++;
1778 i++;
1779 }
1780 }
1781 while (*cp != 0 && (read_stop < 1 || i < read_stop)) {
1782 if (*cp == '*') {
1783 aln_seq[pos] = '-';
1784 } else {
1785 aln_seq[pos] = *cp;
1786 }
1787 pos++;
1788 cp++;
1789 i++;
1790 }
1791 while (pos < aln_len) {
1792 aln_seq[pos] = '-';
1793 pos++;
1794 }
1795 aln_seq[pos] = 0;
1796 return aln_seq;
1797 }
1798
1799
s_FarPointerIdFromReadId(char * read_id)1800 static char * s_FarPointerIdFromReadId (char * read_id)
1801 {
1802 char * far_id = NULL;
1803 far_id = malloc (sizeof (char) * (strlen (read_id) + 1));
1804 strcpy (far_id, read_id);
1805 return far_id;
1806 }
1807
1808
1809 /* This function generates an intermediate data format suitable for generating
1810 * a SeqEntry with an alignment.
1811 */
AlignmentFileFromContig(TContigPtr contig)1812 extern TAlignmentFilePtr AlignmentFileFromContig (TContigPtr contig)
1813 {
1814 TAlignmentFilePtr aln;
1815 int i, len;
1816 int consensus_pad = 0, pad, end_pad = 0, aln_len;
1817
1818 if (contig == NULL) return NULL;
1819
1820 aln = AlignmentFileNew ();
1821 aln->num_sequences = 1 + contig->num_reads;
1822 aln->num_organisms = 0;
1823 aln->num_deflines = 0;
1824 aln->num_segments = 1;
1825 aln->ids = malloc (sizeof (char *) * aln->num_sequences);
1826 aln->sequences = malloc (sizeof (char *) * aln->num_sequences);
1827 aln->organisms = NULL;
1828 aln->deflines = NULL;
1829 aln->align_format_found = 1;
1830 /* calculate padding for consensus */
1831 for (i = 0; i < contig->num_reads; i++) {
1832 if (contig->reads[i]->cons_start < 0) {
1833 pad = 0 - contig->reads[i]->cons_start;
1834 if (consensus_pad < pad) {
1835 consensus_pad = pad;
1836 }
1837 }
1838 len = contig->reads[i]->cons_start + strlen (contig->reads[i]->read_seq);
1839 if (len > contig->consensus_assem_len) {
1840 pad = len - contig->consensus_assem_len;
1841 if (pad > end_pad) {
1842 end_pad = pad;
1843 }
1844 }
1845 }
1846 aln_len = consensus_pad + contig->consensus_assem_len + end_pad;
1847 /* seq for consensus */
1848 len = strlen (contig->consensus_id);
1849 aln->ids[0] = malloc (sizeof (char) * (len + 1));
1850 strcpy (aln->ids[0], contig->consensus_id);
1851 aln->sequences[0] = s_AlignmentSeqFromContigSeq (contig->consensus_seq,
1852 consensus_pad,
1853 aln_len, 0, 0);
1854 for (i = 1; i < aln->num_sequences; i++) {
1855 len = strlen (contig->reads[i - 1]->read_id);
1856 aln->ids[i] = s_FarPointerIdFromReadId (contig->reads[i - 1]->read_id);
1857 aln->sequences[i] = s_AlignmentSeqFromContigSeq (contig->reads[i - 1]->read_seq,
1858 consensus_pad + contig->reads[i - 1]->cons_start,
1859 aln_len,
1860 contig->reads[i - 1]->read_assem_start,
1861 contig->reads[i - 1]->read_assem_stop);
1862 }
1863 return aln;
1864 }
1865
1866
1867 /* The Trace Archive Gap String is a list of the number of nucleotides to skip before adding the next gap */
TraceArchiveGapStringFromACESequence(char * seq_str)1868 extern char * TraceArchiveGapStringFromACESequence (char *seq_str)
1869 {
1870 char *cp;
1871 char * gap_str = NULL;
1872 char * print_pos;
1873 int len = 0, pos;
1874
1875 if (seq_str == NULL) return NULL;
1876
1877 /* first determine length of gap string */
1878 cp = seq_str;
1879 while (*cp != 0) {
1880 if (*cp == '*' || *cp == '-') {
1881 len++;
1882 }
1883 cp++;
1884 }
1885 len = 15 * len + 1;
1886 gap_str = malloc (sizeof (char) * len);
1887 cp = seq_str;
1888 print_pos = gap_str;
1889 pos = 0;
1890 while (*cp != 0) {
1891 if (*cp == '*' || *cp == '-') {
1892 sprintf (print_pos, "%d,", pos);
1893 print_pos += strlen (print_pos);
1894 pos = 0;
1895 } else {
1896 pos++;
1897 }
1898 cp++;
1899 }
1900 /* trim final comma */
1901 print_pos[strlen(print_pos) - 1] = 0;
1902 return gap_str;
1903 }
1904
1905
1906 /* NOTE - These functions are currently incomplete */
WriteTraceArchiveRead(FILE * fp,TContigReadPtr read)1907 extern void WriteTraceArchiveRead (FILE *fp, TContigReadPtr read)
1908 {
1909 char *cp;
1910 if (fp == NULL || read == NULL) {
1911 return;
1912 }
1913
1914 fprintf (fp, "<trace>\n");
1915 fprintf (fp, "<trace_name>%s</trace_name>\n", read->read_id);
1916 fprintf (fp, "<traceconsensus>");
1917 cp = read->read_seq;
1918 while (*cp != 0) {
1919 if (*cp != '*') {
1920 fprintf (fp, "%c", *cp);
1921 }
1922 cp++;
1923 }
1924 fprintf (fp, "</traceconsensus>\n");
1925 cp = TraceArchiveGapStringFromACESequence (read->read_seq);
1926 fprintf (fp, "<tracegaps>%s</tracegaps>\n", cp);
1927 free (cp);
1928 fprintf (fp, "</trace>\n");
1929 }
1930
1931
s_GetTokenLen(char * str)1932 static int s_GetTokenLen (char *str)
1933 {
1934 char *cp;
1935 int len = 0;
1936
1937 if (str == NULL) return 0;
1938
1939 cp = str;
1940 while (*cp != 0 && !isspace (*cp)) {
1941 len++;
1942 cp++;
1943 }
1944 return len;
1945 }
1946
1947
s_SkipTokens(char * str,int num_tokens)1948 static char * s_SkipTokens (char *str, int num_tokens)
1949 {
1950 char *cp;
1951 int i;
1952
1953 if (str == NULL || num_tokens < 0) return NULL;
1954
1955 cp = str;
1956 /* skip leading whitespace */
1957 while (isspace (*cp)) {
1958 cp++;
1959 }
1960
1961 for (i = 0; i < num_tokens && *cp != 0; i++) {
1962 /* skip token */
1963 while (*cp != 0 && !isspace (*cp)) {
1964 cp++;
1965 }
1966 /* skip trailing whitespace */
1967 while (isspace (*cp)) {
1968 cp++;
1969 }
1970 }
1971 if (*cp == 0) {
1972 return NULL;
1973 } else {
1974 return cp;
1975 }
1976 }
1977
1978
1979 /* for reading other formats */
1980 extern TContigReadPtr
ReadContigFromString(char * str,char ** consensus_id,int id_col,int seq_col,int contig_id_col,int strand_col,int start_col,int interpret_n_col)1981 ReadContigFromString
1982 (char *str,
1983 char **consensus_id,
1984 int id_col,
1985 int seq_col,
1986 int contig_id_col,
1987 int strand_col,
1988 int start_col,
1989 int interpret_n_col
1990 )
1991 {
1992 TContigReadPtr read = NULL;
1993 char *cp;
1994 int len, col_num = 1, n_is_gap = 0;
1995 int max_col;
1996
1997 if (str == NULL) {
1998 return NULL;
1999 }
2000
2001 max_col = id_col;
2002 if (seq_col > max_col) {
2003 max_col = seq_col;
2004 }
2005 if (contig_id_col > max_col) {
2006 max_col = contig_id_col;
2007 }
2008 if (strand_col > max_col) {
2009 max_col = strand_col;
2010 }
2011 if (start_col > max_col) {
2012 max_col = start_col;
2013 }
2014 if (interpret_n_col > max_col) {
2015 max_col = interpret_n_col;
2016 }
2017
2018 read = ContigReadNew ();
2019
2020 cp = str;
2021 len = s_GetTokenLen (cp);
2022 while (cp != NULL && *cp != 0 && col_num <= max_col) {
2023 if (id_col == col_num) {
2024 read->read_id = malloc (len + 1);
2025 strncpy (read->read_id, cp, len);
2026 read->read_id[len] = 0;
2027 } else if (seq_col == col_num) {
2028 read->read_seq = malloc (len + 1);
2029 strncpy (read->read_seq, cp, len);
2030 read->read_seq[len] = 0;
2031 read->read_len = len;
2032 } else if (contig_id_col == col_num) {
2033 if (consensus_id != NULL) {
2034 *consensus_id = malloc (len + 1);
2035 strncpy (*consensus_id, cp, len);
2036 (*consensus_id)[len] = 0;
2037 }
2038 } else if (strand_col == col_num) {
2039 if (*cp == 'R' || *cp == '-') {
2040 read->is_complement = 1;
2041 }
2042 } else if (start_col == col_num) {
2043 read->cons_start = atoi (cp);
2044 } else if (interpret_n_col == col_num) {
2045 if (*cp == 'I') {
2046 n_is_gap = 1;
2047 }
2048 }
2049 /* advance to next token */
2050 col_num++;
2051 cp = s_SkipTokens (cp, 1);
2052 len = s_GetTokenLen (cp);
2053 }
2054
2055 if (max_col > col_num) {
2056 ContigReadFree (read);
2057 read = NULL;
2058 } else {
2059 read->cons_stop = read->cons_start + read->read_len - 1;
2060 read->tiling_start = read->cons_start;
2061 read->tiling_stop = read->cons_stop;
2062 read->read_assem_start = 0;
2063 read->read_assem_stop = read->read_len - 1;
2064 read->read_start = 1;
2065 read->read_stop = read->read_len;
2066 if (n_is_gap) {
2067 /* adjust for gaps */
2068 read->gaps = GapInfoFromSequenceString (read->read_seq, 'N');
2069 if (read->gaps->num_gaps > 0) {
2070 RemoveGapCharsFromSequenceString (read->read_seq, 'N');
2071 read->read_stop -= read->gaps->num_gaps;
2072 read->read_len -= read->gaps->num_gaps;
2073 }
2074 }
2075 }
2076
2077 return read;
2078 }
2079
2080
ReadFromMAQString(char * str,char ** consensus_id)2081 extern TContigReadPtr ASSEMBLY_CALLBACK ReadFromMAQString (char *str, char **consensus_id)
2082 {
2083 TContigReadPtr read = NULL;
2084
2085 read = ReadContigFromString (str, consensus_id, 1, 15, 2, 4, 3, 0);
2086 return read;
2087 }
2088
2089
ReadFromElandMostCompressed(char * str,char ** consensus_id)2090 extern TContigReadPtr ASSEMBLY_CALLBACK ReadFromElandMostCompressed (char *str, char **consensus_id)
2091 {
2092 TContigReadPtr read = NULL;
2093
2094 read = ReadContigFromString (str, consensus_id, 0, 1, 0, 5, 4, 0);
2095 return read;
2096 }
2097
2098
ReadFromElandSanger(char * str,char ** consensus_id)2099 extern TContigReadPtr ASSEMBLY_CALLBACK ReadFromElandSanger (char *str, char **consensus_id)
2100 {
2101 TContigReadPtr read = NULL;
2102
2103 read = ReadContigFromString (str, consensus_id, 1, 2, 4, 6, 5, 0);
2104 return read;
2105 }
2106
2107
ReadFromElandStandalone(char * str,char ** consensus_id)2108 extern TContigReadPtr ASSEMBLY_CALLBACK ReadFromElandStandalone (char *str, char **consensus_id)
2109 {
2110 TContigReadPtr read = NULL;
2111
2112 read = ReadContigFromString (str, consensus_id, 1, 2, 7, 9, 8, 10);
2113 return read;
2114 }
2115
2116
2117 #define READ_BLOCK_SIZE 50
2118
2119 typedef struct ReadList {
2120 TContigReadPtr reads[READ_BLOCK_SIZE];
2121 int num_reads;
2122 struct ReadList * next;
2123 } SReadList, * TReadListPtr;
2124
2125
ReadListNew()2126 static TReadListPtr ReadListNew ()
2127 {
2128 TReadListPtr r;
2129
2130 r = malloc (sizeof (SReadList));
2131 r->num_reads = 0;
2132 r->next = NULL;
2133 return r;
2134 }
2135
2136
ReadListFree(TReadListPtr r)2137 static TReadListPtr ReadListFree (TReadListPtr r)
2138 {
2139 TReadListPtr r_next;
2140 int i;
2141
2142 while (r != NULL) {
2143 for (i = 0; i < r->num_reads; i++) {
2144 ContigReadFree (r->reads[i]);
2145 }
2146 r_next = r;
2147 free (r);
2148 r = r_next;
2149 }
2150 return r;
2151 }
2152
2153
AddToReadList(TContigReadPtr read,TReadListPtr read_list)2154 static TReadListPtr AddToReadList (TContigReadPtr read, TReadListPtr read_list)
2155 {
2156 if (read_list == NULL) {
2157 read_list = ReadListNew();
2158 } else {
2159 while (read_list->next != NULL && read_list->num_reads == READ_BLOCK_SIZE) {
2160 read_list = read_list->next;
2161 }
2162 if (read_list->num_reads == READ_BLOCK_SIZE) {
2163 read_list->next = ReadListNew();
2164 read_list = read_list->next;
2165 }
2166 }
2167 read_list->reads[read_list->num_reads++] = read;
2168 return read_list;
2169 }
2170
2171
2172 typedef struct ConsensusReads {
2173 TContigPtr contig;
2174 TReadListPtr read_list;
2175 TReadListPtr last_read;
2176 struct ConsensusReads * next;
2177 } SConsensusReads, * TConsensusReadsPtr;
2178
2179
ConsensusReadsNew(char * consensus_id)2180 static TConsensusReadsPtr ConsensusReadsNew (char *consensus_id)
2181 {
2182 TConsensusReadsPtr c;
2183
2184 c = malloc (sizeof (SConsensusReads));
2185 c->contig = ContigNew ();
2186 if (consensus_id != NULL) {
2187 c->contig->consensus_id = malloc (strlen (consensus_id) + 1);
2188 strcpy (c->contig->consensus_id, consensus_id);
2189 }
2190
2191 c->read_list = NULL;
2192 c->last_read = NULL;
2193 c->next = NULL;
2194 return c;
2195 }
2196
2197
ConsensusReadsFree(TConsensusReadsPtr c)2198 static TConsensusReadsPtr ConsensusReadsFree (TConsensusReadsPtr c)
2199 {
2200 TConsensusReadsPtr c_next;
2201
2202 while (c != NULL) {
2203 c_next = c->next;
2204 ContigFree (c->contig);
2205 c->read_list = ReadListFree (c->read_list);
2206 free (c);
2207 c = c_next;
2208 }
2209 return c;
2210 }
2211
2212
AddReadToConsensusReads(TConsensusReadsPtr c,TContigReadPtr read)2213 static void AddReadToConsensusReads (TConsensusReadsPtr c, TContigReadPtr read)
2214 {
2215 if (c != NULL && read != NULL) {
2216 c->last_read = AddToReadList (read, c->read_list);
2217 if (c->read_list == NULL) {
2218 c->read_list = c->last_read;
2219 }
2220 }
2221 }
2222
2223 #define CONSENSUS_BLOCK_SIZE 50
2224
2225 typedef struct ConsensusReadsList {
2226 TConsensusReadsPtr contigs[CONSENSUS_BLOCK_SIZE];
2227 int num_contigs;
2228 struct ConsensusReadsList * next;
2229 } SConsensusReadsList, * TConsensusReadsListPtr;
2230
2231
ConsensusReadsListNew()2232 static TConsensusReadsListPtr ConsensusReadsListNew ()
2233 {
2234 TConsensusReadsListPtr c;
2235
2236 c = malloc (sizeof (SConsensusReadsList));
2237 c->num_contigs = 0;
2238 c->next = NULL;
2239 return c;
2240 }
2241
2242
ConsensusReadsListFree(TConsensusReadsListPtr c)2243 static TConsensusReadsListPtr ConsensusReadsListFree (TConsensusReadsListPtr c)
2244 {
2245 TConsensusReadsListPtr c_next;
2246 int i;
2247
2248 while (c != NULL) {
2249 c_next = c->next;
2250 for (i = 0; i < c->num_contigs; i++) {
2251 c->contigs[i] = ConsensusReadsFree (c->contigs[i]);
2252 }
2253 free (c);
2254 c = c_next;
2255 }
2256 return c;
2257 }
2258
2259
FindConsensusIDInConsensusReadsList(TConsensusReadsListPtr c,char * consensus_id)2260 static TConsensusReadsPtr FindConsensusIDInConsensusReadsList (TConsensusReadsListPtr c, char *consensus_id)
2261 {
2262 int i;
2263 TConsensusReadsPtr r = NULL;
2264
2265 if (consensus_id == NULL) {
2266 return NULL;
2267 }
2268 while (c != NULL && r == NULL) {
2269 for (i = 0; i < c->num_contigs && r == NULL; i++) {
2270 if (c->contigs[i] != NULL
2271 && c->contigs[i]->contig != NULL
2272 && strcmp (c->contigs[i]->contig->consensus_id, consensus_id) == 0) {
2273 r = c->contigs[i];
2274 }
2275 }
2276 c = c->next;
2277 }
2278 return r;
2279 }
2280
2281
2282 static TConsensusReadsListPtr
AddConsensusReadToConsensusReadsList(TConsensusReadsListPtr c,char * consensus_id,TContigReadPtr read)2283 AddConsensusReadToConsensusReadsList
2284 (TConsensusReadsListPtr c,
2285 char * consensus_id,
2286 TContigReadPtr read)
2287 {
2288 TConsensusReadsPtr r = NULL;
2289
2290 if (c == NULL) {
2291 c = ConsensusReadsListNew ();
2292 r = ConsensusReadsNew(consensus_id);
2293 c->contigs[c->num_contigs++] = r;
2294 } else {
2295 r = FindConsensusIDInConsensusReadsList (c, consensus_id);
2296 if (r == NULL) {
2297 while (c->next != NULL && c->num_contigs == CONSENSUS_BLOCK_SIZE) {
2298 c = c->next;
2299 }
2300 if (c->num_contigs == CONSENSUS_BLOCK_SIZE) {
2301 c->next = ConsensusReadsListNew ();
2302 c = c->next;
2303 }
2304 r = ConsensusReadsNew(consensus_id);
2305 c->contigs[c->num_contigs++] = r;
2306 }
2307 }
2308 AddReadToConsensusReads (r, read);
2309
2310 return c;
2311 }
2312
2313
MoveReadsToContigFromReadList(TContigPtr contig,TReadListPtr read_list)2314 static void MoveReadsToContigFromReadList (TContigPtr contig, TReadListPtr read_list)
2315 {
2316 TReadListPtr r;
2317 int n = 0, i;
2318
2319 if (contig == NULL) {
2320 return;
2321 }
2322
2323 for (r = read_list; r != NULL; r = r->next) {
2324 n += r->num_reads;
2325 }
2326
2327 contig->num_reads = n;
2328 contig->reads = malloc (contig->num_reads * sizeof (TContigReadPtr));
2329 n = 0;
2330
2331 for (r = read_list; r != NULL; r = r->next) {
2332 for (i = 0; i < r->num_reads; i++) {
2333 contig->reads[n++] = r->reads[i];
2334 r->reads[i] = NULL;
2335 }
2336 r->num_reads = 0;
2337 }
2338 }
2339
2340
ACEFileFromConsensusReadsList(TConsensusReadsListPtr contig_list)2341 static TACEFilePtr ACEFileFromConsensusReadsList (TConsensusReadsListPtr contig_list)
2342 {
2343 TACEFilePtr afp = NULL;
2344 TConsensusReadsListPtr c;
2345 int i, n = 0;
2346
2347 if (contig_list == NULL || contig_list->num_contigs == 0) {
2348 return NULL;
2349 }
2350
2351 afp = ACEFileNew ();
2352 for (c = contig_list; c != NULL; c=c->next) {
2353 afp->num_contigs += c->num_contigs;
2354 }
2355
2356 afp->contigs = malloc (afp->num_contigs * sizeof (TContigPtr));
2357 for (c = contig_list; c != NULL; c = c->next) {
2358 for (i = 0; i < c->num_contigs; i++) {
2359 MoveReadsToContigFromReadList (c->contigs[i]->contig, c->contigs[i]->read_list);
2360 afp->contigs[n++] = c->contigs[i]->contig;
2361 c->contigs[i]->contig = NULL;
2362 c->contigs[i]->last_read = NULL;
2363 }
2364 c->num_contigs = 0;
2365 }
2366 return afp;
2367 }
2368
2369
ReadAssemblyFile(FReadLineFunction readfunc,void * fileuserdata,FReadFromStringFunction makeread_func)2370 extern TACEFilePtr ReadAssemblyFile
2371 (FReadLineFunction readfunc, /* function for reading lines of
2372 * alignment file
2373 */
2374 void * fileuserdata, /* data to be passed back each time
2375 * readfunc is invoked
2376 */
2377 FReadFromStringFunction makeread_func) /* function to transform a string into a read */
2378 {
2379 TACEFilePtr afp = NULL;
2380 TContigReadPtr read;
2381 TConsensusReadsListPtr contig_list = NULL, contig_last = NULL;
2382 char *linestring;
2383 char *consensus_id = NULL;
2384
2385 if (readfunc == NULL || makeread_func == NULL) {
2386 return NULL;
2387 }
2388 linestring = readfunc (fileuserdata);
2389
2390 while (linestring != NULL && linestring [0] != EOF) {
2391 /* get ContigRead */
2392 read = makeread_func (linestring, &consensus_id);
2393
2394 /* group with other ContigReads from the same consensus_id */
2395 if (read != NULL && consensus_id != NULL) {
2396 contig_last = AddConsensusReadToConsensusReadsList (contig_last, consensus_id, read);
2397 if (contig_list == NULL) {
2398 contig_list = contig_last;
2399 }
2400 read = NULL;
2401 }
2402 if (consensus_id != NULL) {
2403 free (consensus_id);
2404 consensus_id = NULL;
2405 }
2406 ContigReadFree (read);
2407 free (linestring);
2408 linestring = readfunc (fileuserdata);
2409 }
2410
2411 afp = ACEFileFromConsensusReadsList (contig_list);
2412 contig_list = ConsensusReadsListFree (contig_list);
2413 return afp;
2414 }
2415
2416
ReadMAQFile(FReadLineFunction readfunc,void * fileuserdata)2417 extern TACEFilePtr ReadMAQFile
2418 (FReadLineFunction readfunc, /* function for reading lines of
2419 * alignment file
2420 */
2421 void * fileuserdata) /* data to be passed back each time
2422 * readfunc is invoked
2423 */
2424 {
2425 return ReadAssemblyFile (readfunc, fileuserdata, ReadFromMAQString);
2426 }
2427
2428
ReadElandStandaloneFile(FReadLineFunction readfunc,void * fileuserdata)2429 extern TACEFilePtr ReadElandStandaloneFile
2430 (FReadLineFunction readfunc, /* function for reading lines of
2431 * alignment file
2432 */
2433 void * fileuserdata) /* data to be passed back each time
2434 * readfunc is invoked
2435 */
2436 {
2437 return ReadAssemblyFile (readfunc, fileuserdata, ReadFromElandStandalone);
2438 }
2439
2440
2441 /* functions for writing out XML */
WriteTraceGapsXML(TGapInfoPtr gap_info,FILE * fp)2442 static void WriteTraceGapsXML (TGapInfoPtr gap_info, FILE *fp)
2443 {
2444 int i;
2445
2446 if (gap_info != NULL && fp != NULL) {
2447 fprintf (fp, " <ntracegaps>%d</ntracegaps>\n", gap_info->num_gaps);
2448 if (gap_info->num_gaps > 0) {
2449 fprintf (fp, " <tracegaps source=\"INLINE\">");
2450 for (i = 0; i < gap_info->num_gaps - 1; i++) {
2451 fprintf (fp, "%d ", gap_info->gap_offsets[i]);
2452 }
2453 fprintf (fp, "%d</tracegaps>\n", gap_info->gap_offsets[gap_info->num_gaps - 1]);
2454 } else {
2455 fprintf (fp, " <tracegaps source=\"INLINE\"> </tracegaps>\n");
2456 }
2457 }
2458 }
2459
2460
WriteTraceReadXML(TContigReadPtr read,FILE * fp)2461 static void WriteTraceReadXML (TContigReadPtr read, FILE *fp)
2462 {
2463 if (read != NULL && fp != NULL) {
2464 fprintf (fp, "<trace>\n");
2465 if (read->ti > 0) {
2466 fprintf (fp, " <ti>%d</ti>\n", read->ti);
2467 }
2468 if (read->srr != NULL) {
2469 fprintf (fp, " <srr>%s</srr>\n", read->srr);
2470 }
2471 if (read->read_id != NULL) {
2472 fprintf (fp, " <trace_name>%s</trace_name>\n", read->read_id);
2473 }
2474 fprintf (fp, " <nbasecalls>%d</nbasecalls>\n", read->read_len);
2475 fprintf (fp, " <valid>\n");
2476 fprintf (fp, " <start>%d</start>\n", read->read_start);
2477 fprintf (fp, " <stop>%d</stop>\n", read->read_stop);
2478 fprintf (fp, " </valid>\n");
2479 fprintf (fp, " <tiling direction = \"%s\">\n", read->is_complement ? "REVERSE" : "FORWARD");
2480 fprintf (fp, " <start>%d</start>\n", read->tiling_start);
2481 fprintf (fp, " <stop>%d</stop>\n", read->tiling_stop);
2482 fprintf (fp, " </tiling>\n");
2483 fprintf (fp, " <traceconsensus>\n");
2484 fprintf (fp, " <start>%d</start>\n", read->cons_start);
2485 fprintf (fp, " <stop>%d</stop>\n", read->cons_stop);
2486 fprintf (fp, " </traceconsensus>\n");
2487 WriteTraceGapsXML (read->gaps, fp);
2488 fprintf (fp, "</trace>\n");
2489 }
2490 }
2491
2492
WriteTraceAssemblyFromContig(TContigPtr contig,FILE * fp)2493 extern void WriteTraceAssemblyFromContig (TContigPtr contig, FILE *fp)
2494 {
2495 int i;
2496
2497 if (contig == NULL || fp == NULL) return;
2498
2499 /* NOTE - need to add new field to TContigPtr for submitter reference, where orig ID should move to */
2500 fprintf (fp, " <contig submitter_reference=\"%s\" conformation=\"LINEAR\" type=\"NEW\">\n",
2501 contig->consensus_id == NULL ? "not supplied" : contig->consensus_id);
2502
2503 fprintf (fp, " <ntraces>%d</ntraces>\n", contig->num_reads);
2504
2505 fprintf (fp, " <nconbases>%d</nconbases>\n", contig->consensus_seq_len);
2506
2507 /* need nbasecalls */
2508
2509 if (contig->gaps == NULL) {
2510 fprintf (fp, " <ncongaps>0</ncongaps>\n");
2511 } else {
2512 fprintf (fp, " <ncongaps>%d</ncongaps>\n", contig->gaps->num_gaps);
2513 if (contig->gaps->num_gaps > 0) {
2514 fprintf (fp, " <congaps source=\"INLINE\">");
2515 for (i = 0; i < contig->gaps->num_gaps - 1; i++) {
2516 fprintf (fp, "%d ", contig->gaps->gap_offsets[i]);
2517 }
2518 fprintf (fp, "%d</congaps>\n", contig->gaps->gap_offsets[contig->gaps->num_gaps - 1]);
2519 }
2520 }
2521 fprintf (fp, " <consensus>%s</consensus>\n",
2522 contig->consensus_id == NULL ? "not supplied" : contig->consensus_id);
2523 if (contig->num_qual_scores > 0) {
2524 fprintf (fp, " <conqualities source=\"INLINE\">");
2525 for (i = 0; i < contig->num_qual_scores; i++) {
2526 fprintf (fp, "%d ", contig->qual_scores[i]);
2527 }
2528 fprintf (fp, "</conqualities>\n");
2529 }
2530
2531 for (i = 0; i < contig->num_reads; i++) {
2532 WriteTraceReadXML (contig->reads[i], fp);
2533 }
2534 fprintf (fp, " </contig>\n");
2535 }
2536
2537
2538 extern void
WriteTraceAssemblyHeader(char * assembly_type,char * subref,char * center_name,int taxid,char * description,char * assembly,int num_contigs,unsigned int num_conbases,int num_reads,unsigned int num_readbases,FILE * fp)2539 WriteTraceAssemblyHeader
2540 (char * assembly_type,
2541 char * subref,
2542 char * center_name,
2543 int taxid,
2544 char * description,
2545 char * assembly,
2546 int num_contigs,
2547 unsigned int num_conbases,
2548 int num_reads,
2549 unsigned int num_readbases,
2550 FILE * fp)
2551 {
2552 if (fp == NULL) {
2553 return;
2554 }
2555
2556 fprintf (fp, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
2557
2558 fprintf (fp, "<assembly submitter_reference=\"%s\" type = \"%s\">\n",
2559 subref == NULL ? "Not supplied" : subref,
2560 assembly_type == NULL ? "NEW" : assembly_type);
2561 fprintf (fp, " <center_name>%s</center_name>\n", center_name == NULL ? "Not supplied" : center_name);\
2562 fprintf (fp, " <organism descriptor=\"TAXID\">%d</organism>\n", taxid);
2563 fprintf (fp, " <description>%s</description>\n", description == NULL ? "Not supplied" : description);
2564 fprintf (fp, " <structure>%s</structure>\n", assembly == NULL ? "transcript assembly" : assembly);
2565 fprintf (fp, " <ncontigs>%d</ncontigs>\n", num_contigs);
2566 fprintf (fp, " <nconbases>%u</nconbases>\n", num_conbases);
2567 fprintf (fp, " <ntraces>%d</ntraces>\n", num_reads);
2568 fprintf (fp, " <nbasecalls>%u</nbasecalls>\n", num_readbases);
2569 fprintf (fp, " <coverage>%f</coverage>\n", num_conbases == 0 ? 0 : (float) ((float) num_readbases/ (float) num_conbases));
2570 }
2571
2572
WriteTraceAssemblyTrailer(FILE * fp)2573 extern void WriteTraceAssemblyTrailer (FILE *fp)
2574 {
2575 if (fp == NULL) {
2576 return;
2577 }
2578 fprintf (fp, "</assembly>\n");
2579 }
2580
2581
2582 extern void
WriteTraceAssemblyFromAceFile(TACEFilePtr afp,char * subref,char * center_name,int taxid,char * description,FILE * fp)2583 WriteTraceAssemblyFromAceFile
2584 (TACEFilePtr afp,
2585 char * subref,
2586 char * center_name,
2587 int taxid,
2588 char * description,
2589 FILE *fp)
2590 {
2591 int i, j, traces = 0;
2592 unsigned int conbases = 0, basecalls = 0;
2593
2594 if (afp == NULL || fp == NULL) return;
2595
2596
2597 for (i = 0; i < afp->num_contigs; i++) {
2598 conbases += afp->contigs[i]->consensus_seq_len;
2599 traces += afp->contigs[i]->num_reads;
2600 for (j = 0; j < afp->contigs[i]->num_reads; j++) {
2601 basecalls += afp->contigs[i]->reads[j]->read_len;
2602 }
2603 }
2604 WriteTraceAssemblyHeader (NULL, subref, center_name, taxid, description, NULL, afp->num_contigs, conbases, traces, basecalls, fp);
2605
2606 for (i = 0; i < afp->num_contigs; i++) {
2607 WriteTraceAssemblyFromContig (afp->contigs[i], fp);
2608 }
2609 WriteTraceAssemblyTrailer (fp);
2610 }
2611
2612
WriteFASTAFromContig(TContigPtr contig,FILE * fp)2613 extern void WriteFASTAFromContig
2614 (TContigPtr contig,
2615 FILE *fp)
2616 {
2617 int k;
2618 char *cp;
2619
2620 if (contig == NULL || fp == NULL) return;
2621
2622 fprintf (fp, ">%s\n", contig->consensus_id);
2623 cp = contig->consensus_seq;
2624 while (*cp != 0) {
2625 k = 0;
2626 while (k < 40 && *cp != 0) {
2627 if (*cp != '*') {
2628 fprintf (fp, "%c", *cp);
2629 k++;
2630 }
2631 cp++;
2632 }
2633 fprintf (fp, "\n");
2634 }
2635 fprintf (fp, "\n");
2636 }
2637
2638
2639 extern void
WriteFASTAFromAceFile(TACEFilePtr afp,FILE * fp)2640 WriteFASTAFromAceFile
2641 (TACEFilePtr afp,
2642 FILE *fp)
2643 {
2644 int i;
2645
2646 if (afp == NULL || fp == NULL) return;
2647
2648 for (i = 0; i < afp->num_contigs; i++) {
2649 WriteFASTAFromContig (afp->contigs[i], fp);
2650 }
2651 }
2652
2653
2654 #define kFASTASeqBufSize 100
2655
2656 typedef struct fastaseqbuf {
2657 char buf[kFASTASeqBufSize];
2658 int num_used;
2659 struct fastaseqbuf *next;
2660 } SFASTASeqBuf, * TFASTASeqBufPtr;
2661
2662
s_FASTASeqBufNew()2663 static TFASTASeqBufPtr s_FASTASeqBufNew ()
2664 {
2665 TFASTASeqBufPtr s;
2666
2667 s = (TFASTASeqBufPtr) malloc (sizeof (SFASTASeqBuf));
2668 if (s != NULL) {
2669 s->num_used = 0;
2670 s->next = NULL;
2671 }
2672 return s;
2673 }
2674
2675
s_FASTASeqBufFree(TFASTASeqBufPtr s)2676 static void s_FASTASeqBufFree (TFASTASeqBufPtr s)
2677 {
2678 TFASTASeqBufPtr s_next;
2679
2680 while (s != NULL) {
2681 s_next = s->next;
2682 free (s);
2683 s = s_next;
2684 }
2685 }
2686
2687
s_AddFASTAToBuf(char * line,TFASTASeqBufPtr buf)2688 static TFASTASeqBufPtr s_AddFASTAToBuf (char *line, TFASTASeqBufPtr buf)
2689 {
2690 TFASTASeqBufPtr last_buf;
2691 char *cp;
2692
2693 if (buf == NULL) {
2694 buf = s_FASTASeqBufNew();
2695 last_buf = buf;
2696 } else {
2697 last_buf = buf;
2698 while (last_buf->next != NULL) {
2699 last_buf = last_buf->next;
2700 }
2701 }
2702
2703 cp = line;
2704 while (*cp != 0 && *cp != '\r' && *cp != '\n') {
2705 while (isspace (*cp)) {
2706 cp++;
2707 }
2708 if (*cp != 0) {
2709 if (!isalpha (*cp)) {
2710 printf ("Found bad character in FASTA file!\n");
2711 s_FASTASeqBufFree (buf);
2712 buf = NULL;
2713 return buf;
2714 }
2715 if (last_buf->num_used == kFASTASeqBufSize) {
2716 last_buf->next = s_FASTASeqBufNew ();
2717 last_buf = last_buf->next;
2718 }
2719 last_buf->buf[last_buf->num_used++] = *cp;
2720 cp++;
2721 }
2722 }
2723 return buf;
2724 }
2725
2726
s_StripStars(char * str)2727 static char * s_StripStars (char *str)
2728 {
2729 char *cp_src;
2730 char *cp_dst;
2731 char *stripped;
2732
2733 if (str == NULL) {
2734 return 0;
2735 }
2736 cp_src = str;
2737 stripped = (char *) malloc (sizeof (char) * (strlen (str) + 1));
2738 cp_dst = stripped;
2739 while (*cp_src != 0) {
2740 if (*cp_src != '*') {
2741 *cp_dst = *cp_src;
2742 cp_dst++;
2743 }
2744 cp_src++;
2745 }
2746 *cp_dst = 0;
2747 return stripped;
2748 }
2749
2750
s_DoesFASTAMatchSeq(TFASTASeqBufPtr buf,char * trimmed_seq)2751 static int s_DoesFASTAMatchSeq (TFASTASeqBufPtr buf, char *trimmed_seq)
2752 {
2753 int does_match = 1, match_len, seq_len;
2754
2755 if (buf == NULL || trimmed_seq == NULL || *trimmed_seq == 0) {
2756 return 1;
2757 }
2758
2759 seq_len = strlen (trimmed_seq);
2760 while (buf != NULL && seq_len > 0 && does_match) {
2761 if (seq_len < buf->num_used) {
2762 match_len = seq_len;
2763 } else {
2764 match_len = buf->num_used;
2765 }
2766 if (strncmp (buf->buf, trimmed_seq, match_len) == 0) {
2767 buf = buf->next;
2768 trimmed_seq += match_len;
2769 seq_len -= match_len;
2770 } else {
2771 does_match = 0;
2772 }
2773 }
2774 return does_match;
2775 }
2776
2777
s_CompLetter(char ch)2778 static char s_CompLetter (char ch)
2779 {
2780 switch (ch) {
2781 case 'A':
2782 ch = 'T';
2783 break;
2784 case 'T':
2785 ch = 'A';
2786 break;
2787 case 'G':
2788 ch = 'C';
2789 break;
2790 case 'C':
2791 ch = 'G';
2792 break;
2793 }
2794 return ch;
2795 }
2796
2797
s_RevCompSequence(char * seq)2798 static void s_RevCompSequence (char *seq)
2799 {
2800 char tmp;
2801 int len, i;
2802
2803 if (seq == NULL || *seq == 0) {
2804 return;
2805 }
2806 len = strlen (seq);
2807
2808 for (i = 0; i < len / 2; i++) {
2809 tmp = seq[i];
2810 seq[i] = s_CompLetter (seq[len - i - 1]);
2811 seq[len - i - 1] = s_CompLetter (tmp);
2812 }
2813 if (len %2 > 0) {
2814 seq[i] = s_CompLetter (seq[i]);
2815 }
2816 }
2817
2818
s_GetSequenceOffset(TFASTASeqBufPtr buf,char * trimmed_seq,char is_complement)2819 static int s_GetSequenceOffset (TFASTASeqBufPtr buf, char *trimmed_seq, char is_complement)
2820 {
2821 int offset = 0, buf_offset;
2822 int match_found = 0;
2823 int match_len, seq_len;
2824
2825 if (buf == NULL || trimmed_seq == NULL) {
2826 return 0;
2827 }
2828
2829 trimmed_seq = s_StripStars (trimmed_seq);
2830
2831 if (is_complement) {
2832 s_RevCompSequence (trimmed_seq);
2833 }
2834
2835 seq_len = strlen (trimmed_seq);
2836
2837 while (buf != NULL && !match_found) {
2838 buf_offset = 0;
2839 while (buf_offset < buf->num_used && !match_found) {
2840 if (buf->num_used - buf_offset < seq_len) {
2841 match_len = buf->num_used - buf_offset;
2842 } else {
2843 match_len = seq_len;
2844 }
2845 if (match_len < seq_len && buf->next == NULL) {
2846 /* ran out of sequence, no match */
2847 buf_offset = buf->num_used;
2848 } else if (strncmp (buf->buf + buf_offset, trimmed_seq, match_len) == 0
2849 && s_DoesFASTAMatchSeq (buf->next, trimmed_seq + match_len)) {
2850 match_found = 1;
2851 } else {
2852 buf_offset++;
2853 offset++;
2854 }
2855 }
2856 if (!match_found) {
2857 buf = buf->next;
2858 }
2859 }
2860 free (trimmed_seq);
2861 if (match_found) {
2862 return offset;
2863 } else {
2864 return -1;
2865 }
2866 }
2867
2868
2869 #define kSeqListBufSize 100
2870
2871 typedef struct seqlistbuf {
2872 TFASTASeqBufPtr buf[kSeqListBufSize];
2873 char * id_list[kSeqListBufSize];
2874 int num_used;
2875 struct seqlistbuf *next;
2876 } SSeqListBuf, * TSeqListBufPtr;
2877
2878
s_SeqListBufNew()2879 static TSeqListBufPtr s_SeqListBufNew ()
2880 {
2881 TSeqListBufPtr s;
2882
2883 s = (TSeqListBufPtr) malloc (sizeof (SSeqListBuf));
2884 if (s != NULL) {
2885 s->num_used = 0;
2886 s->next = NULL;
2887 }
2888 return s;
2889 }
2890
2891
s_SeqListBufFree(TSeqListBufPtr s)2892 static void s_SeqListBufFree (TSeqListBufPtr s)
2893 {
2894 TSeqListBufPtr s_next;
2895 int i;
2896
2897 while (s != NULL) {
2898 s_next = s->next;
2899 for (i = 0; i < s->num_used; i++) {
2900 s_FASTASeqBufFree (s->buf[i]);
2901 free (s->id_list[i]);
2902 s->buf[i] = NULL;
2903 }
2904 free (s);
2905 s = s_next;
2906 }
2907 }
2908
2909
s_GetFASTAIdFromString(char * str)2910 static char * s_GetFASTAIdFromString (char * str)
2911 {
2912 char * cp;
2913 char * id;
2914 int len;
2915
2916 if (str == NULL) {
2917 return NULL;
2918 }
2919
2920 cp = str;
2921 cp += strspn (str, " >\t");
2922 len = strcspn (cp, " \t\r\n");
2923 if (len == 0) {
2924 return NULL;
2925 }
2926 id = (char *)malloc (len + 1);
2927 if (id == NULL) {
2928 return NULL;
2929 }
2930 strncpy (id, cp, len);
2931 id [ len ] = 0;
2932 return id;
2933 }
2934
2935
s_AddToSeqList(char * line,TSeqListBufPtr buf)2936 static TSeqListBufPtr s_AddToSeqList (char *line, TSeqListBufPtr buf)
2937 {
2938 TSeqListBufPtr last_buf;
2939
2940 if (line == NULL) {
2941 return buf;
2942 }
2943 if (buf == NULL) {
2944 buf = s_SeqListBufNew();
2945 last_buf = buf;
2946 } else {
2947 last_buf = buf;
2948 while (last_buf->next != NULL) {
2949 last_buf = last_buf->next;
2950 }
2951 }
2952
2953 if (*line == '>') {
2954 if (last_buf->num_used == kSeqListBufSize) {
2955 last_buf->next = s_SeqListBufNew();
2956 last_buf = last_buf->next;
2957 }
2958 last_buf->buf[last_buf->num_used] = s_FASTASeqBufNew ();
2959 last_buf->id_list[last_buf->num_used] = s_GetFASTAIdFromString (line);
2960 last_buf->num_used++;
2961 } else if (last_buf->num_used > 0) {
2962 last_buf->buf[last_buf->num_used - 1] = s_AddFASTAToBuf (line, last_buf->buf[last_buf->num_used - 1]);
2963 }
2964
2965 return buf;
2966 }
2967
2968
s_GetFastaSeq(TSeqListBufPtr buf,char * id)2969 static TFASTASeqBufPtr s_GetFastaSeq (TSeqListBufPtr buf, char *id)
2970 {
2971 TFASTASeqBufPtr seq = NULL;
2972 char *cp;
2973 int i, match_len;
2974
2975 if (id == NULL) {
2976 return NULL;
2977 }
2978
2979 cp = strchr (id, '.');
2980 if (cp == NULL) {
2981 match_len = strlen (id);
2982 } else {
2983 match_len = cp - id;
2984 }
2985
2986 while (buf != NULL && seq == NULL) {
2987 for (i = 0; i < buf->num_used && seq == NULL; i++) {
2988 if (strncmp (id, buf->id_list[i], match_len) == 0) {
2989 seq = buf->buf[i];
2990 }
2991 }
2992 buf = buf->next;
2993 }
2994 return seq;
2995 }
2996
2997
s_ReadFastaFile(FReadLineFunction readfunc,void * userdata)2998 static TSeqListBufPtr s_ReadFastaFile (FReadLineFunction readfunc, void * userdata)
2999 {
3000 char *linestring;
3001 TSeqListBufPtr fasta = NULL;
3002
3003 if (readfunc == NULL) {
3004 return NULL;
3005 }
3006 linestring = readfunc (userdata);
3007 while (!s_IsEOF (linestring)) {
3008 fasta = s_AddToSeqList (linestring, fasta);
3009 free (linestring);
3010 linestring = readfunc (userdata);
3011 }
3012 return fasta;
3013 }
3014
3015
3016
3017
3018 #define kQualScoreBufSize 100
3019
3020 typedef struct qualscorelist {
3021 int scores[kQualScoreBufSize];
3022 int num_used;
3023 struct qualscorelist *next;
3024 } SQualScoreList, * TQualScoreListPtr;
3025
3026
s_QualScoreNew()3027 static TQualScoreListPtr s_QualScoreNew ()
3028 {
3029 TQualScoreListPtr s;
3030
3031 s =(TQualScoreListPtr) malloc (sizeof (SQualScoreList));
3032 if (s != NULL) {
3033 s->num_used = 0;
3034 s->next = NULL;
3035 }
3036 return s;
3037 }
3038
3039
s_QualScoreFree(TQualScoreListPtr s)3040 static void s_QualScoreFree (TQualScoreListPtr s)
3041 {
3042 TQualScoreListPtr s_next;
3043
3044 while (s != NULL) {
3045 s_next = s->next;
3046 free (s);
3047 s = s_next;
3048 }
3049 }
3050
3051
s_AddQualScores(char * line,TQualScoreListPtr scores)3052 static TQualScoreListPtr s_AddQualScores (char *line, TQualScoreListPtr scores)
3053 {
3054 TQualScoreListPtr last_score;
3055 char *cp;
3056
3057 if (scores == NULL) {
3058 scores = s_QualScoreNew();
3059 last_score = scores;
3060 } else {
3061 last_score = scores;
3062 while (last_score->next != NULL) {
3063 last_score = last_score->next;
3064 }
3065 }
3066
3067 cp = line;
3068 while (*cp != 0 && *cp != '\r' && *cp != '\n') {
3069 while (isspace (*cp)) {
3070 cp++;
3071 }
3072 if (*cp != 0) {
3073 if (!isdigit (*cp)) {
3074 printf ("Found bad character in quality scores file!\n");
3075 s_QualScoreFree (scores);
3076 scores = NULL;
3077 return scores;
3078 }
3079 if (last_score->num_used == kQualScoreBufSize) {
3080 last_score->next = s_QualScoreNew ();
3081 last_score = last_score->next;
3082 }
3083 last_score->scores[last_score->num_used++] = atoi (cp);
3084 while (isdigit (*cp)) {
3085 cp++;
3086 }
3087 }
3088 }
3089 return scores;
3090 }
3091
3092
s_AddScoresToRead(TContigReadPtr read,TQualScoreListPtr scores,TSeqListBufPtr fasta)3093 static int s_AddScoresToRead (TContigReadPtr read, TQualScoreListPtr scores, TSeqListBufPtr fasta)
3094 {
3095 int score_pos = 0;
3096 int offset = 0;
3097 int skip, score_len;
3098 char *cp;
3099 int *dst;
3100 TFASTASeqBufPtr fasta_seq;
3101
3102 if (read == NULL || scores == NULL) {
3103 return 0;
3104 }
3105
3106 if (fasta == NULL) {
3107 skip = read->read_start - 1;
3108 } else {
3109 fasta_seq = s_GetFastaSeq (fasta, read->read_id);
3110 if (fasta_seq == NULL) {
3111 printf ("Unable to locate fasta for %s\n", read->read_id);
3112 return 0;
3113 }
3114
3115 skip = s_GetSequenceOffset (fasta_seq, read->read_seq, read->is_complement);
3116 if (skip < 0) {
3117 printf ("ACE read did not match FASTA read for %s\n", read->read_id);
3118 return 0;
3119 }
3120 }
3121
3122 /* skip over scores before part used in assembly */
3123 while (scores != NULL && score_pos < skip) {
3124 if (skip - score_pos < scores->num_used) {
3125 offset = skip - score_pos;
3126 score_pos = skip;
3127 } else if (scores->next == NULL) {
3128 printf ("Not enough scores read for %s\n", read->read_id);
3129 return 0;
3130 } else {
3131 score_pos += kQualScoreBufSize;
3132 scores = scores->next;
3133 }
3134 }
3135
3136 score_len = strlen (read->read_seq);
3137 read->qual_scores = malloc (sizeof (int) * score_len);
3138
3139 if (read->is_complement) {
3140 /* need to read scores in reverse direction */
3141 cp = read->read_seq + score_len - 1;
3142 dst = read->qual_scores + score_len - 1;
3143 while (scores != NULL && read->num_qual_scores < score_len) {
3144 if (*cp == '*') {
3145 *dst = 0;
3146 } else {
3147 *dst = scores->scores[offset];
3148 offset++;
3149 }
3150 cp--;
3151 dst--;
3152 read->num_qual_scores++;
3153
3154 if (offset == kQualScoreBufSize) {
3155 scores = scores->next;
3156 offset = 0;
3157 }
3158 }
3159 } else {
3160 cp = read->read_seq;
3161 dst = read->qual_scores;
3162 while (scores != NULL && read->num_qual_scores < score_len) {
3163 if (*cp == '*') {
3164 *dst = 0;
3165 } else {
3166 *dst = scores->scores[offset];
3167 offset++;
3168 }
3169 cp++;
3170 dst++;
3171 read->num_qual_scores++;
3172
3173 if (offset == kQualScoreBufSize) {
3174 scores = scores->next;
3175 offset = 0;
3176 }
3177 }
3178 }
3179 if (read->num_qual_scores == score_len) {
3180 return 1;
3181 } else {
3182 printf ("Not enough qual scores for %s\n", read->read_id);
3183 return 0;
3184 }
3185 }
3186
3187
s_AddQualScoresToReadsInAceFile(TACEFilePtr afp,char * id,TQualScoreListPtr scores,TSeqListBufPtr fasta)3188 static int s_AddQualScoresToReadsInAceFile (TACEFilePtr afp, char *id, TQualScoreListPtr scores, TSeqListBufPtr fasta)
3189 {
3190 int i, j, found = 0, match_len, rval = 1;
3191 char *cp;
3192
3193 if (afp == NULL || id == NULL) {
3194 return 0;
3195 }
3196
3197 cp = strchr (id, '.');
3198 if (cp == NULL) {
3199 match_len = strlen (id);
3200 } else {
3201 match_len = cp - id;
3202 }
3203
3204 for (i = 0; i < afp->num_contigs; i++) {
3205 for (j = 0; j < afp->contigs[i]->num_reads; j++) {
3206 if (strncmp (afp->contigs[i]->reads[j]->read_id, id, match_len) == 0) {
3207 found = 1;
3208 rval &= s_AddScoresToRead (afp->contigs[i]->reads[j], scores, fasta);
3209 }
3210 }
3211 }
3212 if (!found) {
3213 printf ("Unable to locate %s in ACE file\n", id);
3214 rval = 0;
3215 }
3216 return rval;
3217 }
3218
3219
3220 extern int
AddReadQualScores(TACEFilePtr afp,FReadLineFunction readfunc,void * userdata,FReadLineFunction fasta_readfunc,void * fasta_userdata)3221 AddReadQualScores
3222 (TACEFilePtr afp,
3223 FReadLineFunction readfunc,
3224 void * userdata,
3225 FReadLineFunction fasta_readfunc,
3226 void * fasta_userdata)
3227 {
3228 char *linestring;
3229 char *score_id = NULL;
3230 TQualScoreListPtr scores = NULL;
3231 TSeqListBufPtr fasta = NULL;
3232 int rval = 1;
3233
3234 if (afp == NULL || readfunc == NULL) {
3235 return 0;
3236 }
3237
3238 if (fasta_readfunc != NULL) {
3239 fasta = s_ReadFastaFile (fasta_readfunc, fasta_userdata);
3240 if (fasta == NULL) {
3241 printf ("Unable to read FASTA file\n");
3242 return 0;
3243 }
3244 }
3245
3246 linestring = readfunc (userdata);
3247 while (!s_IsEOF (linestring)) {
3248 if (linestring[0] == '>') {
3249 if (score_id != NULL && scores != NULL) {
3250 /* add previously read scores to last read */
3251 if (s_AddQualScoresToReadsInAceFile (afp, score_id, scores, fasta) == 0) {
3252 printf ("Failed to add quality scores from %s\n", score_id);
3253 rval = 0;
3254 }
3255 }
3256 s_QualScoreFree (scores);
3257 scores = NULL;
3258 free (score_id);
3259
3260 score_id = s_GetFASTAIdFromString (linestring);
3261 } else if (score_id != NULL) {
3262 scores = s_AddQualScores (linestring, scores);
3263 }
3264 free (linestring);
3265 linestring = readfunc (userdata);
3266 }
3267
3268 /* handle last set of scores read */
3269 if (score_id != NULL && scores != NULL) {
3270 /* add previously read scores to last read */
3271 if (s_AddQualScoresToReadsInAceFile (afp, score_id, scores, fasta) == 0) {
3272 printf ("Failed to add quality scores from %s\n", score_id);
3273 rval = 0;
3274 }
3275 }
3276 s_SeqListBufFree (fasta);
3277 s_QualScoreFree (scores);
3278 scores = NULL;
3279 free (score_id);
3280 return rval;
3281 }
3282
3283
s_LetterPos(char ch)3284 static int s_LetterPos (char ch) {
3285 int rval = -1;
3286
3287 switch (ch) {
3288 case 'A':
3289 rval = 0;
3290 break;
3291 case 'T':
3292 rval = 1;
3293 break;
3294 case 'G':
3295 rval = 2;
3296 break;
3297 case 'C':
3298 rval = 3;
3299 break;
3300 case '*':
3301 rval = 4;
3302 break;
3303 }
3304 return rval;
3305 }
3306
3307
s_GetUngappedPosition(int gapped_pos,char * seq)3308 static int s_GetUngappedPosition (int gapped_pos, char *seq)
3309 {
3310 int pos = 0, ungapped_pos = 0;
3311 int gaps_found = 0;
3312 char *cp;
3313
3314 cp = seq;
3315 while (*cp != 0 && pos < gapped_pos ) {
3316 if (*cp == '*') {
3317 gaps_found++;
3318 } else {
3319 ungapped_pos++;
3320 }
3321 cp++;
3322 pos++;
3323 }
3324 return ungapped_pos;
3325 }
3326
3327
s_GetQualScoreForReadPos(TContigReadPtr r,int pos)3328 static int s_GetQualScoreForReadPos (TContigReadPtr r, int pos)
3329 {
3330 if (r == NULL || pos < 0) {
3331 return 0;
3332 }
3333
3334 /* note - don't need to get ungapped position because 0s are inserted when qual scores
3335 * are added to the reads.
3336 */
3337 /*pos = s_GetUngappedPosition (pos, r->read_seq); */
3338 if (pos > r->num_qual_scores) {
3339 return 0;
3340 } else {
3341 return r->qual_scores[pos];
3342 }
3343 }
3344
3345
ReplaceConsensusSequenceFromTraces(TContigPtr contig,char only_ns)3346 extern int ReplaceConsensusSequenceFromTraces (TContigPtr contig, char only_ns)
3347 {
3348 char * consensus_buf;
3349 int * new_qual_scores = NULL;
3350 int num_qual_scores = 0;
3351 int i, k, best, letter_pos;
3352 int char_counts[5];
3353 char best_ch, ch;
3354 int num_best, sum_best;
3355 int * consensus_qual_ptr = NULL;
3356 int read_offset, len;
3357 int num_change = 0;
3358
3359 if (contig == NULL) {
3360 return 0;
3361 }
3362
3363 consensus_buf = (char *) malloc (sizeof (char) * (contig->consensus_assem_len + 1));
3364 if (contig->reads[0]->num_qual_scores > 0) {
3365 new_qual_scores = (int *) malloc (sizeof (int) * contig->consensus_seq_len);
3366 }
3367
3368 consensus_qual_ptr = contig->qual_scores;
3369
3370 for (i = 0; i < contig->consensus_assem_len; i++) {
3371 if (only_ns && contig->consensus_seq[i] != 'N') {
3372 /* just use existing consensus character */
3373 consensus_buf[i] = contig->consensus_seq[i];
3374 /* add in qual scores */
3375 if (consensus_qual_ptr != NULL && new_qual_scores != NULL && contig->consensus_seq[i] != '*') {
3376 new_qual_scores[num_qual_scores++] = *consensus_qual_ptr;
3377 }
3378 } else {
3379 for (k = 0; k < 5; k++) {
3380 char_counts[k] = 0;
3381 }
3382 best = 0;
3383 best_ch = 'N';
3384 for (k = 0; k < contig->num_reads; k++) {
3385 read_offset = i - contig->reads[k]->cons_start;
3386 len = strlen (contig->reads[k]->read_seq);
3387 if (len > read_offset
3388 && read_offset >= 0) {
3389 ch = toupper (contig->reads[k]->read_seq[read_offset]);
3390 letter_pos = s_LetterPos (ch);
3391 if (letter_pos > -1) {
3392 char_counts[letter_pos]++;
3393 if (char_counts[letter_pos] > best
3394 || (char_counts[letter_pos] == best && best_ch == '*')) {
3395 best_ch = ch;
3396 best = char_counts[letter_pos];
3397 }
3398 }
3399 }
3400 }
3401 if (toupper (consensus_buf[i]) != best_ch) {
3402 num_change++;
3403 consensus_buf[i] = best_ch;
3404 if (best_ch != '*') {
3405 /* calculate quality score */
3406 if (new_qual_scores != NULL) {
3407 sum_best = 0;
3408 num_best = 0;
3409 for (k = 0; k < contig->num_reads; k++) {
3410 if (contig->reads[k]->num_qual_scores > i - contig->reads[k]->cons_start
3411 && best_ch == toupper (contig->reads[k]->read_seq[i - contig->reads[k]->cons_start])) {
3412 num_best ++;
3413 sum_best += s_GetQualScoreForReadPos (contig->reads[k], i - contig->reads[k]->cons_start);
3414 }
3415 }
3416 if (num_best == 0) {
3417 new_qual_scores[num_qual_scores++] = 0;
3418 } else {
3419 new_qual_scores[num_qual_scores++] = sum_best / num_best;
3420 }
3421 }
3422 }
3423 }
3424 }
3425 if (consensus_qual_ptr != NULL && contig->consensus_seq[i] != '*') {
3426 consensus_qual_ptr++;
3427 }
3428 }
3429 consensus_buf[i] = 0;
3430
3431 free (contig->consensus_seq);
3432 contig->consensus_seq = consensus_buf;
3433 if (contig->qual_scores != NULL) {
3434 free (contig->qual_scores);
3435 }
3436 contig->qual_scores = new_qual_scores;
3437 contig->num_qual_scores = num_qual_scores;
3438
3439 return num_change;
3440 }
3441
3442
RecalculateConsensusSequences(TACEFilePtr ace_file,char only_ns)3443 extern void RecalculateConsensusSequences (TACEFilePtr ace_file, char only_ns)
3444 {
3445 int i;
3446
3447 if (ace_file == NULL) {
3448 return;
3449 }
3450
3451 for (i = 0; i < ace_file->num_contigs; i++) {
3452 ReplaceConsensusSequenceFromTraces(ace_file->contigs[i], only_ns);
3453 }
3454
3455 }
3456
3457
WriteContigQualScores(TContigPtr contig,FILE * out)3458 extern void WriteContigQualScores (TContigPtr contig, FILE *out)
3459 {
3460 int i = 0, j;
3461
3462 if (contig == NULL || contig->qual_scores == NULL || contig->num_qual_scores < 1 || out == NULL) {
3463 return;
3464 }
3465 fprintf (out, ">%s\n", contig->consensus_id);
3466
3467 while (i < contig->num_qual_scores) {
3468 for (j = 0; j < 60 && i < contig->num_qual_scores; j++, i++) {
3469 fprintf (out, "%d ", contig->qual_scores[i]);
3470 }
3471 fprintf (out, "\n");
3472 }
3473 fprintf (out, "\n");
3474 }
3475
3476
3477 extern char
ProcessLargeACEFileForContigFastaAndQualScores(FReadLineFunction readfunc,void * userdata,char make_qual_scores,char * has_errors,ProcessContigFunc process_func,void * process_data)3478 ProcessLargeACEFileForContigFastaAndQualScores
3479 (FReadLineFunction readfunc,
3480 void * userdata,
3481 char make_qual_scores,
3482 char * has_errors,
3483 ProcessContigFunc process_func,
3484 void * process_data)
3485 {
3486 char * linestring;
3487 char * cp;
3488 int contig_num = 0, read_num = 0;
3489 int num_reads_expected = 0;
3490 int num_contigs = 0;
3491 TContigPtr contig = NULL;
3492 char rval = 1;
3493
3494 if (readfunc == NULL || process_func == NULL) {
3495 return 0;
3496 }
3497
3498 linestring = readfunc (userdata);
3499
3500 while (linestring != NULL && linestring [0] != EOF) {
3501 if (linestring [0] == 'A' && linestring [1] == 'S' && isspace (linestring [2])) {
3502 if (num_reads_expected > 0) {
3503 PrintACEFormatErrorXML ("Two file header lines!", NULL, has_errors);
3504 return 0;
3505 }
3506 /* first line in file, number of contigs */
3507 cp = linestring + 3;
3508 num_contigs = atoi (cp);
3509 while (isdigit (*cp)) {
3510 cp++;
3511 }
3512 num_reads_expected = atoi (cp);
3513 linestring = readfunc (userdata);
3514 } else if (linestring [0] == 'C' && linestring [1] == 'O' && isspace (linestring [2])) {
3515 if (contig_num >= num_contigs) {
3516 PrintACEFormatErrorXML ("Too many contigs!", NULL, has_errors);
3517 return 0;
3518 }
3519
3520 contig = s_ReadContig (&linestring, readfunc, userdata, make_qual_scores, has_errors);
3521 if (contig == NULL) {
3522 PrintACEFormatErrorXMLStart (NULL, has_errors);
3523 printf ("Unable to read contig (%d)", contig_num);
3524 PrintACEFormatErrorXMLEnd ();
3525 return 0;
3526 }
3527 read_num += contig->num_reads;
3528 process_func (contig, process_data);
3529 ContigFree (contig);
3530 contig = NULL;
3531 contig_num++;
3532 } else if (s_UnexpectedLineBetweenContigs (linestring)) {
3533 PrintACEFormatErrorXMLStart (NULL, has_errors);
3534 printf ("Unexpected line after contig %d", read_num);
3535 PrintACEFormatErrorXMLEnd ();
3536 return 0;
3537 } else {
3538 linestring = readfunc (userdata);
3539 }
3540 }
3541 if (contig_num < num_contigs) {
3542 PrintACEFormatErrorXML ("Not enough contigs!", NULL, has_errors);
3543 rval = 0;
3544 } else if (read_num < num_reads_expected) {
3545 PrintACEFormatErrorXML ("Not enough reads!", NULL, has_errors);
3546 rval = 0;
3547 }
3548
3549 return rval;
3550 }
3551