1 /*
2  * $Id: aceread.c,v 1.25 2012/04/02 22:11:35 kans Exp $
3  *
4  * ===========================================================================
5  *
6  *                            PUBLIC DOMAIN NOTICE
7  *               National Center for Biotechnology Information
8  *
9  *  This software/database is a "United States Government Work" under the
10  *  terms of the United States Copyright Act.  It was written as part of
11  *  the author's official duties as a United States Government employee and
12  *  thus cannot be copyrighted.  This software/database is freely available
13  *  to the public for use. The National Library of Medicine and the U.S.
14  *  Government have not placed any restriction on its use or reproduction.
15  *
16  *  Although all reasonable efforts have been taken to ensure the accuracy
17  *  and reliability of the software and data, the NLM and the U.S.
18  *  Government do not and cannot warrant the performance or results that
19  *  may be obtained by using this software or data. The NLM and the U.S.
20  *  Government disclaim all warranties, express or implied, including
21  *  warranties of performance, merchantability or fitness for any particular
22  *  purpose.
23  *
24  *  Please cite the author in any work or product based on this material.
25  *
26  * ===========================================================================
27  *
28  * Authors:  Colleen Bollin
29  *
30  */
31 
32 
33 #include <stdlib.h>
34 #include <stdio.h>
35 #include <string.h>
36 #include <ctype.h>
37 #include <util/creaders/alnread.h>
38 #include <aceread.h>
39 
40 
41 typedef enum {
42     eJustRight = 0,
43     eNone,
44     eTooMany,
45     eTooFew,
46     eUnexpected
47 } EFound;
48 
49 
PrintACEFormatErrorXMLStart(char * id,char * has_errors)50 extern void PrintACEFormatErrorXMLStart (char *id, char *has_errors)
51 {
52     if (has_errors != NULL) {
53         if (*has_errors == 0) {
54             printf ("<aceread>\n");
55             *has_errors = 1;
56         }
57     }
58     printf ("<message severity=\"ERROR\" seq-id=\"%s\" code=\"bad_format\">", id == NULL ? "No ID" : id);
59 }
60 
61 
PrintACEFormatErrorXMLEnd(void)62 extern void PrintACEFormatErrorXMLEnd (void)
63 {
64     printf ("</message>\n");
65 }
66 
67 
PrintACEFormatErrorXML(char * msg,char * id,char * has_errors)68 extern void PrintACEFormatErrorXML (char *msg, char *id, char *has_errors)
69 {
70     if (has_errors != NULL) {
71         if (*has_errors == 0) {
72             printf ("<aceread>\n");
73             *has_errors = 1;
74         }
75     }
76     printf ("<message severity=\"ERROR\" seq-id=\"%s\" code=\"bad_format\">%s</message>\n", id == NULL ? "No ID" : id, msg);
77 }
78 
79 
s_ReportFound(EFound val,char * label,char * id,char * has_errors)80 static void s_ReportFound (EFound val, char *label, char *id, char *has_errors)
81 {
82     switch (val) {
83         case eNone:
84             PrintACEFormatErrorXMLStart (id, has_errors);
85             printf ("Found no %s", label);
86             PrintACEFormatErrorXMLEnd ();
87             break;
88         case eTooMany:
89             PrintACEFormatErrorXMLStart (id, has_errors);
90             printf ("Too many %s", label);
91             PrintACEFormatErrorXMLEnd ();
92             break;
93         case eTooFew:
94             PrintACEFormatErrorXMLStart (id, has_errors);
95             printf ("Too few %s", label);
96             PrintACEFormatErrorXMLEnd ();
97             break;
98         case eUnexpected:
99             PrintACEFormatErrorXMLStart (id, has_errors);
100             printf ("Unexpected character while reading %s", label);
101             PrintACEFormatErrorXMLEnd ();
102             break;
103         case eJustRight:
104             break;
105         default:
106             PrintACEFormatErrorXML ("Unknown error", id, has_errors);
107             break;
108     }
109 }
110 
111 
GapInfoNew(void)112 extern TGapInfoPtr GapInfoNew (void)
113 {
114     TGapInfoPtr g;
115 
116     g = (TGapInfoPtr) malloc (sizeof (SGapInfo));
117     if (g != NULL) {
118         g->num_gaps = 0;
119         g->gap_offsets = NULL;
120     }
121     return g;
122 }
123 
124 
GapInfoFree(TGapInfoPtr g)125 extern void GapInfoFree (TGapInfoPtr g)
126 {
127     if (g != NULL) {
128         free (g->gap_offsets);
129         free (g);
130     }
131 }
132 
133 
s_IsGapChar(char ch,char * gap_chars)134 static int s_IsGapChar (char ch, char *gap_chars)
135 {
136     if (ch == 0 || gap_chars == NULL) {
137         return 0;
138     }
139     while (*gap_chars != 0 && *gap_chars != ch) {
140         gap_chars ++;
141     }
142     if (*gap_chars == ch) {
143         return 1;
144     } else {
145         return 0;
146     }
147 }
148 
149 
150 /* The Trace Archive Gap String is a list of the number of nucleotides to skip before adding the next gap */
GapInfoFromSequenceString(char * seq_str,char gap_char)151 extern TGapInfoPtr GapInfoFromSequenceString (char *seq_str, char gap_char)
152 {
153     char * cp;
154     int    num_gaps = 0, pos, gap_num = 0;
155     TGapInfoPtr g = NULL;
156 
157     if (seq_str == NULL) return NULL;
158 
159     /* first determine number of gaps */
160     cp = seq_str;
161     while (*cp != 0) {
162         if (*cp == gap_char) {
163             num_gaps++;
164         }
165         cp++;
166     }
167 
168     g = GapInfoNew ();
169     if (num_gaps > 0) {
170         g->num_gaps = num_gaps;
171         g->gap_offsets = malloc (g->num_gaps * sizeof (int));
172         cp = seq_str;
173         pos = 0;
174         while (*cp != 0) {
175             if (*cp == gap_char) {
176                 g->gap_offsets[gap_num] = pos;
177                 gap_num++;
178                 pos = 0;
179             } else {
180                 pos++;
181             }
182             cp++;
183         }
184     }
185     return g;
186 }
187 
RemoveGapCharsFromSequenceString(char * seq_str,char gap_char)188 extern void RemoveGapCharsFromSequenceString (char *seq_str, char gap_char)
189 {
190     char *cp_src, *cp_dst;
191 
192     if (seq_str == NULL) {
193       return;
194     }
195 
196     cp_src = seq_str;
197     cp_dst = seq_str;
198     while (*cp_src != 0) {
199         if (*cp_src != gap_char) {
200             *cp_dst = *cp_src;
201             cp_dst++;
202         }
203         cp_src++;
204     }
205 }
206 
207 
208 /* calculate sequence position from tiling position (both values zero-based) given gap_info */
SeqPosFromTilingPos(int tiling_pos,TGapInfoPtr gap_info)209 extern int SeqPosFromTilingPos (int tiling_pos, TGapInfoPtr gap_info)
210 {
211     int pos = 0, seq_pos = 0, gap_num = 0;
212 
213     if (tiling_pos < 0 || gap_info == NULL || gap_info->num_gaps == 0) {
214         return tiling_pos;
215     }
216 
217     while (gap_num < gap_info->num_gaps && pos + gap_info->gap_offsets[gap_num] <= tiling_pos) {
218         seq_pos += gap_info->gap_offsets[gap_num];
219         pos += gap_info->gap_offsets[gap_num] + 1;
220         gap_num++;
221     }
222     seq_pos += tiling_pos - pos;
223     return seq_pos;
224 }
225 
226 
227 /* calculate sequence position from tiling position (both values zero-based) given gap_info */
TilingPosFromSeqPos(int seq_pos,TGapInfoPtr gap_info)228 extern int TilingPosFromSeqPos (int seq_pos, TGapInfoPtr gap_info)
229 {
230     int pos = 0, tiling_pos = 0, gap_num = 0;
231 
232     if (seq_pos < 0 || gap_info == NULL || gap_info->num_gaps == 0) {
233         return seq_pos;
234     }
235 
236     while (gap_num < gap_info->num_gaps && pos + gap_info->gap_offsets[gap_num] <= seq_pos) {
237         pos += gap_info->gap_offsets[gap_num];
238         tiling_pos += gap_info->gap_offsets[gap_num] + 1;
239         gap_num++;
240     }
241     tiling_pos += seq_pos - pos;
242     return tiling_pos;
243 }
244 
245 
246 /* adjust gap info when sequence is trimmed */
AdjustGapInfoFor5Trim(TGapInfoPtr gap_info,int trim)247 static void AdjustGapInfoFor5Trim (TGapInfoPtr gap_info, int trim)
248 {
249     int pos = 0;
250     int num_gaps = 0;
251     int i;
252 
253     if (gap_info == NULL || gap_info->num_gaps < 1 || trim < 1) {
254         return;
255     }
256 
257     while (num_gaps < gap_info->num_gaps && pos + gap_info->gap_offsets[num_gaps] < trim) {
258         pos += gap_info->gap_offsets[num_gaps];
259         num_gaps++;
260     }
261     if (num_gaps < gap_info->num_gaps) {
262         gap_info->gap_offsets[num_gaps] -= trim - pos;
263         for (i = num_gaps; i < gap_info->num_gaps; i++) {
264             gap_info->gap_offsets[i - num_gaps] = gap_info->gap_offsets[i];
265         }
266         gap_info->num_gaps -= num_gaps;
267     } else {
268         free (gap_info->gap_offsets);
269         gap_info->gap_offsets = NULL;
270         gap_info->num_gaps = 0;
271     }
272 
273 }
274 
275 
AdjustGapInfoFor3Trim(TGapInfoPtr gap_info,int new_len)276 static void AdjustGapInfoFor3Trim (TGapInfoPtr gap_info, int new_len)
277 {
278     int pos = 0;
279     int num_gaps = 0;
280 
281     if (gap_info == NULL || gap_info->num_gaps < 1) {
282         return;
283     }
284 
285     while (num_gaps < gap_info->num_gaps && pos + gap_info->gap_offsets[num_gaps] < new_len) {
286         pos += gap_info->gap_offsets[num_gaps];
287         num_gaps++;
288     }
289     if (num_gaps < gap_info->num_gaps) {
290         gap_info->num_gaps = num_gaps;
291     }
292 }
293 
294 /* TODO: NEED TO write function for truncating on right, test function for truncating on left */
295 
ContigReadNew(void)296 extern TContigReadPtr ContigReadNew (void)
297 {
298     TContigReadPtr r;
299 
300     r = (TContigReadPtr) malloc (sizeof (SContigRead));
301     if (r == NULL) {
302         return NULL;
303     }
304     r->read_id = NULL;
305     r->ti = 0;
306     r->srr = NULL;
307     r->read_seq = NULL;
308     r->is_complement = 0;
309     r->cons_start = 0;
310     r->cons_stop = 0;
311     r->gaps = NULL;
312     r->local = 1;
313     r->valid = 0;
314     r->qual_scores = NULL;
315     r->num_qual_scores = 0;
316     r->tag = NULL;
317     return r;
318 }
319 
320 
ContigReadFree(TContigReadPtr r)321 extern void ContigReadFree (TContigReadPtr r)
322 {
323     if (r != NULL) {
324         if (r->read_id != NULL) {
325             free (r->read_id);
326         }
327         if (r->srr != NULL) {
328             free (r->srr);
329         }
330         if (r->read_seq != NULL) {
331             free (r->read_seq);
332         }
333         if (r->gaps != NULL) {
334             GapInfoFree (r->gaps);
335         }
336         if (r->qual_scores != NULL) {
337             free (r->qual_scores);
338         }
339         if (r->tag != NULL) {
340             free (r->tag);
341         }
342         free (r);
343     }
344 }
345 
346 
BaseSegNew(void)347 extern TBaseSegPtr BaseSegNew (void)
348 {
349     TBaseSegPtr b;
350 
351     b = (TBaseSegPtr) malloc (sizeof (SBaseSeg));
352     if (b == NULL) {
353         return NULL;
354     }
355     b->read_id = NULL;
356     b->cons_start = 0;
357     b->cons_stop = 0;
358     return b;
359 }
360 
361 
BaseSegFree(TBaseSegPtr b)362 extern void BaseSegFree (TBaseSegPtr b)
363 {
364     if (b != NULL) {
365         if (b->read_id != NULL) {
366             free (b->read_id);
367         }
368         free (b);
369     }
370 }
371 
372 
373 /* reads a correctly formatted line and creates a base seg.
374  */
s_ReadBaseSeg(char * line)375 static TBaseSegPtr s_ReadBaseSeg (char *line)
376 {
377     TBaseSegPtr base_seg = NULL;
378     char *cp;
379     int   start, stop, len;
380 
381     if (line == NULL || *line != 'B' || *(line + 1) != 'S') {
382         return NULL;
383     }
384 
385 
386     cp = line + 2;
387     while (isspace (*cp)) {
388         cp++;
389     }
390     if (!isdigit (*cp)) {
391         return NULL;
392     }
393     start = atoi (cp);
394     while (isdigit (*cp)) {
395         cp++;
396     }
397     while (isspace (*cp)) {
398         cp++;
399     }
400     if (!isdigit (*cp)) {
401         return NULL;
402     }
403     stop = atoi (cp);
404     while (isdigit (*cp)) {
405         cp++;
406     }
407     while (isspace (*cp)) {
408         cp++;
409     }
410     if (*cp == 0) {
411         return NULL;
412     }
413 
414     len = strlen (cp);
415 
416     base_seg = BaseSegNew ();
417     base_seg->cons_start = start;
418     base_seg->cons_stop = stop;
419     base_seg->read_id = malloc (sizeof (char) * len + 1);
420     strcpy (base_seg->read_id, cp);
421 
422     return base_seg;
423 }
424 
425 
ConsensusReadAlnNew(int numseg)426 extern TConsensusReadAlnPtr ConsensusReadAlnNew (int numseg)
427 {
428     TConsensusReadAlnPtr a;
429     int i;
430 
431     a = (TConsensusReadAlnPtr) malloc (sizeof (SConsensusReadAln));
432     a->is_complement = 0;
433     if (numseg < 1) {
434         a->lens = NULL;
435         a->cons_starts = NULL;
436         a->read_starts = NULL;
437         a->numseg = 0;
438     } else {
439         a->lens = (int *) malloc (sizeof (int) * numseg);
440         a->cons_starts = (int *) malloc (sizeof (int) * numseg);
441         a->read_starts = (int *) malloc (sizeof (int) * numseg);
442         for (i = 0; i < numseg; i++) {
443             a->lens[i] = 0;
444             a->cons_starts[i] = 0;
445             a->read_starts[0] = 0;
446         }
447         a->numseg = numseg;
448     }
449     return a;
450 }
451 
452 
ConsensusReadAlnFree(TConsensusReadAlnPtr a)453 extern TConsensusReadAlnPtr ConsensusReadAlnFree (TConsensusReadAlnPtr a)
454 {
455     if (a != NULL) {
456         if (a->lens != NULL) {
457             free (a->lens);
458             a->lens = NULL;
459         }
460         if (a->cons_starts != NULL) {
461             free (a->cons_starts);
462             a->cons_starts = NULL;
463         }
464         if (a->read_starts != NULL) {
465             free (a->read_starts);
466             a->read_starts = NULL;
467         }
468         free (a);
469         a = NULL;
470     }
471     return a;
472 }
473 
474 
GetConsensusReadAln(char * consensus_seq,TContigReadPtr read)475 extern TConsensusReadAlnPtr GetConsensusReadAln (char *consensus_seq, TContigReadPtr read)
476 {
477     TConsensusReadAlnPtr aln = NULL;
478     char *c;
479     char *c_start;
480     char *r;
481     char *r_start;
482     int numseg = 0, aln_len, pos, seg, con_offset = 0, read_offset = 0;
483     char con_gap_open = 0, read_gap_open = 0, gap_change;
484 
485     if (consensus_seq == NULL || read == NULL) {
486         return NULL;
487     }
488 
489     if (read->cons_start > 0) {
490         c_start = consensus_seq + read->cons_start;
491         r_start = read->read_seq + read->read_assem_start - 1;
492     } else {
493         c_start = consensus_seq;
494         r_start = read->read_seq + read->read_assem_start - 1;
495     }
496 
497     aln_len = read->cons_stop - read->cons_start + 1;
498     while (*c_start == '*' && *r_start == '*') {
499         c_start++;
500         r_start++;
501         aln_len--;
502     }
503 
504     /* first, count number of segments needed */
505     c = c_start;
506     r = r_start;
507     if (*c != '*' && *r != '*') {
508       numseg++;
509     }
510     pos = 0;
511     while (*c != 0 && *r != 0 && pos < aln_len) {
512         if (*c == '*' && *r == '*') {
513             /* both in gap - ignore */
514         } else {
515             gap_change = 0;
516             if (*c == '*') {
517                 if (!con_gap_open) {
518                     gap_change = 1;
519                     con_gap_open = 1;
520                 }
521             } else {
522                 if (con_gap_open) {
523                     gap_change = 1;
524                     con_gap_open = 0;
525                 }
526             }
527             if (*r == '*') {
528                 if (!read_gap_open) {
529                     gap_change = 1;
530                     read_gap_open = 1;
531                 }
532             } else {
533                 if (read_gap_open) {
534                     gap_change = 1;
535                     read_gap_open = 0;
536                 }
537             }
538             if (gap_change) {
539                 numseg++;
540             }
541         }
542         c++;
543         r++;
544         pos++;
545     }
546 
547     /* create alignment */
548     aln = ConsensusReadAlnNew (numseg);
549     pos = 0;
550     seg = 0;
551 
552 
553     c = consensus_seq;
554     while (c < c_start) {
555         if (*c != '*') {
556             con_offset ++;
557         }
558         c++;
559     }
560 
561     r = read->read_seq;
562     while (r < r_start) {
563         if (*r != '*') {
564             read_offset ++;
565         }
566         r++;
567     }
568 
569 
570     if (*c_start == '*') {
571         aln->cons_starts[0] = -1;
572         con_gap_open = 1;
573     } else {
574         aln->cons_starts[0] = con_offset;
575         con_gap_open = 0;
576     }
577 
578     if (*r_start == '*') {
579         aln->read_starts[0] = -1;
580         read_gap_open = 1;
581     } else {
582         aln->read_starts[0] = read_offset;
583         read_gap_open = 0;
584     }
585 
586     c = c_start + 1;
587     r = r_start + 1;
588     aln->lens[0] = 1;
589     pos = 1;
590 
591     while (*c != 0 && *r != 0 && pos < aln_len) {
592         if (*c == '*' && *r == '*') {
593             /* both in gap - ignore */
594         } else {
595             gap_change = 0;
596             if (*c == '*') {
597                 if (!con_gap_open) {
598                     gap_change = 1;
599                     con_gap_open = 1;
600                 }
601             } else {
602                 if (con_gap_open) {
603                     gap_change = 1;
604                     con_gap_open = 0;
605                 }
606             }
607             if (*r == '*') {
608                 if (!read_gap_open) {
609                     gap_change = 1;
610                     read_gap_open = 1;
611                 }
612             } else {
613                 if (read_gap_open) {
614                     gap_change = 1;
615                     read_gap_open = 0;
616                 }
617             }
618             if (gap_change) {
619                 seg++;
620                 if (con_gap_open) {
621                     aln->cons_starts[seg] = -1;
622                 } else if (aln->cons_starts[seg - 1] > -1) {
623                     aln->cons_starts[seg] = aln->cons_starts[seg - 1] + aln->lens[seg - 1];
624                 } else if (seg > 1 && aln->cons_starts[seg - 2] > -1) {
625                     aln->cons_starts[seg] = aln->cons_starts[seg - 2] + aln->lens[seg - 2];
626                 } else {
627                     aln->cons_starts[seg] = con_offset;
628                 }
629                 if (read_gap_open) {
630                     aln->read_starts[seg] = -1;
631                 } else if (aln->read_starts[seg - 1] > -1) {
632                     aln->read_starts[seg] = aln->read_starts[seg - 1] + aln->lens[seg - 1];
633                 } else if (seg > 1 && aln->read_starts[seg - 2] > -1) {
634                     aln->read_starts[seg] = aln->read_starts[seg - 2] + aln->lens[seg - 2];
635                 } else {
636                     aln->read_starts[seg] = read_offset;
637                 }
638             }
639             aln->lens[seg]++;
640         }
641         c++;
642         r++;
643         pos++;
644     }
645 
646     /* todo - adjust starts for complement */
647     if (read->is_complement) {
648       for (seg = 0; seg < aln->numseg; seg++) {
649         if (aln->read_starts[seg] > -1) {
650           aln->read_starts[seg] = read->read_len - aln->read_starts[seg] - aln->lens[seg];
651         }
652       }
653       aln->is_complement = 1;
654     }
655 
656     return aln;
657 }
658 
659 
ContigNew(void)660 extern TContigPtr ContigNew (void)
661 {
662     TContigPtr c;
663 
664     c = (TContigPtr) malloc (sizeof (SContig));
665     if (c == NULL) {
666         return NULL;
667     }
668     c->consensus_id = NULL;
669     c->consensus_seq = NULL;
670     c->consensus_assem_len = 0;
671     c->consensus_seq_len = 0;
672     c->is_complement = 0;
673     c->num_qual_scores = 0;
674     c->qual_scores = NULL;
675     c->num_reads = 0;
676     c->reads = NULL;
677     c->gaps = NULL;
678     c->num_reads = 0;
679     c->reads = NULL;
680     c->num_base_segs = 0;
681     c->base_segs = NULL;
682     c->tag = NULL;
683 
684     return c;
685 }
686 
687 
ContigFree(TContigPtr c)688 extern void ContigFree (TContigPtr c)
689 {
690     int i;
691 
692     if (c != NULL) {
693         if (c->consensus_id != NULL) free (c->consensus_id);
694         if (c->consensus_seq != NULL) free (c->consensus_seq);
695         if (c->qual_scores != NULL) free (c->qual_scores);
696 
697         if (c->reads != NULL) {
698             for (i = 0; i < c->num_reads; i++) {
699                 if (c->reads[i] != NULL) {
700                     ContigReadFree (c->reads[i]);
701                 }
702             }
703             free (c->reads);
704         }
705         if (c->base_segs != NULL) {
706             for (i = 0; i < c->num_base_segs; i++) {
707                 if (c->base_segs[i] != NULL) {
708                     BaseSegFree (c->base_segs[i]);
709                 }
710             }
711             free (c->base_segs);
712         }
713         if (c->tag != NULL) {
714             free (c->tag);
715         }
716         free (c);
717     }
718 }
719 
720 
ACEFileNew()721 extern TACEFilePtr ACEFileNew ()
722 {
723     TACEFilePtr afp;
724 
725     afp = (TACEFilePtr) malloc (sizeof (SACEFile));
726     if (afp == NULL) {
727         return NULL;
728     }
729     afp->num_contigs = 0;
730     afp->contigs = NULL;
731 
732     return afp;
733 }
734 
735 
ACEFileFree(TACEFilePtr afp)736 extern void ACEFileFree (TACEFilePtr afp)
737 {
738     int i;
739 
740     if (afp != NULL) {
741         for (i = 0; i < afp->num_contigs; i++) {
742             ContigFree (afp->contigs[i]);
743         }
744         free (afp->contigs);
745         free (afp);
746     }
747 }
748 
749 
s_IsSeqChar(char ch)750 static char s_IsSeqChar (char ch)
751 {
752     switch (ch) {
753         case '*':
754         case '-':
755         case 'A':
756         case 'B':
757         case 'C':
758         case 'D':
759         case 'E':
760         case 'F':
761         case 'G':
762         case 'H':
763         case 'I':
764         case 'J':
765         case 'K':
766         case 'L':
767         case 'M':
768         case 'N':
769         case 'O':
770         case 'P':
771         case 'Q':
772         case 'R':
773         case 'S':
774         case 'T':
775         case 'U':
776         case 'V':
777         case 'W':
778         case 'X':
779         case 'Y':
780         case 'Z':
781         case 'a':
782         case 'b':
783         case 'c':
784         case 'd':
785         case 'e':
786         case 'f':
787         case 'g':
788         case 'h':
789         case 'i':
790         case 'j':
791         case 'k':
792         case 'l':
793         case 'm':
794         case 'n':
795         case 'o':
796         case 'p':
797         case 'q':
798         case 'r':
799         case 's':
800         case 't':
801         case 'u':
802         case 'v':
803         case 'w':
804         case 'x':
805         case 'y':
806         case 'z':
807             return 1;
808             break;
809         default:
810             return 0;
811             break;
812     }
813 }
814 
815 
s_IsEOF(char * linestring)816 static char s_IsEOF (char *linestring)
817 {
818     if (linestring == NULL || linestring [0] == EOF) {
819         return 1;
820     } else {
821         return 0;
822     }
823 }
824 
825 
826 static char *
s_ReadSequenceFromFile(int len,FReadLineFunction readfunc,void * userdata,char * id,char * has_errors)827 s_ReadSequenceFromFile
828 (int                  len,
829  FReadLineFunction    readfunc,
830  void *               userdata,
831  char *               id,
832  char *               has_errors)
833 {
834     char *seq;
835     char *linestring;
836     char *cp;
837     int  pos = 0;
838 
839     /* copy in sequence data */
840     seq = malloc (len + 1);
841     linestring = readfunc (userdata);
842     while (!s_IsEOF (linestring) && s_IsSeqChar (linestring [0])) {
843         /* append to consensus */
844         cp = linestring;
845         while (s_IsSeqChar (*cp) && pos < len) {
846             if (isalpha (*cp)) {
847                 seq [pos] = toupper (*cp);
848             } else if (*cp == '-') {
849                 seq [pos] = '*';
850             } else {
851                 seq [pos] = *cp;
852             }
853             pos++;
854             cp++;
855         }
856         if (s_IsSeqChar (*cp)) {
857             PrintACEFormatErrorXML ("Too many sequence characters!", id, has_errors);
858             free (seq);
859             return NULL;
860         }
861         free (linestring);
862         linestring = readfunc (userdata);
863     }
864     free (linestring);
865     if (pos < len) {
866         PrintACEFormatErrorXML ("Too few sequence characters!", id, has_errors);
867         free (seq);
868         seq = NULL;
869     } else {
870         seq[pos] = 0;
871     }
872     return seq;
873 }
874 
875 
s_LineIsEmptyButNotEof(char * linestring)876 static char s_LineIsEmptyButNotEof (char *linestring)
877 {
878     char *cp;
879     if (s_IsEOF (linestring)) {
880         return 0;
881     }
882 
883     cp = linestring;
884     while (*cp != 0 && isspace (*cp)) {
885         cp++;
886     }
887     if (*cp == 0) {
888         return 1;
889     } else {
890         return 0;
891     }
892 }
893 
894 
s_SkipQualScores(FReadLineFunction readfunc,void * userdata)895 static void s_SkipQualScores
896 (FReadLineFunction    readfunc,
897  void *               userdata)
898 {
899     char * linestring;
900     char * cp;
901     if (readfunc == NULL) return;
902 
903     linestring = readfunc (userdata);
904     while (s_LineIsEmptyButNotEof (linestring)) {
905         free (linestring);
906         linestring = readfunc (userdata);
907     }
908     if (linestring == NULL  ||  linestring [0] == EOF || strcmp (linestring, "BQ") != 0) {
909         return;
910     }
911     linestring = readfunc (userdata);
912     while (!s_IsEOF (linestring)
913            && isdigit (*(cp = linestring + strspn (linestring, " \t")))) {
914         free (linestring);
915         linestring = readfunc (userdata);
916     }
917     free (linestring);
918 }
919 
920 
s_ReadQualScores(TContigPtr contig,FReadLineFunction readfunc,void * userdata)921 static EFound s_ReadQualScores
922 (TContigPtr contig,
923  FReadLineFunction    readfunc,
924  void *               userdata)
925 {
926     char * linestring;
927     char * cp;
928     int    pos;
929 
930     if (contig == NULL || readfunc == NULL || contig->consensus_assem_len == 0) {
931         return eNone;
932     }
933 
934     linestring = readfunc (userdata);
935     while (s_LineIsEmptyButNotEof (linestring)) {
936         free (linestring);
937         linestring = readfunc (userdata);
938     }
939     if (linestring == NULL  ||  linestring [0] == EOF || strcmp (linestring, "BQ") != 0) {
940         return eNone;
941     }
942 
943     /* read quality scores */
944     contig->num_qual_scores = contig->consensus_assem_len;
945     /* no score for * in consensus seq */
946     for (pos = 0; pos < contig->consensus_assem_len; pos++) {
947       if (contig->consensus_seq[pos] == '*') {
948         contig->num_qual_scores --;
949       }
950     }
951     contig->qual_scores = malloc (sizeof (int) * contig->num_qual_scores);
952     pos = 0;
953     linestring = readfunc (userdata);
954     while (!s_IsEOF (linestring)
955            && isdigit (*(cp = linestring + strspn (linestring, " \t")))) {
956         while (isdigit (*cp) && pos < contig->num_qual_scores) {
957             contig->qual_scores [pos] = atoi (cp);
958             pos++;
959             while (isdigit (*cp)) {
960                 cp++;
961             }
962             while (isspace (*cp)) {
963                 cp++;
964             }
965         }
966         if (isdigit (*cp)) {
967             return eTooMany;
968         }
969         free (linestring);
970         linestring = readfunc (userdata);
971     }
972     if (pos < contig->num_qual_scores) {
973         return eTooFew;
974     } else {
975         return eJustRight;
976     }
977 }
978 
979 
s_ReadAFLines(TContigPtr contig,FReadLineFunction readfunc,void * userdata,char ** next_line)980 static EFound s_ReadAFLines
981 (TContigPtr contig,
982  FReadLineFunction    readfunc,
983  void *               userdata,
984  char **              next_line)
985 {
986     char * linestring;
987     char * cp;
988     int    read_num, len;
989     EFound rval = eJustRight;
990 
991     if (contig == NULL || readfunc == NULL || contig->num_reads == 0) return eNone;
992 
993     /* get AF lines */
994     contig->reads = malloc (contig->num_reads * sizeof (TContigReadPtr));
995     linestring = readfunc (userdata);
996     while (s_LineIsEmptyButNotEof (linestring)) {
997         free (linestring);
998         linestring = readfunc (userdata);
999     }
1000     if (linestring == NULL  ||  linestring [0] == EOF || strncmp (linestring, "AF", 2) != 0) {
1001         *next_line = linestring;
1002         return eNone;
1003     }
1004 
1005     read_num = 0;
1006     while (!s_IsEOF(linestring) && read_num < contig->num_reads
1007            && linestring [0] == 'A' && linestring [1] == 'F' && isspace (linestring [2])) {
1008         contig->reads[read_num] = ContigReadNew ();
1009         len = strlen (linestring + 3);
1010         contig->reads[read_num]->read_id = malloc (len + 1);
1011         strcpy (contig->reads[read_num]->read_id, linestring + 3);
1012         cp = contig->reads[read_num]->read_id;
1013         while (*cp != 0 && !isspace (*cp)) {
1014             cp++;
1015         }
1016         if (isspace (*cp)) {
1017             *cp = 0;
1018             cp++;
1019         }
1020         if (*cp == 'C') {
1021             contig->reads[read_num]->is_complement = 1;
1022         } else if (*cp != 'U') {
1023             *next_line = linestring;
1024             return eUnexpected;
1025         }
1026         cp++;
1027         if (isspace (*cp)) {
1028             cp++;
1029         }
1030         contig->reads[read_num]->cons_start = atoi (cp) - 1;
1031         read_num++;
1032         free (linestring);
1033         linestring = readfunc (userdata);
1034     }
1035     if (read_num < contig->num_reads) {
1036         rval = eTooFew;
1037     } else if (!s_IsEOF(linestring) && strncmp (linestring, "AF ", 3) == 0) {
1038         rval = eTooMany;
1039     } else {
1040         rval = eJustRight;
1041     }
1042     *next_line = linestring;
1043     return rval;
1044 }
1045 
1046 
s_ReadBaseSegs(TContigPtr contig,int num_base_segs,char ** firstline,FReadLineFunction readfunc,void * userdata)1047 static EFound s_ReadBaseSegs
1048 (TContigPtr           contig,
1049  int                  num_base_segs,
1050  char **              firstline,
1051  FReadLineFunction    readfunc,
1052  void *               userdata)
1053 {
1054     char * linestring;
1055 
1056     if (contig == NULL || readfunc == NULL || num_base_segs == 0 || firstline == NULL) return eNone;
1057 
1058     contig->base_segs = malloc (sizeof (TBaseSegPtr) * num_base_segs);
1059     contig->num_base_segs = 0;
1060 
1061     /* get BS lines */
1062     linestring = *firstline;
1063     while (s_LineIsEmptyButNotEof (linestring)) {
1064         free (linestring);
1065         linestring = readfunc (userdata);
1066     }
1067     if (linestring == NULL  ||  linestring [0] == EOF || strncmp (linestring, "BS", 2) != 0) {
1068         return eNone;
1069     }
1070 
1071     while (linestring != NULL  &&  linestring [0] != EOF && contig->num_base_segs < num_base_segs
1072            && linestring [0] == 'B' && linestring [1] == 'S' && isspace (linestring [2])) {
1073         contig->base_segs[contig->num_base_segs++] = s_ReadBaseSeg (linestring);
1074         free (linestring);
1075         linestring = readfunc (userdata);
1076     }
1077     *firstline = linestring;
1078     if (contig->num_base_segs < num_base_segs) {
1079         return eTooFew;
1080     } else if (linestring != NULL && linestring [0] == 'B' && linestring[1] == 'S') {
1081         return eTooMany;
1082     } else {
1083         return eJustRight;
1084     }
1085 }
1086 
1087 
s_IsEquivN(char ch)1088 static char s_IsEquivN (char ch)
1089 {
1090     if (ch == 'N' || ch == 'X') {
1091         return 1;
1092     } else {
1093         return 0;
1094     }
1095 }
1096 
1097 
1098 /* Terminal Ns will always be trimmed in the GenBank records */
s_AdjustContigReadForTerminalNs(TContigReadPtr read)1099 static void s_AdjustContigReadForTerminalNs (TContigReadPtr read)
1100 {
1101     char * cp_src;
1102     char * cp_dst;
1103     int    len = 0;
1104 
1105     if (read == NULL || read->read_seq == NULL) return;
1106     cp_src = read->read_seq;
1107     while (s_IsEquivN(*cp_src)) {
1108         len++;
1109         cp_src++;
1110     }
1111     if (len > 0) {
1112         read->cons_start += len;
1113         cp_dst = read->read_seq;
1114         while (*cp_src != 0) {
1115             *cp_dst = *cp_src;
1116             cp_dst++;
1117             cp_src++;
1118         }
1119         *cp_dst = 0;
1120     }
1121     len = strlen (read->read_seq);
1122     cp_src = read->read_seq + len - 1;
1123     while (cp_src >= read->read_seq && s_IsEquivN(*cp_src)) {
1124         *cp_src = 0;
1125         cp_src--;
1126     }
1127 }
1128 
1129 
1130 /* Terminal Ns will always be trimmed by the GenBank record */
s_AdjustContigForTerminalNs(TContigPtr contig)1131 static void s_AdjustContigForTerminalNs (TContigPtr contig)
1132 {
1133     char * cp_src;
1134     char * cp_dst;
1135     int    len = 0, i;
1136 
1137     if (contig == NULL || contig->consensus_seq == NULL) return;
1138     cp_src = contig->consensus_seq;
1139     while (s_IsEquivN(*cp_src)) {
1140         len++;
1141         cp_src++;
1142     }
1143     if (len > 0) {
1144         /* adjust quality scores */
1145         if (contig->qual_scores != NULL) {
1146             contig->num_qual_scores -= len;
1147             for (i = 0; i < contig->num_qual_scores; i++) {
1148                 contig->qual_scores[i] = contig->qual_scores [i + len];
1149             }
1150         }
1151         /* adjust reads */
1152         if (contig->reads != NULL) {
1153             for (i = 0; i < contig->num_reads; i++) {
1154                 if (contig->reads[i] != NULL) {
1155                     contig->reads[i]->cons_start -= len;
1156                 }
1157             }
1158         }
1159         /* adjust consensus sequence */
1160         cp_dst = contig->consensus_seq;
1161         while (*cp_src != 0) {
1162             *cp_dst = *cp_src;
1163             cp_dst++;
1164             cp_src++;
1165         }
1166         *cp_dst = 0;
1167         contig->consensus_assem_len -= len;
1168     }
1169     /* trim 3' Ns */
1170     len = 0;
1171     cp_src = contig->consensus_seq + contig->consensus_assem_len - 1;
1172     while (cp_src >= contig->consensus_seq && s_IsEquivN(*cp_src)) {
1173         *cp_src = 0;
1174         cp_src--;
1175         contig->consensus_assem_len--;
1176         len++;
1177     }
1178     /* truncate quality scores if 3' Ns trimmed */
1179     if (len > 0 && contig->qual_scores != NULL) {
1180         contig->num_qual_scores -= len;
1181     }
1182 }
1183 
1184 
1185 /* Clips the sequence read in according to the QA clipping.
1186  * The real coordinates will be recovered when an alignment is generated between
1187  * the sequence in the structure and the sequence downloaded from the Trace Archive.
1188  */
ApplyQALineToRead(TContigReadPtr read,char * linestring,char * id,char * has_errors)1189 static char ApplyQALineToRead (TContigReadPtr read, char *linestring, char *id, char *has_errors)
1190 {
1191     char *cp;
1192     int  values[4];
1193     int  i = 0;
1194 
1195     if (read == NULL || linestring == NULL) {
1196         PrintACEFormatErrorXML ("File end where QA line should be", id, has_errors);
1197         return 0;
1198     }
1199 
1200     cp = linestring;
1201     if (*cp != 'Q') {
1202         PrintACEFormatErrorXMLStart (id, has_errors);
1203         printf ("Expected QA line, found %s", linestring);
1204         PrintACEFormatErrorXMLEnd ();
1205         return 0;
1206     }
1207     cp++;
1208     if (*cp != 'A') {
1209         PrintACEFormatErrorXMLStart (id, has_errors);
1210         printf ("Expected QA line, found %s", linestring);
1211         PrintACEFormatErrorXMLEnd ();
1212         return 0;
1213     }
1214     cp++;
1215     while (*cp != 0 && i < 4) {
1216         while (isspace (*cp)) {
1217             cp++;
1218         }
1219         if (*cp != '-' && !isdigit (*cp)) {
1220           PrintACEFormatErrorXML ("Found non-number on QA line", id, has_errors);
1221           return 0;
1222         }
1223         values[i] = atoi (cp);
1224         i++;
1225         while (*cp == '-' || isdigit (*cp)) {
1226             cp++;
1227         }
1228     }
1229     if (*cp != 0 || i < 4) {
1230         PrintACEFormatErrorXML ("Fewer than four numbers on line", id, has_errors);
1231         return 0;
1232     }
1233     if (values[0] > 0 || values[2] > 0) {
1234         if (values[0] > values[2]) {
1235             read->read_assem_start = values[0];
1236         } else {
1237             read->read_assem_start = values[2];
1238         }
1239     }
1240 
1241     if (values[1] > 0 && values[3] > 0) {
1242         if (values[1] < values[3]) {
1243             read->read_assem_stop = values[1];
1244         } else {
1245             read->read_assem_stop = values[3];
1246         }
1247     } else if (values[1] > 0) {
1248         read->read_assem_stop = values[1];
1249     } else if (values[3] > 0) {
1250         read->read_assem_stop = values[3];
1251     }
1252 
1253     /* adjust first gap position for start */
1254     if (read->read_assem_start > 1 && read->gaps != NULL && read->gaps->num_gaps > 0 && read->gaps->gap_offsets != NULL) {
1255         read->gaps->gap_offsets[0] -= read->read_assem_start - 1;
1256     }
1257 
1258     return 1;
1259 }
1260 
1261 
1262 /* calculate gap info for consensus sequence */
1263 /* calculate cons_stop positions and tiling positions for each read */
s_CalculateContigOffsets(TContigPtr contig)1264 static void s_CalculateContigOffsets (TContigPtr contig)
1265 {
1266     int i;
1267 
1268     if (contig == NULL) return;
1269 
1270     for (i = 0; i < contig->num_reads; i++) {
1271         contig->reads[i]->tiling_start = contig->reads[i]->read_assem_start + contig->reads[i]->cons_start;
1272         contig->reads[i]->tiling_stop = contig->reads[i]->read_assem_stop + contig->reads[i]->cons_start;
1273         contig->reads[i]->cons_stop = SeqPosFromTilingPos (contig->reads[i]->tiling_stop - 1, contig->gaps) + 1;
1274         contig->reads[i]->read_start = SeqPosFromTilingPos(contig->reads[i]->read_assem_start - 1, contig->reads[i]->gaps) + 1;
1275         contig->reads[i]->read_stop = SeqPosFromTilingPos(contig->reads[i]->read_assem_stop - 1, contig->reads[i]->gaps) + 1;
1276     }
1277 
1278 }
1279 
1280 
s_GetUngappedSeqLen(char * str,char gap_char)1281 static int s_GetUngappedSeqLen (char *str, char gap_char)
1282 {
1283     int len = 0;
1284 
1285     if (str == NULL) return 0;
1286     while (*str != 0) {
1287         if (*str != gap_char) {
1288             len++;
1289         }
1290         str++;
1291     }
1292     return len;
1293 }
1294 
1295 
s_AddToTagComment(char * orig,char * extra)1296 static char * s_AddToTagComment (char *orig, char *extra)
1297 {
1298     char * tag = NULL;
1299     int    tag_len;
1300 
1301     if (orig == NULL) {
1302         tag = extra;
1303     } else if (extra == NULL) {
1304         tag = orig;
1305     } else {
1306         tag_len = strlen (orig) + strlen (extra) + 1;
1307         tag = malloc (sizeof (char) * (tag_len + 1));
1308         strcpy (tag, orig);
1309         strcat (tag, "\n");
1310         strcat (tag, extra);
1311         free (orig);
1312         free (extra);
1313     }
1314     return tag;
1315 }
1316 
1317 
1318 typedef enum {
1319     eTagCommentStatus_ok = 0,
1320     eTagCommentStatus_runon
1321 } ETagCommentStatus;
1322 
s_ReadTagComment(FReadLineFunction readfunc,void * userdata,ETagCommentStatus * comment_status)1323 static char * s_ReadTagComment
1324 (FReadLineFunction    readfunc,
1325  void *               userdata,
1326  ETagCommentStatus    *comment_status)
1327 {
1328     char *linestring;
1329     char *tag = NULL;
1330     char *cp = NULL;
1331     char *tmp;
1332     int   tag_len = 0, end_len;
1333 
1334     linestring = readfunc (userdata);
1335     while (linestring != NULL  &&  linestring [0] != EOF && (cp = strchr (linestring, '}')) == NULL) {
1336         if (tag == NULL) {
1337             tag_len = strlen (linestring);
1338             tag = malloc (sizeof (char) * (tag_len + 1));
1339             strcpy (tag, linestring);
1340         } else {
1341             tag_len = tag_len + strlen (linestring) + 1;
1342             tmp = malloc (sizeof (char) * (tag_len + 1));
1343             strcpy (tmp, tag);
1344             strcat (tmp, "\n");
1345             strcat (tmp, linestring);
1346             free (tag);
1347             tag = tmp;
1348         }
1349         free (linestring);
1350         if (tag_len > 1000) {
1351            *comment_status = eTagCommentStatus_runon;
1352            free (tag);
1353            return NULL;
1354         }
1355         linestring = readfunc (userdata);
1356     }
1357     if (cp != NULL && cp > linestring) {
1358         end_len = cp - linestring;
1359         tag_len = tag_len + end_len + 1;
1360         tmp = malloc (sizeof (char) * (tag_len + 1));
1361         strcpy (tmp, tag);
1362         strcat (tmp, "\n");
1363         strncat (tmp, linestring, end_len);
1364         tmp[tag_len] = 0;
1365         free (tag);
1366         tag = tmp;
1367     }
1368     if (linestring != NULL) {
1369         free (linestring);
1370     }
1371 
1372     return tag;
1373 }
1374 
1375 
1376 /* Reads the portion of and ACE file for a single contig, including the reads */
s_ReadContig(char ** initline,FReadLineFunction readfunc,void * userdata,char make_qual_scores,char * has_errors)1377 static TContigPtr s_ReadContig
1378 (char **              initline,
1379  FReadLineFunction    readfunc,
1380  void *               userdata,
1381  char                 make_qual_scores,
1382  char *               has_errors)
1383 {
1384     char      *linestring;
1385     char      *firstline;
1386     char      *cp;
1387     int        len = 0, read_num = 0, num_base_segs = 0, report_read_num = 0;
1388     EFound     val;
1389     char       found_comp_char = 0;
1390     TContigPtr contig = NULL;
1391     ETagCommentStatus comment_status = eTagCommentStatus_ok;
1392 
1393     if (initline == NULL) return NULL;
1394     firstline = *initline;
1395     if (firstline == NULL || readfunc == NULL) return NULL;
1396 
1397     if (firstline [0] != 'C' || firstline [1] != 'O' || ! isspace (firstline [2])) {
1398         return NULL;
1399     }
1400 
1401     contig = ContigNew ();
1402     len = strlen (firstline + 3);
1403     contig->consensus_id = malloc (len + 1);
1404     strcpy (contig->consensus_id, firstline + 3);
1405 
1406     cp = contig->consensus_id;
1407     while (*cp != 0 && !isspace (*cp)) {
1408         cp++;
1409     }
1410     if (isspace (*cp)) {
1411         *cp = 0;
1412         cp++;
1413         contig->consensus_assem_len = atoi (cp);
1414         while (isdigit (*cp)) {
1415             cp++;
1416         }
1417         if (isspace (*cp)) {
1418             cp++;
1419             contig->num_reads = atoi (cp);
1420             while (isdigit (*cp)) {
1421                 cp++;
1422             }
1423             if (isspace (*cp)) {
1424                 cp++;
1425                 num_base_segs = atoi (cp);
1426                 while (isdigit (*cp)) {
1427                     cp++;
1428                 }
1429                 if (isspace (*cp)) {
1430                     cp++;
1431                     found_comp_char = 1;
1432                     if (*cp == 'C') {
1433                         contig->is_complement = 1;
1434                     } else {
1435                         contig->is_complement = 0;
1436                     }
1437                 }
1438             }
1439         }
1440     }
1441     if (contig->consensus_assem_len == 0 || contig->num_reads == 0 || !found_comp_char) {
1442         PrintACEFormatErrorXML ("Error in consensus line", contig->consensus_id, has_errors);
1443         ContigFree (contig);
1444         return NULL;
1445     }
1446 
1447     /* now copy in sequence data */
1448     contig->consensus_seq = s_ReadSequenceFromFile (contig->consensus_assem_len, readfunc, userdata, contig->consensus_id, has_errors);
1449     if (contig->consensus_seq == NULL) {
1450         ContigFree (contig);
1451         return NULL;
1452     }
1453 
1454     /* record actual length of consensus seq */
1455     contig->consensus_seq_len = s_GetUngappedSeqLen (contig->consensus_seq, '*');
1456 
1457     /* calculate gap info */
1458     contig->gaps = GapInfoFromSequenceString (contig->consensus_seq, '*');
1459 
1460     /* read quality scores */
1461     if (make_qual_scores) {
1462         val = s_ReadQualScores (contig, readfunc, userdata);
1463         if (val != eNone && val != eJustRight) {
1464             s_ReportFound (val, "quality scores", contig->consensus_id, has_errors);
1465             ContigFree (contig);
1466             return NULL;
1467         }
1468     } else {
1469         s_SkipQualScores (readfunc, userdata);
1470     }
1471 
1472     /* collect reads */
1473     val = s_ReadAFLines (contig, readfunc, userdata, &linestring);
1474     if (val != eJustRight) {
1475         s_ReportFound (val, "AF lines", contig->consensus_id, has_errors);
1476         ContigFree (contig);
1477         if (linestring != NULL) free (linestring);
1478         return NULL;
1479     }
1480 
1481     if (num_base_segs > 0) {
1482         val = s_ReadBaseSegs (contig, num_base_segs, &linestring, readfunc, userdata);
1483         if (val != eJustRight) {
1484             s_ReportFound (val, "base segments", contig->consensus_id, has_errors);
1485             ContigFree (contig);
1486             return NULL;
1487         }
1488     }
1489 
1490 
1491     read_num = 0;
1492     while (linestring != NULL  &&  linestring [0] != EOF) {
1493         if (linestring [0] == 'R' && linestring[1] == 'D' && isspace (linestring [2])) {
1494             len = strlen (contig->reads[read_num]->read_id);
1495             if (strncmp (linestring + 3, contig->reads[read_num]->read_id, len) != 0
1496                 || !isspace (linestring [3 + len])) {
1497                 PrintACEFormatErrorXML ("Read IDs out of order!", contig->consensus_id, has_errors);
1498                 ContigFree (contig);
1499                 return NULL;
1500             }
1501             len = atoi (linestring + 3 + len);
1502             contig->reads[read_num]->read_seq = s_ReadSequenceFromFile (len, readfunc, userdata, contig->reads[read_num]->read_id, has_errors);
1503             if (contig->reads[read_num]->read_seq == NULL) {
1504                 ContigFree (contig);
1505                 return NULL;
1506             }
1507             s_AdjustContigReadForTerminalNs (contig->reads[read_num]);
1508             contig->reads[read_num]->read_len = s_GetUngappedSeqLen (contig->reads[read_num]->read_seq, '*');
1509             contig->reads[read_num]->gaps = GapInfoFromSequenceString (contig->reads[read_num]->read_seq, '*');
1510             read_num++;
1511             report_read_num = read_num - 1;
1512         } else if (linestring [0] == 'Q' && linestring[1] == 'A' && isspace (linestring[2])) {
1513             if (read_num < 1) {
1514                 PrintACEFormatErrorXML ("Found QA line before RD!", contig->consensus_id, has_errors);
1515                 ContigFree (contig);
1516                 return NULL;
1517             } else if (!ApplyQALineToRead (contig->reads[report_read_num], linestring, contig->reads[report_read_num]->read_id, has_errors)) {
1518                 PrintACEFormatErrorXML ("Error in QA line format!", contig->reads[report_read_num]->read_id, has_errors);
1519                 ContigFree (contig);
1520                 return NULL;
1521             }
1522         } else if (linestring[0] == 'D' && linestring[1] == 'S' && (isspace (linestring[2]) || linestring[2] == 0)) {
1523             /* skip DS lines */
1524         } else if (strncmp (linestring, "RT{", 3) == 0) {
1525             contig->reads[read_num - 1]->tag = s_AddToTagComment (contig->reads[report_read_num]->tag, s_ReadTagComment (readfunc, userdata, &comment_status));
1526         } else if (strncmp (linestring, "WR{", 3) == 0) {
1527             contig->reads[read_num - 1]->tag = s_AddToTagComment (contig->reads[report_read_num]->tag, s_ReadTagComment (readfunc, userdata, &comment_status));
1528         } else if (strncmp (linestring, "CT{", 3) == 0) {
1529             contig->tag = s_AddToTagComment (contig->tag, s_ReadTagComment (readfunc, userdata, &comment_status));
1530         } else if (strncmp (linestring, "WA{", 3) == 0) {
1531             contig->tag = s_AddToTagComment (contig->tag, s_ReadTagComment (readfunc, userdata, &comment_status));
1532         } else if (linestring[0] != 0) {
1533             /* found next line */
1534             *initline = linestring;
1535             s_AdjustContigForTerminalNs (contig);
1536             s_CalculateContigOffsets (contig);
1537             return contig;
1538         }
1539         free (linestring);
1540         if (comment_status == eTagCommentStatus_runon) {
1541             PrintACEFormatErrorXML ("Error in CT line - comment tag is unusally long, suspect missing terminating }", contig->reads[report_read_num]->read_id, has_errors);
1542             ContigFree (contig);
1543             return NULL;
1544         }
1545 
1546         linestring = readfunc (userdata);
1547     }
1548     *initline = NULL;
1549     s_AdjustContigForTerminalNs (contig);
1550     s_CalculateContigOffsets (contig);
1551     return contig;
1552 }
1553 
1554 
1555 /* Used to detect errors in ACE file formatting */
s_UnexpectedLineBetweenContigs(char * linestring)1556 static char s_UnexpectedLineBetweenContigs (char *linestring)
1557 {
1558     if (linestring == NULL) {
1559         return 0;
1560     } else if (linestring [0] == 'A' && linestring [1] == 'F') {
1561         return 1;
1562     } else if (linestring [0] == 'R' && linestring [1] == 'D') {
1563         return 1;
1564     } else {
1565         return 0;
1566     }
1567 }
1568 
1569 
1570 /* This is the main function for reading in an ACE file */
1571 extern TACEFilePtr
ReadACEFile(FReadLineFunction readfunc,void * userdata,char make_qual_scores,char * has_errors)1572 ReadACEFile
1573 (FReadLineFunction    readfunc,
1574  void *               userdata,
1575  char                 make_qual_scores,
1576  char *               has_errors)
1577 {
1578     char *              linestring;
1579     TACEFilePtr         afp;
1580     char *              cp;
1581     int                 contig_num = 0, read_num = 0;
1582     int                 num_reads_expected = 0;
1583 
1584     if (readfunc == NULL) {
1585         return NULL;
1586     }
1587 
1588     afp = ACEFileNew ();
1589     if (afp == NULL) {
1590         return NULL;
1591     }
1592 
1593     linestring = readfunc (userdata);
1594 
1595     while (linestring != NULL  &&  linestring [0] != EOF) {
1596         if (linestring [0] == 'A' && linestring [1] == 'S' && isspace (linestring [2])) {
1597             if (num_reads_expected > 0) {
1598                 PrintACEFormatErrorXML ("Two file header lines!", NULL, has_errors);
1599                 ACEFileFree (afp);
1600                 free (linestring);
1601                 return NULL;
1602             }
1603             /* first line in file, number of contigs */
1604             cp = linestring + 3;
1605             afp->num_contigs = atoi (cp);
1606             afp->contigs = malloc (afp->num_contigs * sizeof (TContigPtr));
1607             if (afp->contigs == NULL) {
1608                 PrintACEFormatErrorXML ("Memory allocation failed!", NULL, has_errors);
1609                 free (linestring);
1610                 ACEFileFree (afp);
1611                 return NULL;
1612             }
1613             while (isdigit (*cp)) {
1614                 cp++;
1615             }
1616             num_reads_expected = atoi (cp);
1617             free (linestring);
1618             linestring = readfunc (userdata);
1619         } else if (linestring [0] == 'C' && linestring [1] == 'O' && isspace (linestring [2])) {
1620             if (contig_num >= afp->num_contigs) {
1621                 PrintACEFormatErrorXML ("Too many contigs!", NULL, has_errors);
1622                 free (linestring);
1623                 ACEFileFree (afp);
1624                 return NULL;
1625             }
1626             afp->contigs[contig_num] = s_ReadContig (&linestring, readfunc, userdata, make_qual_scores, has_errors);
1627             if (afp->contigs[contig_num] == NULL) {
1628                 PrintACEFormatErrorXMLStart (NULL, has_errors);
1629                 printf ("Unable to read contig (%d)", contig_num);
1630                 PrintACEFormatErrorXMLEnd ();
1631                 ACEFileFree (afp);
1632                 return NULL;
1633             }
1634             read_num += afp->contigs[contig_num]->num_reads;
1635             contig_num++;
1636         } else if (s_UnexpectedLineBetweenContigs (linestring)) {
1637             PrintACEFormatErrorXMLStart (NULL, has_errors);
1638             printf ("Unexpected line after contig %d:%s", read_num, linestring);
1639             PrintACEFormatErrorXMLEnd ();
1640             free (linestring);
1641             ACEFileFree (afp);
1642             return NULL;
1643         } else {
1644             free (linestring);
1645             linestring = readfunc (userdata);
1646         }
1647     }
1648     if (contig_num < afp->num_contigs) {
1649         PrintACEFormatErrorXML ("Not enough contigs!", NULL, has_errors);
1650         ACEFileFree (afp);
1651         afp = NULL;
1652     } else if (read_num < num_reads_expected) {
1653         PrintACEFormatErrorXML ("Not enough reads!", NULL, has_errors);
1654         ACEFileFree (afp);
1655         afp = NULL;
1656     }
1657 
1658     return afp;
1659 }
1660 
1661 
1662 /* This function writes out sequence characters, 60 per line. */
s_WriteSeq(FILE * fp,char * seq)1663 static void s_WriteSeq (FILE *fp, char *seq)
1664 {
1665     int    i;
1666     char * cp;
1667 
1668     if (fp == NULL || seq == NULL) return;
1669     cp = seq;
1670     while (*cp != 0) {
1671         for (i = 0; i < 60 && *cp != 0; i++, cp++) {
1672             fprintf (fp, "%c", *cp);
1673         }
1674         fprintf (fp, "\n");
1675     }
1676 }
1677 
1678 
1679 /* This function writes out quality scores in the ACE file format. */
s_WriteQualScores(FILE * fp,TContigPtr contig)1680 static void s_WriteQualScores (FILE *fp, TContigPtr contig)
1681 {
1682     int q_pos, line_pos;
1683 
1684     if (fp == NULL || contig == NULL || contig->num_qual_scores == 0) return;
1685 
1686     fprintf (fp, "BQ\n");
1687     q_pos = 0;
1688     while (q_pos < contig->num_qual_scores) {
1689         line_pos = 0;
1690         while (line_pos < 60 && q_pos < contig->num_qual_scores) {
1691             if (contig->consensus_seq[q_pos] != '*') {
1692                 fprintf (fp, "%d ", contig->qual_scores[q_pos]);
1693                 line_pos++;
1694             }
1695             q_pos++;
1696         }
1697         fprintf (fp, "\n");
1698     }
1699 }
1700 
1701 
1702 /* NOTE - this file does not provide all of the information required for an ACE file. */
s_WriteContig(FILE * fp,TContigPtr contig)1703 static void s_WriteContig (FILE *fp, TContigPtr contig)
1704 {
1705     int i;
1706 
1707     if (contig == NULL) return;
1708 
1709     fprintf (fp, "CO %s %d %d\n\n", contig->consensus_id, contig->consensus_assem_len, contig->num_reads);
1710     s_WriteSeq (fp, contig->consensus_seq);
1711     fprintf (fp, "\n");
1712 
1713     s_WriteQualScores (fp, contig);
1714     fprintf (fp, "\n");
1715 
1716     for (i = 0; i < contig->num_reads; i++) {
1717         fprintf (fp, "AF %s %c %d\n", contig->reads[i]->read_id,
1718                                       contig->reads[i]->is_complement ? 'C' : 'U',
1719                                       contig->reads[i]->cons_start + 1);
1720     }
1721     fprintf (fp, "\n");
1722     for (i = 0; i < contig->num_reads; i++) {
1723         fprintf (fp, "RD %s %d\n", contig->reads[i]->read_id, (int) strlen (contig->reads[i]->read_seq));
1724         s_WriteSeq (fp, contig->reads[i]->read_seq);
1725         fprintf (fp, "\n");
1726     }
1727 
1728 }
1729 
1730 
1731 /* NOTE - This generates an incomplete ACE file - the data structure we currently use
1732  * does not provide enough data to create a complete ACE file.
1733  */
WriteACEFile(FILE * fp,TACEFilePtr afp)1734 extern void WriteACEFile (FILE *fp, TACEFilePtr afp)
1735 {
1736   int i, tot_reads = 0;
1737   if (fp == NULL || afp == NULL) return;
1738 
1739   for (i = 0; i < afp->num_contigs; i++) {
1740     tot_reads += afp->contigs[i]->num_reads;
1741   }
1742   fprintf (fp, "AS %d %d\n\n", afp->num_contigs, tot_reads);
1743 
1744   for (i = 0; i < afp->num_contigs; i++) {
1745     s_WriteContig (fp, afp->contigs[i]);
1746   }
1747 }
1748 
1749 
1750 /* This function generates a string that uses the FASTA+GAP method for specifying gaps
1751  * (dashes instead of asterisks)
1752  */
1753 static char *
s_AlignmentSeqFromContigSeq(char * contig_seq,int cons_start,int aln_len,int read_start,int read_stop)1754 s_AlignmentSeqFromContigSeq
1755 (char *contig_seq,
1756  int   cons_start,
1757  int   aln_len,
1758  int   read_start,
1759  int   read_stop)
1760 {
1761     char * aln_seq;
1762     char * cp;
1763     int  pos = 0, i;
1764 
1765     aln_seq = malloc (sizeof (char) * (aln_len + 1));
1766     /* pad start */
1767     for (i = 0; i < cons_start; i++) {
1768         aln_seq[pos] = '-';
1769         pos++;
1770     }
1771     cp = contig_seq;
1772     if (read_start > 1) {
1773         i = 1;
1774         while (*cp != 0 && i < read_start) {
1775             aln_seq[pos] = '-';
1776             pos++;
1777             cp++;
1778             i++;
1779         }
1780     }
1781     while (*cp != 0 && (read_stop < 1 || i < read_stop)) {
1782         if (*cp == '*') {
1783             aln_seq[pos] = '-';
1784         } else {
1785             aln_seq[pos] = *cp;
1786         }
1787         pos++;
1788         cp++;
1789         i++;
1790     }
1791     while (pos < aln_len) {
1792         aln_seq[pos] = '-';
1793         pos++;
1794     }
1795     aln_seq[pos] = 0;
1796     return aln_seq;
1797 }
1798 
1799 
s_FarPointerIdFromReadId(char * read_id)1800 static char * s_FarPointerIdFromReadId (char * read_id)
1801 {
1802     char * far_id = NULL;
1803     far_id = malloc (sizeof (char) * (strlen (read_id) + 1));
1804     strcpy (far_id, read_id);
1805     return far_id;
1806 }
1807 
1808 
1809 /* This function generates an intermediate data format suitable for generating
1810  * a SeqEntry with an alignment.
1811  */
AlignmentFileFromContig(TContigPtr contig)1812 extern TAlignmentFilePtr AlignmentFileFromContig (TContigPtr contig)
1813 {
1814     TAlignmentFilePtr aln;
1815     int               i, len;
1816     int               consensus_pad = 0, pad, end_pad = 0, aln_len;
1817 
1818     if (contig == NULL) return NULL;
1819 
1820     aln = AlignmentFileNew ();
1821     aln->num_sequences = 1 + contig->num_reads;
1822     aln->num_organisms = 0;
1823     aln->num_deflines = 0;
1824     aln->num_segments = 1;
1825     aln->ids = malloc (sizeof (char *) * aln->num_sequences);
1826     aln->sequences = malloc (sizeof (char *) * aln->num_sequences);
1827     aln->organisms = NULL;
1828     aln->deflines = NULL;
1829     aln->align_format_found = 1;
1830     /* calculate padding for consensus */
1831     for (i = 0; i < contig->num_reads; i++) {
1832         if (contig->reads[i]->cons_start < 0) {
1833             pad = 0 - contig->reads[i]->cons_start;
1834             if (consensus_pad < pad) {
1835                 consensus_pad = pad;
1836             }
1837         }
1838         len = contig->reads[i]->cons_start + strlen (contig->reads[i]->read_seq);
1839         if (len > contig->consensus_assem_len) {
1840             pad = len - contig->consensus_assem_len;
1841             if (pad > end_pad) {
1842                 end_pad = pad;
1843             }
1844         }
1845     }
1846     aln_len = consensus_pad + contig->consensus_assem_len + end_pad;
1847     /* seq for consensus */
1848     len = strlen (contig->consensus_id);
1849     aln->ids[0] = malloc (sizeof (char) * (len + 1));
1850     strcpy (aln->ids[0], contig->consensus_id);
1851     aln->sequences[0] = s_AlignmentSeqFromContigSeq (contig->consensus_seq,
1852                                                      consensus_pad,
1853                                                      aln_len, 0, 0);
1854     for (i = 1; i < aln->num_sequences; i++) {
1855         len = strlen (contig->reads[i - 1]->read_id);
1856         aln->ids[i] = s_FarPointerIdFromReadId (contig->reads[i - 1]->read_id);
1857         aln->sequences[i] = s_AlignmentSeqFromContigSeq (contig->reads[i - 1]->read_seq,
1858                                                          consensus_pad + contig->reads[i - 1]->cons_start,
1859                                                          aln_len,
1860                                                          contig->reads[i - 1]->read_assem_start,
1861                                                          contig->reads[i - 1]->read_assem_stop);
1862     }
1863     return aln;
1864 }
1865 
1866 
1867 /* The Trace Archive Gap String is a list of the number of nucleotides to skip before adding the next gap */
TraceArchiveGapStringFromACESequence(char * seq_str)1868 extern char * TraceArchiveGapStringFromACESequence (char *seq_str)
1869 {
1870     char *cp;
1871     char * gap_str = NULL;
1872     char * print_pos;
1873     int    len = 0, pos;
1874 
1875     if (seq_str == NULL) return NULL;
1876 
1877     /* first determine length of gap string */
1878     cp = seq_str;
1879     while (*cp != 0) {
1880         if (*cp == '*' || *cp == '-') {
1881             len++;
1882         }
1883         cp++;
1884     }
1885     len = 15 * len + 1;
1886     gap_str = malloc (sizeof (char) * len);
1887     cp = seq_str;
1888     print_pos = gap_str;
1889     pos = 0;
1890     while (*cp != 0) {
1891         if (*cp == '*' || *cp == '-') {
1892             sprintf (print_pos, "%d,", pos);
1893             print_pos += strlen (print_pos);
1894             pos = 0;
1895         } else {
1896             pos++;
1897         }
1898         cp++;
1899     }
1900     /* trim final comma */
1901     print_pos[strlen(print_pos) - 1] = 0;
1902     return gap_str;
1903 }
1904 
1905 
1906 /* NOTE - These functions are currently incomplete */
WriteTraceArchiveRead(FILE * fp,TContigReadPtr read)1907 extern void WriteTraceArchiveRead (FILE *fp, TContigReadPtr read)
1908 {
1909     char *cp;
1910     if (fp == NULL || read == NULL) {
1911         return;
1912     }
1913 
1914     fprintf (fp, "<trace>\n");
1915     fprintf (fp, "<trace_name>%s</trace_name>\n", read->read_id);
1916     fprintf (fp, "<traceconsensus>");
1917     cp = read->read_seq;
1918     while (*cp != 0) {
1919         if (*cp != '*') {
1920             fprintf (fp, "%c", *cp);
1921         }
1922         cp++;
1923     }
1924     fprintf (fp, "</traceconsensus>\n");
1925     cp = TraceArchiveGapStringFromACESequence (read->read_seq);
1926     fprintf (fp, "<tracegaps>%s</tracegaps>\n", cp);
1927     free (cp);
1928     fprintf (fp, "</trace>\n");
1929 }
1930 
1931 
s_GetTokenLen(char * str)1932 static int s_GetTokenLen (char *str)
1933 {
1934     char *cp;
1935     int   len = 0;
1936 
1937     if (str == NULL) return 0;
1938 
1939     cp = str;
1940     while (*cp != 0 && !isspace (*cp)) {
1941         len++;
1942         cp++;
1943     }
1944     return len;
1945 }
1946 
1947 
s_SkipTokens(char * str,int num_tokens)1948 static char * s_SkipTokens (char *str, int num_tokens)
1949 {
1950     char *cp;
1951     int   i;
1952 
1953     if (str == NULL || num_tokens < 0) return NULL;
1954 
1955     cp = str;
1956     /* skip leading whitespace */
1957     while (isspace (*cp)) {
1958         cp++;
1959     }
1960 
1961     for (i = 0; i < num_tokens && *cp != 0; i++) {
1962         /* skip token */
1963         while (*cp != 0 && !isspace (*cp)) {
1964             cp++;
1965         }
1966         /* skip trailing whitespace */
1967         while (isspace (*cp)) {
1968             cp++;
1969         }
1970     }
1971     if (*cp == 0) {
1972         return NULL;
1973     } else {
1974         return cp;
1975     }
1976 }
1977 
1978 
1979 /* for reading other formats */
1980 extern TContigReadPtr
ReadContigFromString(char * str,char ** consensus_id,int id_col,int seq_col,int contig_id_col,int strand_col,int start_col,int interpret_n_col)1981 ReadContigFromString
1982 (char *str,
1983  char **consensus_id,
1984  int    id_col,
1985  int    seq_col,
1986  int    contig_id_col,
1987  int    strand_col,
1988  int    start_col,
1989  int    interpret_n_col
1990  )
1991 {
1992     TContigReadPtr read = NULL;
1993     char *cp;
1994     int len, col_num = 1, n_is_gap = 0;
1995     int max_col;
1996 
1997     if (str == NULL) {
1998         return NULL;
1999     }
2000 
2001     max_col = id_col;
2002     if (seq_col > max_col) {
2003       max_col = seq_col;
2004     }
2005     if (contig_id_col > max_col) {
2006       max_col = contig_id_col;
2007     }
2008     if (strand_col > max_col) {
2009       max_col = strand_col;
2010     }
2011     if (start_col > max_col) {
2012       max_col = start_col;
2013     }
2014     if (interpret_n_col > max_col) {
2015       max_col = interpret_n_col;
2016     }
2017 
2018     read = ContigReadNew ();
2019 
2020     cp = str;
2021     len = s_GetTokenLen (cp);
2022     while (cp != NULL && *cp != 0 && col_num <= max_col) {
2023         if (id_col == col_num) {
2024             read->read_id = malloc (len + 1);
2025             strncpy (read->read_id, cp, len);
2026             read->read_id[len] = 0;
2027         } else if (seq_col == col_num) {
2028             read->read_seq = malloc (len + 1);
2029             strncpy (read->read_seq, cp, len);
2030             read->read_seq[len] = 0;
2031             read->read_len = len;
2032         } else if (contig_id_col == col_num) {
2033             if (consensus_id != NULL) {
2034                 *consensus_id = malloc (len + 1);
2035                 strncpy (*consensus_id, cp, len);
2036                 (*consensus_id)[len] = 0;
2037             }
2038         } else if (strand_col == col_num) {
2039             if (*cp == 'R' || *cp == '-') {
2040                 read->is_complement = 1;
2041             }
2042         } else if (start_col == col_num) {
2043             read->cons_start = atoi (cp);
2044         } else if (interpret_n_col == col_num) {
2045             if (*cp == 'I') {
2046                 n_is_gap = 1;
2047             }
2048         }
2049         /* advance to next token */
2050         col_num++;
2051         cp = s_SkipTokens (cp, 1);
2052         len = s_GetTokenLen (cp);
2053     }
2054 
2055     if (max_col > col_num) {
2056         ContigReadFree (read);
2057         read = NULL;
2058     } else {
2059         read->cons_stop = read->cons_start + read->read_len - 1;
2060         read->tiling_start = read->cons_start;
2061         read->tiling_stop = read->cons_stop;
2062         read->read_assem_start = 0;
2063         read->read_assem_stop = read->read_len - 1;
2064         read->read_start = 1;
2065         read->read_stop = read->read_len;
2066         if (n_is_gap) {
2067             /* adjust for gaps */
2068             read->gaps = GapInfoFromSequenceString (read->read_seq, 'N');
2069             if (read->gaps->num_gaps > 0) {
2070                 RemoveGapCharsFromSequenceString (read->read_seq, 'N');
2071                 read->read_stop -= read->gaps->num_gaps;
2072                 read->read_len -= read->gaps->num_gaps;
2073             }
2074         }
2075     }
2076 
2077     return read;
2078 }
2079 
2080 
ReadFromMAQString(char * str,char ** consensus_id)2081 extern TContigReadPtr ASSEMBLY_CALLBACK ReadFromMAQString (char *str, char **consensus_id)
2082 {
2083     TContigReadPtr read = NULL;
2084 
2085     read = ReadContigFromString (str, consensus_id, 1, 15, 2, 4, 3, 0);
2086     return read;
2087 }
2088 
2089 
ReadFromElandMostCompressed(char * str,char ** consensus_id)2090 extern TContigReadPtr ASSEMBLY_CALLBACK ReadFromElandMostCompressed (char *str, char **consensus_id)
2091 {
2092     TContigReadPtr read = NULL;
2093 
2094     read = ReadContigFromString (str, consensus_id, 0, 1, 0, 5, 4, 0);
2095     return read;
2096 }
2097 
2098 
ReadFromElandSanger(char * str,char ** consensus_id)2099 extern TContigReadPtr ASSEMBLY_CALLBACK ReadFromElandSanger (char *str, char **consensus_id)
2100 {
2101     TContigReadPtr read = NULL;
2102 
2103     read = ReadContigFromString (str, consensus_id, 1, 2, 4, 6, 5, 0);
2104     return read;
2105 }
2106 
2107 
ReadFromElandStandalone(char * str,char ** consensus_id)2108 extern TContigReadPtr ASSEMBLY_CALLBACK ReadFromElandStandalone (char *str, char **consensus_id)
2109 {
2110     TContigReadPtr read = NULL;
2111 
2112     read = ReadContigFromString (str, consensus_id, 1, 2, 7, 9, 8, 10);
2113     return read;
2114 }
2115 
2116 
2117 #define READ_BLOCK_SIZE 50
2118 
2119 typedef struct ReadList {
2120   TContigReadPtr reads[READ_BLOCK_SIZE];
2121   int         num_reads;
2122   struct ReadList * next;
2123 } SReadList, * TReadListPtr;
2124 
2125 
ReadListNew()2126 static TReadListPtr ReadListNew ()
2127 {
2128     TReadListPtr r;
2129 
2130     r = malloc (sizeof (SReadList));
2131     r->num_reads = 0;
2132     r->next = NULL;
2133     return r;
2134 }
2135 
2136 
ReadListFree(TReadListPtr r)2137 static TReadListPtr ReadListFree (TReadListPtr r)
2138 {
2139     TReadListPtr r_next;
2140     int          i;
2141 
2142     while (r != NULL) {
2143         for (i = 0; i < r->num_reads; i++) {
2144             ContigReadFree (r->reads[i]);
2145         }
2146         r_next = r;
2147         free (r);
2148         r = r_next;
2149     }
2150     return r;
2151 }
2152 
2153 
AddToReadList(TContigReadPtr read,TReadListPtr read_list)2154 static TReadListPtr AddToReadList (TContigReadPtr read, TReadListPtr read_list)
2155 {
2156     if (read_list == NULL) {
2157         read_list = ReadListNew();
2158     } else {
2159         while (read_list->next != NULL && read_list->num_reads == READ_BLOCK_SIZE) {
2160             read_list = read_list->next;
2161         }
2162         if (read_list->num_reads == READ_BLOCK_SIZE) {
2163             read_list->next = ReadListNew();
2164             read_list = read_list->next;
2165         }
2166     }
2167     read_list->reads[read_list->num_reads++] = read;
2168     return read_list;
2169 }
2170 
2171 
2172 typedef struct ConsensusReads {
2173   TContigPtr   contig;
2174   TReadListPtr read_list;
2175   TReadListPtr last_read;
2176   struct ConsensusReads * next;
2177 } SConsensusReads, * TConsensusReadsPtr;
2178 
2179 
ConsensusReadsNew(char * consensus_id)2180 static TConsensusReadsPtr ConsensusReadsNew (char *consensus_id)
2181 {
2182     TConsensusReadsPtr c;
2183 
2184     c = malloc (sizeof (SConsensusReads));
2185     c->contig = ContigNew ();
2186     if (consensus_id != NULL) {
2187         c->contig->consensus_id = malloc (strlen (consensus_id) + 1);
2188         strcpy (c->contig->consensus_id, consensus_id);
2189     }
2190 
2191     c->read_list = NULL;
2192     c->last_read = NULL;
2193     c->next = NULL;
2194     return c;
2195 }
2196 
2197 
ConsensusReadsFree(TConsensusReadsPtr c)2198 static TConsensusReadsPtr ConsensusReadsFree (TConsensusReadsPtr c)
2199 {
2200     TConsensusReadsPtr c_next;
2201 
2202     while (c != NULL) {
2203         c_next = c->next;
2204         ContigFree (c->contig);
2205         c->read_list = ReadListFree (c->read_list);
2206         free (c);
2207         c = c_next;
2208     }
2209     return c;
2210 }
2211 
2212 
AddReadToConsensusReads(TConsensusReadsPtr c,TContigReadPtr read)2213 static void AddReadToConsensusReads (TConsensusReadsPtr c, TContigReadPtr read)
2214 {
2215     if (c != NULL && read != NULL) {
2216         c->last_read = AddToReadList (read, c->read_list);
2217         if (c->read_list == NULL) {
2218             c->read_list = c->last_read;
2219         }
2220     }
2221 }
2222 
2223 #define CONSENSUS_BLOCK_SIZE 50
2224 
2225 typedef struct ConsensusReadsList {
2226   TConsensusReadsPtr contigs[CONSENSUS_BLOCK_SIZE];
2227   int                num_contigs;
2228   struct ConsensusReadsList * next;
2229 } SConsensusReadsList, * TConsensusReadsListPtr;
2230 
2231 
ConsensusReadsListNew()2232 static TConsensusReadsListPtr ConsensusReadsListNew ()
2233 {
2234     TConsensusReadsListPtr c;
2235 
2236     c = malloc (sizeof (SConsensusReadsList));
2237     c->num_contigs = 0;
2238     c->next = NULL;
2239     return c;
2240 }
2241 
2242 
ConsensusReadsListFree(TConsensusReadsListPtr c)2243 static TConsensusReadsListPtr ConsensusReadsListFree (TConsensusReadsListPtr c)
2244 {
2245     TConsensusReadsListPtr c_next;
2246     int                    i;
2247 
2248     while (c != NULL) {
2249         c_next = c->next;
2250         for (i = 0; i < c->num_contigs; i++) {
2251              c->contigs[i] = ConsensusReadsFree (c->contigs[i]);
2252         }
2253         free (c);
2254         c = c_next;
2255     }
2256     return c;
2257 }
2258 
2259 
FindConsensusIDInConsensusReadsList(TConsensusReadsListPtr c,char * consensus_id)2260 static TConsensusReadsPtr FindConsensusIDInConsensusReadsList (TConsensusReadsListPtr c, char *consensus_id)
2261 {
2262     int i;
2263     TConsensusReadsPtr r = NULL;
2264 
2265     if (consensus_id == NULL) {
2266         return NULL;
2267     }
2268     while (c != NULL && r == NULL)  {
2269         for (i = 0; i < c->num_contigs && r == NULL; i++) {
2270             if (c->contigs[i] != NULL
2271                 && c->contigs[i]->contig != NULL
2272                 && strcmp (c->contigs[i]->contig->consensus_id, consensus_id) == 0) {
2273                 r = c->contigs[i];
2274             }
2275         }
2276         c = c->next;
2277     }
2278     return r;
2279 }
2280 
2281 
2282 static TConsensusReadsListPtr
AddConsensusReadToConsensusReadsList(TConsensusReadsListPtr c,char * consensus_id,TContigReadPtr read)2283 AddConsensusReadToConsensusReadsList
2284 (TConsensusReadsListPtr c,
2285  char                 * consensus_id,
2286  TContigReadPtr         read)
2287 {
2288     TConsensusReadsPtr r = NULL;
2289 
2290     if (c == NULL) {
2291         c = ConsensusReadsListNew ();
2292         r = ConsensusReadsNew(consensus_id);
2293         c->contigs[c->num_contigs++] = r;
2294     } else {
2295         r = FindConsensusIDInConsensusReadsList (c, consensus_id);
2296         if (r == NULL) {
2297             while (c->next != NULL && c->num_contigs == CONSENSUS_BLOCK_SIZE) {
2298                 c = c->next;
2299             }
2300             if (c->num_contigs == CONSENSUS_BLOCK_SIZE) {
2301                 c->next = ConsensusReadsListNew ();
2302                 c = c->next;
2303             }
2304             r = ConsensusReadsNew(consensus_id);
2305             c->contigs[c->num_contigs++] = r;
2306         }
2307     }
2308     AddReadToConsensusReads (r, read);
2309 
2310     return c;
2311 }
2312 
2313 
MoveReadsToContigFromReadList(TContigPtr contig,TReadListPtr read_list)2314 static void MoveReadsToContigFromReadList (TContigPtr contig, TReadListPtr read_list)
2315 {
2316     TReadListPtr r;
2317     int          n = 0, i;
2318 
2319     if (contig == NULL) {
2320         return;
2321     }
2322 
2323     for (r = read_list; r != NULL; r = r->next) {
2324         n += r->num_reads;
2325     }
2326 
2327     contig->num_reads = n;
2328     contig->reads = malloc (contig->num_reads * sizeof (TContigReadPtr));
2329     n = 0;
2330 
2331     for (r = read_list; r != NULL; r = r->next) {
2332         for (i = 0; i < r->num_reads; i++) {
2333             contig->reads[n++] = r->reads[i];
2334             r->reads[i] = NULL;
2335         }
2336         r->num_reads = 0;
2337     }
2338 }
2339 
2340 
ACEFileFromConsensusReadsList(TConsensusReadsListPtr contig_list)2341 static TACEFilePtr ACEFileFromConsensusReadsList (TConsensusReadsListPtr contig_list)
2342 {
2343     TACEFilePtr afp = NULL;
2344     TConsensusReadsListPtr c;
2345     int                    i, n = 0;
2346 
2347     if (contig_list == NULL || contig_list->num_contigs == 0) {
2348         return NULL;
2349     }
2350 
2351     afp = ACEFileNew ();
2352     for (c = contig_list; c != NULL; c=c->next) {
2353         afp->num_contigs += c->num_contigs;
2354     }
2355 
2356     afp->contigs = malloc (afp->num_contigs * sizeof (TContigPtr));
2357     for (c = contig_list; c != NULL; c = c->next) {
2358         for (i = 0; i < c->num_contigs; i++) {
2359             MoveReadsToContigFromReadList (c->contigs[i]->contig, c->contigs[i]->read_list);
2360             afp->contigs[n++] = c->contigs[i]->contig;
2361             c->contigs[i]->contig = NULL;
2362             c->contigs[i]->last_read = NULL;
2363         }
2364         c->num_contigs = 0;
2365     }
2366     return afp;
2367 }
2368 
2369 
ReadAssemblyFile(FReadLineFunction readfunc,void * fileuserdata,FReadFromStringFunction makeread_func)2370 extern TACEFilePtr ReadAssemblyFile
2371 (FReadLineFunction    readfunc,      /* function for reading lines of
2372                                        * alignment file
2373                                        */
2374  void *               fileuserdata,  /* data to be passed back each time
2375                                        * readfunc is invoked
2376                                        */
2377  FReadFromStringFunction makeread_func) /* function to transform a string into a read */
2378 {
2379     TACEFilePtr afp = NULL;
2380     TContigReadPtr read;
2381     TConsensusReadsListPtr contig_list = NULL, contig_last = NULL;
2382     char *linestring;
2383     char *consensus_id = NULL;
2384 
2385     if (readfunc == NULL || makeread_func == NULL) {
2386         return NULL;
2387     }
2388     linestring = readfunc (fileuserdata);
2389 
2390     while (linestring != NULL  &&  linestring [0] != EOF) {
2391         /* get ContigRead */
2392         read = makeread_func (linestring, &consensus_id);
2393 
2394         /* group with other ContigReads from the same consensus_id */
2395         if (read != NULL && consensus_id != NULL) {
2396             contig_last = AddConsensusReadToConsensusReadsList (contig_last, consensus_id, read);
2397             if (contig_list == NULL) {
2398                 contig_list = contig_last;
2399             }
2400             read = NULL;
2401         }
2402         if (consensus_id != NULL) {
2403             free (consensus_id);
2404             consensus_id = NULL;
2405         }
2406         ContigReadFree (read);
2407         free (linestring);
2408         linestring = readfunc (fileuserdata);
2409     }
2410 
2411     afp = ACEFileFromConsensusReadsList (contig_list);
2412     contig_list = ConsensusReadsListFree (contig_list);
2413     return afp;
2414 }
2415 
2416 
ReadMAQFile(FReadLineFunction readfunc,void * fileuserdata)2417 extern TACEFilePtr ReadMAQFile
2418 (FReadLineFunction    readfunc,      /* function for reading lines of
2419                                        * alignment file
2420                                        */
2421  void *               fileuserdata)  /* data to be passed back each time
2422                                        * readfunc is invoked
2423                                        */
2424 {
2425     return ReadAssemblyFile (readfunc, fileuserdata, ReadFromMAQString);
2426 }
2427 
2428 
ReadElandStandaloneFile(FReadLineFunction readfunc,void * fileuserdata)2429 extern TACEFilePtr ReadElandStandaloneFile
2430 (FReadLineFunction    readfunc,      /* function for reading lines of
2431                                        * alignment file
2432                                        */
2433  void *               fileuserdata)  /* data to be passed back each time
2434                                        * readfunc is invoked
2435                                        */
2436 {
2437     return ReadAssemblyFile (readfunc, fileuserdata, ReadFromElandStandalone);
2438 }
2439 
2440 
2441 /* functions for writing out XML */
WriteTraceGapsXML(TGapInfoPtr gap_info,FILE * fp)2442 static void WriteTraceGapsXML (TGapInfoPtr gap_info, FILE *fp)
2443 {
2444   int i;
2445 
2446   if (gap_info != NULL && fp != NULL) {
2447     fprintf (fp, "    <ntracegaps>%d</ntracegaps>\n", gap_info->num_gaps);
2448     if (gap_info->num_gaps > 0) {
2449       fprintf (fp, "    <tracegaps source=\"INLINE\">");
2450       for (i = 0; i < gap_info->num_gaps - 1; i++) {
2451         fprintf (fp, "%d ", gap_info->gap_offsets[i]);
2452       }
2453       fprintf (fp, "%d</tracegaps>\n", gap_info->gap_offsets[gap_info->num_gaps - 1]);
2454     } else {
2455       fprintf (fp, "    <tracegaps source=\"INLINE\"> </tracegaps>\n");
2456     }
2457   }
2458 }
2459 
2460 
WriteTraceReadXML(TContigReadPtr read,FILE * fp)2461 static void WriteTraceReadXML (TContigReadPtr read, FILE *fp)
2462 {
2463   if (read != NULL && fp != NULL) {
2464     fprintf (fp, "<trace>\n");
2465     if (read->ti > 0) {
2466       fprintf (fp, "  <ti>%d</ti>\n", read->ti);
2467     }
2468     if (read->srr != NULL) {
2469       fprintf (fp, "  <srr>%s</srr>\n", read->srr);
2470     }
2471     if (read->read_id != NULL) {
2472       fprintf (fp, "  <trace_name>%s</trace_name>\n", read->read_id);
2473     }
2474     fprintf (fp, "  <nbasecalls>%d</nbasecalls>\n", read->read_len);
2475     fprintf (fp, "  <valid>\n");
2476     fprintf (fp, "    <start>%d</start>\n", read->read_start);
2477     fprintf (fp, "    <stop>%d</stop>\n", read->read_stop);
2478     fprintf (fp, "  </valid>\n");
2479     fprintf (fp, "  <tiling direction = \"%s\">\n", read->is_complement ? "REVERSE" : "FORWARD");
2480     fprintf (fp, "    <start>%d</start>\n", read->tiling_start);
2481     fprintf (fp, "    <stop>%d</stop>\n", read->tiling_stop);
2482     fprintf (fp, "  </tiling>\n");
2483     fprintf (fp, "  <traceconsensus>\n");
2484     fprintf (fp, "    <start>%d</start>\n", read->cons_start);
2485     fprintf (fp, "    <stop>%d</stop>\n", read->cons_stop);
2486     fprintf (fp, "  </traceconsensus>\n");
2487     WriteTraceGapsXML (read->gaps, fp);
2488     fprintf (fp, "</trace>\n");
2489   }
2490 }
2491 
2492 
WriteTraceAssemblyFromContig(TContigPtr contig,FILE * fp)2493 extern void WriteTraceAssemblyFromContig (TContigPtr contig, FILE *fp)
2494 {
2495   int i;
2496 
2497   if (contig == NULL || fp == NULL) return;
2498 
2499   /* NOTE - need to add new field to TContigPtr for submitter reference, where orig ID should move to */
2500   fprintf (fp, "  <contig submitter_reference=\"%s\" conformation=\"LINEAR\" type=\"NEW\">\n",
2501            contig->consensus_id == NULL ? "not supplied" : contig->consensus_id);
2502 
2503   fprintf (fp, "    <ntraces>%d</ntraces>\n", contig->num_reads);
2504 
2505   fprintf (fp, "    <nconbases>%d</nconbases>\n", contig->consensus_seq_len);
2506 
2507   /* need nbasecalls */
2508 
2509   if (contig->gaps == NULL) {
2510     fprintf (fp, "    <ncongaps>0</ncongaps>\n");
2511   } else {
2512     fprintf (fp, "    <ncongaps>%d</ncongaps>\n", contig->gaps->num_gaps);
2513     if (contig->gaps->num_gaps > 0) {
2514       fprintf (fp, "  <congaps source=\"INLINE\">");
2515       for (i = 0; i < contig->gaps->num_gaps - 1; i++) {
2516         fprintf (fp, "%d ", contig->gaps->gap_offsets[i]);
2517       }
2518       fprintf (fp, "%d</congaps>\n", contig->gaps->gap_offsets[contig->gaps->num_gaps - 1]);
2519     }
2520   }
2521   fprintf (fp, "    <consensus>%s</consensus>\n",
2522            contig->consensus_id == NULL ? "not supplied" : contig->consensus_id);
2523   if (contig->num_qual_scores > 0) {
2524     fprintf (fp, "    <conqualities source=\"INLINE\">");
2525     for (i = 0; i < contig->num_qual_scores; i++) {
2526       fprintf (fp, "%d ", contig->qual_scores[i]);
2527     }
2528     fprintf (fp, "</conqualities>\n");
2529   }
2530 
2531   for (i = 0; i < contig->num_reads; i++) {
2532     WriteTraceReadXML (contig->reads[i], fp);
2533   }
2534   fprintf (fp, "  </contig>\n");
2535 }
2536 
2537 
2538 extern void
WriteTraceAssemblyHeader(char * assembly_type,char * subref,char * center_name,int taxid,char * description,char * assembly,int num_contigs,unsigned int num_conbases,int num_reads,unsigned int num_readbases,FILE * fp)2539 WriteTraceAssemblyHeader
2540 (char * assembly_type,
2541  char * subref,
2542  char * center_name,
2543  int    taxid,
2544  char * description,
2545  char * assembly,
2546  int    num_contigs,
2547  unsigned int    num_conbases,
2548  int    num_reads,
2549  unsigned int    num_readbases,
2550  FILE * fp)
2551 {
2552     if (fp == NULL) {
2553         return;
2554     }
2555 
2556     fprintf (fp, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
2557 
2558     fprintf (fp, "<assembly submitter_reference=\"%s\" type = \"%s\">\n",
2559                  subref == NULL ? "Not supplied" : subref,
2560                  assembly_type == NULL ? "NEW" : assembly_type);
2561     fprintf (fp, "  <center_name>%s</center_name>\n", center_name == NULL ? "Not supplied" : center_name);\
2562     fprintf (fp, "  <organism descriptor=\"TAXID\">%d</organism>\n", taxid);
2563     fprintf (fp, "  <description>%s</description>\n", description == NULL ? "Not supplied" : description);
2564     fprintf (fp, "  <structure>%s</structure>\n", assembly == NULL ? "transcript assembly" : assembly);
2565     fprintf (fp, "  <ncontigs>%d</ncontigs>\n", num_contigs);
2566     fprintf (fp, "  <nconbases>%u</nconbases>\n", num_conbases);
2567     fprintf (fp, "  <ntraces>%d</ntraces>\n", num_reads);
2568     fprintf (fp, "  <nbasecalls>%u</nbasecalls>\n", num_readbases);
2569     fprintf (fp, "  <coverage>%f</coverage>\n", num_conbases == 0 ? 0 : (float) ((float) num_readbases/ (float) num_conbases));
2570 }
2571 
2572 
WriteTraceAssemblyTrailer(FILE * fp)2573 extern void WriteTraceAssemblyTrailer (FILE *fp)
2574 {
2575     if (fp == NULL) {
2576         return;
2577     }
2578     fprintf (fp, "</assembly>\n");
2579 }
2580 
2581 
2582 extern void
WriteTraceAssemblyFromAceFile(TACEFilePtr afp,char * subref,char * center_name,int taxid,char * description,FILE * fp)2583 WriteTraceAssemblyFromAceFile
2584 (TACEFilePtr afp,
2585  char *      subref,
2586  char *      center_name,
2587  int         taxid,
2588  char *      description,
2589  FILE        *fp)
2590 {
2591   int i, j, traces = 0;
2592   unsigned int conbases = 0, basecalls = 0;
2593 
2594   if (afp == NULL || fp == NULL) return;
2595 
2596 
2597   for (i = 0; i < afp->num_contigs; i++) {
2598     conbases += afp->contigs[i]->consensus_seq_len;
2599     traces += afp->contigs[i]->num_reads;
2600     for (j = 0; j < afp->contigs[i]->num_reads; j++) {
2601       basecalls += afp->contigs[i]->reads[j]->read_len;
2602     }
2603   }
2604   WriteTraceAssemblyHeader (NULL, subref, center_name, taxid, description, NULL, afp->num_contigs, conbases, traces, basecalls, fp);
2605 
2606   for (i = 0; i < afp->num_contigs; i++) {
2607     WriteTraceAssemblyFromContig (afp->contigs[i], fp);
2608   }
2609   WriteTraceAssemblyTrailer (fp);
2610 }
2611 
2612 
WriteFASTAFromContig(TContigPtr contig,FILE * fp)2613 extern void WriteFASTAFromContig
2614 (TContigPtr contig,
2615  FILE       *fp)
2616 {
2617     int   k;
2618     char *cp;
2619 
2620     if (contig == NULL || fp == NULL) return;
2621 
2622     fprintf (fp, ">%s\n", contig->consensus_id);
2623     cp = contig->consensus_seq;
2624     while (*cp != 0) {
2625         k = 0;
2626         while (k < 40 && *cp != 0) {
2627             if (*cp != '*') {
2628                 fprintf (fp, "%c", *cp);
2629                 k++;
2630             }
2631             cp++;
2632         }
2633         fprintf (fp, "\n");
2634     }
2635     fprintf (fp, "\n");
2636 }
2637 
2638 
2639 extern void
WriteFASTAFromAceFile(TACEFilePtr afp,FILE * fp)2640 WriteFASTAFromAceFile
2641 (TACEFilePtr afp,
2642  FILE        *fp)
2643 {
2644   int i;
2645 
2646   if (afp == NULL || fp == NULL) return;
2647 
2648   for (i = 0; i < afp->num_contigs; i++) {
2649     WriteFASTAFromContig (afp->contigs[i], fp);
2650   }
2651 }
2652 
2653 
2654 #define kFASTASeqBufSize 100
2655 
2656 typedef struct fastaseqbuf {
2657   char buf[kFASTASeqBufSize];
2658   int  num_used;
2659   struct fastaseqbuf *next;
2660 } SFASTASeqBuf, * TFASTASeqBufPtr;
2661 
2662 
s_FASTASeqBufNew()2663 static TFASTASeqBufPtr s_FASTASeqBufNew ()
2664 {
2665     TFASTASeqBufPtr s;
2666 
2667     s = (TFASTASeqBufPtr) malloc (sizeof (SFASTASeqBuf));
2668     if (s != NULL) {
2669         s->num_used = 0;
2670         s->next = NULL;
2671     }
2672     return s;
2673 }
2674 
2675 
s_FASTASeqBufFree(TFASTASeqBufPtr s)2676 static void s_FASTASeqBufFree (TFASTASeqBufPtr s)
2677 {
2678     TFASTASeqBufPtr s_next;
2679 
2680     while (s != NULL) {
2681         s_next = s->next;
2682         free (s);
2683         s = s_next;
2684     }
2685 }
2686 
2687 
s_AddFASTAToBuf(char * line,TFASTASeqBufPtr buf)2688 static TFASTASeqBufPtr s_AddFASTAToBuf (char *line, TFASTASeqBufPtr buf)
2689 {
2690     TFASTASeqBufPtr last_buf;
2691     char *cp;
2692 
2693     if (buf == NULL) {
2694         buf = s_FASTASeqBufNew();
2695         last_buf = buf;
2696     } else {
2697         last_buf = buf;
2698         while (last_buf->next != NULL) {
2699             last_buf = last_buf->next;
2700         }
2701     }
2702 
2703     cp = line;
2704     while (*cp != 0 && *cp != '\r' && *cp != '\n') {
2705         while (isspace (*cp)) {
2706             cp++;
2707         }
2708         if (*cp != 0) {
2709             if (!isalpha (*cp)) {
2710                 printf ("Found bad character in FASTA file!\n");
2711                 s_FASTASeqBufFree (buf);
2712                 buf = NULL;
2713                 return buf;
2714             }
2715             if (last_buf->num_used == kFASTASeqBufSize) {
2716                 last_buf->next = s_FASTASeqBufNew ();
2717                 last_buf = last_buf->next;
2718             }
2719             last_buf->buf[last_buf->num_used++] = *cp;
2720             cp++;
2721         }
2722     }
2723     return buf;
2724 }
2725 
2726 
s_StripStars(char * str)2727 static char * s_StripStars (char *str)
2728 {
2729     char *cp_src;
2730     char *cp_dst;
2731     char *stripped;
2732 
2733     if (str == NULL) {
2734         return 0;
2735     }
2736     cp_src = str;
2737     stripped = (char *) malloc (sizeof (char) * (strlen (str) + 1));
2738     cp_dst = stripped;
2739     while (*cp_src != 0) {
2740         if (*cp_src != '*') {
2741             *cp_dst = *cp_src;
2742             cp_dst++;
2743         }
2744         cp_src++;
2745     }
2746     *cp_dst = 0;
2747     return stripped;
2748 }
2749 
2750 
s_DoesFASTAMatchSeq(TFASTASeqBufPtr buf,char * trimmed_seq)2751 static int s_DoesFASTAMatchSeq (TFASTASeqBufPtr buf, char *trimmed_seq)
2752 {
2753     int does_match = 1, match_len, seq_len;
2754 
2755     if (buf == NULL || trimmed_seq == NULL || *trimmed_seq == 0) {
2756         return 1;
2757     }
2758 
2759     seq_len = strlen (trimmed_seq);
2760     while (buf != NULL && seq_len > 0 && does_match) {
2761         if (seq_len < buf->num_used) {
2762             match_len = seq_len;
2763         } else {
2764             match_len = buf->num_used;
2765         }
2766         if (strncmp (buf->buf, trimmed_seq, match_len) == 0) {
2767             buf = buf->next;
2768             trimmed_seq += match_len;
2769             seq_len -= match_len;
2770         } else {
2771             does_match = 0;
2772         }
2773     }
2774     return does_match;
2775 }
2776 
2777 
s_CompLetter(char ch)2778 static char s_CompLetter (char ch)
2779 {
2780     switch (ch) {
2781         case 'A':
2782             ch = 'T';
2783             break;
2784         case 'T':
2785             ch = 'A';
2786             break;
2787         case 'G':
2788             ch = 'C';
2789             break;
2790         case 'C':
2791             ch = 'G';
2792             break;
2793     }
2794     return ch;
2795 }
2796 
2797 
s_RevCompSequence(char * seq)2798 static void s_RevCompSequence (char *seq)
2799 {
2800     char tmp;
2801     int len, i;
2802 
2803     if (seq == NULL || *seq == 0) {
2804         return;
2805     }
2806     len = strlen (seq);
2807 
2808     for (i = 0; i < len / 2; i++) {
2809         tmp = seq[i];
2810         seq[i] = s_CompLetter (seq[len - i - 1]);
2811         seq[len - i - 1] = s_CompLetter (tmp);
2812     }
2813     if (len %2 > 0) {
2814         seq[i] = s_CompLetter (seq[i]);
2815     }
2816 }
2817 
2818 
s_GetSequenceOffset(TFASTASeqBufPtr buf,char * trimmed_seq,char is_complement)2819 static int s_GetSequenceOffset (TFASTASeqBufPtr buf, char *trimmed_seq, char is_complement)
2820 {
2821     int offset = 0, buf_offset;
2822     int match_found = 0;
2823     int match_len, seq_len;
2824 
2825     if (buf == NULL || trimmed_seq == NULL) {
2826         return 0;
2827     }
2828 
2829     trimmed_seq = s_StripStars (trimmed_seq);
2830 
2831     if (is_complement) {
2832         s_RevCompSequence (trimmed_seq);
2833     }
2834 
2835     seq_len = strlen (trimmed_seq);
2836 
2837     while (buf != NULL && !match_found) {
2838         buf_offset = 0;
2839         while (buf_offset < buf->num_used && !match_found) {
2840             if (buf->num_used - buf_offset < seq_len) {
2841                 match_len = buf->num_used - buf_offset;
2842             } else {
2843                 match_len = seq_len;
2844             }
2845             if (match_len < seq_len && buf->next == NULL) {
2846                 /* ran out of sequence, no match */
2847                 buf_offset = buf->num_used;
2848             } else if (strncmp (buf->buf + buf_offset, trimmed_seq, match_len) == 0
2849                 && s_DoesFASTAMatchSeq (buf->next, trimmed_seq + match_len)) {
2850                 match_found = 1;
2851             } else {
2852                 buf_offset++;
2853                 offset++;
2854             }
2855         }
2856         if (!match_found) {
2857             buf = buf->next;
2858         }
2859     }
2860     free (trimmed_seq);
2861     if (match_found) {
2862         return offset;
2863     } else {
2864         return -1;
2865     }
2866 }
2867 
2868 
2869 #define kSeqListBufSize 100
2870 
2871 typedef struct seqlistbuf {
2872   TFASTASeqBufPtr buf[kSeqListBufSize];
2873   char * id_list[kSeqListBufSize];
2874   int  num_used;
2875   struct seqlistbuf *next;
2876 } SSeqListBuf, * TSeqListBufPtr;
2877 
2878 
s_SeqListBufNew()2879 static TSeqListBufPtr s_SeqListBufNew ()
2880 {
2881     TSeqListBufPtr s;
2882 
2883     s = (TSeqListBufPtr) malloc (sizeof (SSeqListBuf));
2884     if (s != NULL) {
2885         s->num_used = 0;
2886         s->next = NULL;
2887     }
2888     return s;
2889 }
2890 
2891 
s_SeqListBufFree(TSeqListBufPtr s)2892 static void s_SeqListBufFree (TSeqListBufPtr s)
2893 {
2894     TSeqListBufPtr s_next;
2895     int i;
2896 
2897     while (s != NULL) {
2898         s_next = s->next;
2899         for (i = 0; i < s->num_used; i++) {
2900             s_FASTASeqBufFree (s->buf[i]);
2901             free (s->id_list[i]);
2902             s->buf[i] = NULL;
2903         }
2904         free (s);
2905         s = s_next;
2906     }
2907 }
2908 
2909 
s_GetFASTAIdFromString(char * str)2910 static char * s_GetFASTAIdFromString (char * str)
2911 {
2912     char * cp;
2913     char * id;
2914     int    len;
2915 
2916     if (str == NULL) {
2917         return NULL;
2918     }
2919 
2920     cp = str;
2921     cp += strspn (str, " >\t");
2922     len = strcspn (cp, " \t\r\n");
2923     if (len == 0) {
2924         return NULL;
2925     }
2926     id = (char *)malloc (len + 1);
2927     if (id == NULL) {
2928         return NULL;
2929     }
2930     strncpy (id, cp, len);
2931     id [ len ] = 0;
2932     return id;
2933 }
2934 
2935 
s_AddToSeqList(char * line,TSeqListBufPtr buf)2936 static TSeqListBufPtr s_AddToSeqList (char *line, TSeqListBufPtr buf)
2937 {
2938     TSeqListBufPtr last_buf;
2939 
2940     if (line == NULL) {
2941         return buf;
2942     }
2943     if (buf == NULL) {
2944         buf = s_SeqListBufNew();
2945         last_buf = buf;
2946     } else {
2947         last_buf = buf;
2948         while (last_buf->next != NULL) {
2949             last_buf = last_buf->next;
2950         }
2951     }
2952 
2953     if (*line == '>') {
2954         if (last_buf->num_used == kSeqListBufSize) {
2955             last_buf->next = s_SeqListBufNew();
2956             last_buf = last_buf->next;
2957         }
2958         last_buf->buf[last_buf->num_used] = s_FASTASeqBufNew ();
2959         last_buf->id_list[last_buf->num_used] = s_GetFASTAIdFromString (line);
2960         last_buf->num_used++;
2961     } else if (last_buf->num_used > 0) {
2962         last_buf->buf[last_buf->num_used - 1] = s_AddFASTAToBuf (line, last_buf->buf[last_buf->num_used - 1]);
2963     }
2964 
2965     return buf;
2966 }
2967 
2968 
s_GetFastaSeq(TSeqListBufPtr buf,char * id)2969 static TFASTASeqBufPtr s_GetFastaSeq (TSeqListBufPtr buf, char *id)
2970 {
2971     TFASTASeqBufPtr seq = NULL;
2972     char *cp;
2973     int i, match_len;
2974 
2975     if (id == NULL) {
2976         return NULL;
2977     }
2978 
2979     cp = strchr (id, '.');
2980     if (cp == NULL) {
2981         match_len = strlen (id);
2982     } else {
2983         match_len = cp - id;
2984     }
2985 
2986     while (buf != NULL && seq == NULL) {
2987         for (i = 0; i < buf->num_used && seq == NULL; i++) {
2988             if (strncmp (id, buf->id_list[i], match_len) == 0) {
2989                 seq = buf->buf[i];
2990             }
2991         }
2992         buf = buf->next;
2993     }
2994     return seq;
2995 }
2996 
2997 
s_ReadFastaFile(FReadLineFunction readfunc,void * userdata)2998 static TSeqListBufPtr s_ReadFastaFile (FReadLineFunction readfunc, void * userdata)
2999 {
3000     char *linestring;
3001     TSeqListBufPtr fasta = NULL;
3002 
3003     if (readfunc == NULL) {
3004         return NULL;
3005     }
3006     linestring = readfunc (userdata);
3007     while (!s_IsEOF (linestring)) {
3008         fasta = s_AddToSeqList (linestring, fasta);
3009         free (linestring);
3010         linestring = readfunc (userdata);
3011     }
3012     return fasta;
3013 }
3014 
3015 
3016 
3017 
3018 #define kQualScoreBufSize 100
3019 
3020 typedef struct qualscorelist {
3021   int scores[kQualScoreBufSize];
3022   int num_used;
3023   struct qualscorelist *next;
3024 } SQualScoreList, * TQualScoreListPtr;
3025 
3026 
s_QualScoreNew()3027 static TQualScoreListPtr s_QualScoreNew ()
3028 {
3029     TQualScoreListPtr s;
3030 
3031     s =(TQualScoreListPtr) malloc (sizeof (SQualScoreList));
3032     if (s != NULL) {
3033         s->num_used = 0;
3034         s->next = NULL;
3035     }
3036     return s;
3037 }
3038 
3039 
s_QualScoreFree(TQualScoreListPtr s)3040 static void s_QualScoreFree (TQualScoreListPtr s)
3041 {
3042     TQualScoreListPtr s_next;
3043 
3044     while (s != NULL) {
3045         s_next = s->next;
3046         free (s);
3047         s = s_next;
3048     }
3049 }
3050 
3051 
s_AddQualScores(char * line,TQualScoreListPtr scores)3052 static TQualScoreListPtr s_AddQualScores (char *line, TQualScoreListPtr scores)
3053 {
3054     TQualScoreListPtr last_score;
3055     char *cp;
3056 
3057     if (scores == NULL) {
3058         scores = s_QualScoreNew();
3059         last_score = scores;
3060     } else {
3061         last_score = scores;
3062         while (last_score->next != NULL) {
3063             last_score = last_score->next;
3064         }
3065     }
3066 
3067     cp = line;
3068     while (*cp != 0 && *cp != '\r' && *cp != '\n') {
3069         while (isspace (*cp)) {
3070             cp++;
3071         }
3072         if (*cp != 0) {
3073             if (!isdigit (*cp)) {
3074                 printf ("Found bad character in quality scores file!\n");
3075                 s_QualScoreFree (scores);
3076                 scores = NULL;
3077                 return scores;
3078             }
3079             if (last_score->num_used == kQualScoreBufSize) {
3080                 last_score->next = s_QualScoreNew ();
3081                 last_score = last_score->next;
3082             }
3083             last_score->scores[last_score->num_used++] = atoi (cp);
3084             while (isdigit (*cp)) {
3085                 cp++;
3086             }
3087         }
3088     }
3089     return scores;
3090 }
3091 
3092 
s_AddScoresToRead(TContigReadPtr read,TQualScoreListPtr scores,TSeqListBufPtr fasta)3093 static int s_AddScoresToRead (TContigReadPtr read, TQualScoreListPtr scores, TSeqListBufPtr fasta)
3094 {
3095     int score_pos = 0;
3096     int offset = 0;
3097     int skip, score_len;
3098     char *cp;
3099     int *dst;
3100     TFASTASeqBufPtr fasta_seq;
3101 
3102     if (read == NULL || scores == NULL) {
3103         return 0;
3104     }
3105 
3106     if (fasta == NULL) {
3107         skip = read->read_start - 1;
3108     } else {
3109         fasta_seq = s_GetFastaSeq (fasta, read->read_id);
3110         if (fasta_seq == NULL) {
3111             printf ("Unable to locate fasta for %s\n", read->read_id);
3112             return 0;
3113         }
3114 
3115         skip = s_GetSequenceOffset (fasta_seq, read->read_seq, read->is_complement);
3116         if (skip < 0) {
3117             printf ("ACE read did not match FASTA read for %s\n", read->read_id);
3118             return 0;
3119         }
3120     }
3121 
3122     /* skip over scores before part used in assembly */
3123     while (scores != NULL && score_pos < skip) {
3124         if (skip - score_pos < scores->num_used) {
3125             offset = skip - score_pos;
3126             score_pos = skip;
3127         } else if (scores->next == NULL) {
3128             printf ("Not enough scores read for %s\n", read->read_id);
3129             return 0;
3130         } else {
3131             score_pos += kQualScoreBufSize;
3132             scores = scores->next;
3133         }
3134     }
3135 
3136     score_len = strlen (read->read_seq);
3137     read->qual_scores = malloc (sizeof (int) * score_len);
3138 
3139     if (read->is_complement) {
3140         /* need to read scores in reverse direction */
3141         cp = read->read_seq + score_len - 1;
3142         dst = read->qual_scores + score_len - 1;
3143         while (scores != NULL && read->num_qual_scores < score_len) {
3144             if (*cp == '*') {
3145                 *dst = 0;
3146             } else {
3147                 *dst = scores->scores[offset];
3148                 offset++;
3149             }
3150             cp--;
3151             dst--;
3152             read->num_qual_scores++;
3153 
3154             if (offset == kQualScoreBufSize) {
3155                 scores = scores->next;
3156                 offset = 0;
3157             }
3158         }
3159     } else {
3160         cp = read->read_seq;
3161         dst = read->qual_scores;
3162         while (scores != NULL && read->num_qual_scores < score_len) {
3163             if (*cp == '*') {
3164                 *dst = 0;
3165             } else {
3166                 *dst = scores->scores[offset];
3167                 offset++;
3168             }
3169             cp++;
3170             dst++;
3171             read->num_qual_scores++;
3172 
3173             if (offset == kQualScoreBufSize) {
3174                 scores = scores->next;
3175                 offset = 0;
3176             }
3177         }
3178     }
3179     if (read->num_qual_scores == score_len) {
3180         return 1;
3181     } else {
3182         printf ("Not enough qual scores for %s\n", read->read_id);
3183         return 0;
3184     }
3185 }
3186 
3187 
s_AddQualScoresToReadsInAceFile(TACEFilePtr afp,char * id,TQualScoreListPtr scores,TSeqListBufPtr fasta)3188 static int s_AddQualScoresToReadsInAceFile (TACEFilePtr afp, char *id, TQualScoreListPtr scores, TSeqListBufPtr fasta)
3189 {
3190     int i, j, found = 0, match_len, rval = 1;
3191     char *cp;
3192 
3193     if (afp == NULL || id == NULL) {
3194         return 0;
3195     }
3196 
3197     cp = strchr (id, '.');
3198     if (cp == NULL) {
3199         match_len = strlen (id);
3200     } else {
3201         match_len = cp - id;
3202     }
3203 
3204     for (i = 0; i < afp->num_contigs; i++) {
3205         for (j = 0; j < afp->contigs[i]->num_reads; j++) {
3206             if (strncmp (afp->contigs[i]->reads[j]->read_id, id, match_len) == 0) {
3207                 found = 1;
3208                 rval &= s_AddScoresToRead (afp->contigs[i]->reads[j], scores, fasta);
3209             }
3210         }
3211     }
3212     if (!found) {
3213         printf ("Unable to locate %s in ACE file\n", id);
3214         rval = 0;
3215     }
3216     return rval;
3217 }
3218 
3219 
3220 extern int
AddReadQualScores(TACEFilePtr afp,FReadLineFunction readfunc,void * userdata,FReadLineFunction fasta_readfunc,void * fasta_userdata)3221 AddReadQualScores
3222 (TACEFilePtr          afp,
3223  FReadLineFunction    readfunc,
3224  void *               userdata,
3225  FReadLineFunction    fasta_readfunc,
3226  void *               fasta_userdata)
3227 {
3228     char *linestring;
3229     char *score_id = NULL;
3230     TQualScoreListPtr scores = NULL;
3231     TSeqListBufPtr    fasta = NULL;
3232     int               rval = 1;
3233 
3234     if (afp == NULL || readfunc == NULL) {
3235         return 0;
3236     }
3237 
3238     if (fasta_readfunc != NULL) {
3239         fasta = s_ReadFastaFile (fasta_readfunc, fasta_userdata);
3240         if (fasta == NULL) {
3241             printf ("Unable to read FASTA file\n");
3242             return 0;
3243         }
3244     }
3245 
3246     linestring = readfunc (userdata);
3247     while (!s_IsEOF (linestring)) {
3248         if (linestring[0] == '>') {
3249             if (score_id != NULL && scores != NULL) {
3250                 /* add previously read scores to last read */
3251                 if (s_AddQualScoresToReadsInAceFile (afp, score_id, scores, fasta) == 0) {
3252                     printf ("Failed to add quality scores from %s\n", score_id);
3253                     rval = 0;
3254                 }
3255             }
3256             s_QualScoreFree (scores);
3257             scores = NULL;
3258             free (score_id);
3259 
3260             score_id = s_GetFASTAIdFromString (linestring);
3261         } else if (score_id != NULL) {
3262             scores = s_AddQualScores (linestring, scores);
3263         }
3264         free (linestring);
3265         linestring = readfunc (userdata);
3266     }
3267 
3268     /* handle last set of scores read */
3269     if (score_id != NULL && scores != NULL) {
3270         /* add previously read scores to last read */
3271         if (s_AddQualScoresToReadsInAceFile (afp, score_id, scores, fasta) == 0) {
3272             printf ("Failed to add quality scores from %s\n", score_id);
3273             rval = 0;
3274         }
3275     }
3276     s_SeqListBufFree (fasta);
3277     s_QualScoreFree (scores);
3278     scores = NULL;
3279     free (score_id);
3280     return rval;
3281 }
3282 
3283 
s_LetterPos(char ch)3284 static int s_LetterPos (char ch) {
3285     int rval = -1;
3286 
3287     switch (ch) {
3288         case 'A':
3289             rval = 0;
3290             break;
3291         case 'T':
3292             rval = 1;
3293             break;
3294         case 'G':
3295             rval = 2;
3296             break;
3297         case 'C':
3298             rval = 3;
3299             break;
3300         case '*':
3301             rval = 4;
3302             break;
3303     }
3304     return rval;
3305 }
3306 
3307 
s_GetUngappedPosition(int gapped_pos,char * seq)3308 static int s_GetUngappedPosition (int gapped_pos, char *seq)
3309 {
3310     int pos = 0, ungapped_pos = 0;
3311     int gaps_found = 0;
3312     char *cp;
3313 
3314     cp = seq;
3315     while (*cp != 0 && pos < gapped_pos ) {
3316         if (*cp == '*') {
3317             gaps_found++;
3318         } else {
3319             ungapped_pos++;
3320         }
3321         cp++;
3322         pos++;
3323     }
3324     return ungapped_pos;
3325 }
3326 
3327 
s_GetQualScoreForReadPos(TContigReadPtr r,int pos)3328 static int s_GetQualScoreForReadPos (TContigReadPtr r, int pos)
3329 {
3330     if (r == NULL || pos < 0) {
3331         return 0;
3332     }
3333 
3334     /* note - don't need to get ungapped position because 0s are inserted when qual scores
3335      * are added to the reads.
3336      */
3337     /*pos = s_GetUngappedPosition (pos, r->read_seq); */
3338     if (pos > r->num_qual_scores) {
3339         return 0;
3340     } else {
3341         return r->qual_scores[pos];
3342     }
3343 }
3344 
3345 
ReplaceConsensusSequenceFromTraces(TContigPtr contig,char only_ns)3346 extern int ReplaceConsensusSequenceFromTraces (TContigPtr contig, char only_ns)
3347 {
3348     char * consensus_buf;
3349     int  * new_qual_scores = NULL;
3350     int    num_qual_scores = 0;
3351     int    i, k, best, letter_pos;
3352     int    char_counts[5];
3353     char   best_ch, ch;
3354     int    num_best, sum_best;
3355     int  * consensus_qual_ptr = NULL;
3356     int    read_offset, len;
3357     int    num_change = 0;
3358 
3359     if (contig == NULL) {
3360         return 0;
3361     }
3362 
3363     consensus_buf = (char *) malloc (sizeof (char) * (contig->consensus_assem_len + 1));
3364     if (contig->reads[0]->num_qual_scores > 0) {
3365         new_qual_scores = (int *) malloc (sizeof (int) * contig->consensus_seq_len);
3366     }
3367 
3368     consensus_qual_ptr = contig->qual_scores;
3369 
3370     for (i = 0; i < contig->consensus_assem_len; i++) {
3371         if (only_ns && contig->consensus_seq[i] != 'N') {
3372             /* just use existing consensus character */
3373             consensus_buf[i] = contig->consensus_seq[i];
3374             /* add in qual scores */
3375             if (consensus_qual_ptr != NULL && new_qual_scores != NULL && contig->consensus_seq[i] != '*') {
3376                 new_qual_scores[num_qual_scores++] = *consensus_qual_ptr;
3377             }
3378         } else {
3379             for (k = 0; k < 5; k++) {
3380                 char_counts[k] = 0;
3381             }
3382             best = 0;
3383             best_ch = 'N';
3384             for (k = 0; k < contig->num_reads; k++) {
3385                 read_offset = i - contig->reads[k]->cons_start;
3386                 len = strlen (contig->reads[k]->read_seq);
3387                 if (len > read_offset
3388                     && read_offset >= 0) {
3389                     ch = toupper (contig->reads[k]->read_seq[read_offset]);
3390                     letter_pos = s_LetterPos (ch);
3391                     if (letter_pos > -1) {
3392                       char_counts[letter_pos]++;
3393                       if (char_counts[letter_pos] > best
3394                           || (char_counts[letter_pos] == best && best_ch == '*')) {
3395                         best_ch = ch;
3396                         best = char_counts[letter_pos];
3397                       }
3398                     }
3399                 }
3400             }
3401             if (toupper (consensus_buf[i]) != best_ch) {
3402                 num_change++;
3403                 consensus_buf[i] = best_ch;
3404                 if (best_ch != '*') {
3405                     /* calculate quality score */
3406                     if (new_qual_scores != NULL) {
3407                         sum_best = 0;
3408                         num_best = 0;
3409                         for (k = 0; k < contig->num_reads; k++) {
3410                             if (contig->reads[k]->num_qual_scores > i - contig->reads[k]->cons_start
3411                                 &&  best_ch == toupper (contig->reads[k]->read_seq[i - contig->reads[k]->cons_start])) {
3412                                 num_best ++;
3413                                 sum_best += s_GetQualScoreForReadPos (contig->reads[k], i - contig->reads[k]->cons_start);
3414                             }
3415                         }
3416                         if (num_best == 0) {
3417                             new_qual_scores[num_qual_scores++] = 0;
3418                         } else {
3419                             new_qual_scores[num_qual_scores++] = sum_best / num_best;
3420                         }
3421                     }
3422                 }
3423             }
3424         }
3425         if (consensus_qual_ptr != NULL && contig->consensus_seq[i] != '*') {
3426             consensus_qual_ptr++;
3427         }
3428     }
3429     consensus_buf[i] = 0;
3430 
3431     free (contig->consensus_seq);
3432     contig->consensus_seq = consensus_buf;
3433     if (contig->qual_scores != NULL) {
3434         free (contig->qual_scores);
3435     }
3436     contig->qual_scores = new_qual_scores;
3437     contig->num_qual_scores = num_qual_scores;
3438 
3439     return num_change;
3440 }
3441 
3442 
RecalculateConsensusSequences(TACEFilePtr ace_file,char only_ns)3443 extern void RecalculateConsensusSequences (TACEFilePtr ace_file, char only_ns)
3444 {
3445     int i;
3446 
3447     if (ace_file == NULL) {
3448         return;
3449     }
3450 
3451     for (i = 0; i < ace_file->num_contigs; i++) {
3452         ReplaceConsensusSequenceFromTraces(ace_file->contigs[i], only_ns);
3453     }
3454 
3455 }
3456 
3457 
WriteContigQualScores(TContigPtr contig,FILE * out)3458 extern void WriteContigQualScores (TContigPtr contig, FILE *out)
3459 {
3460     int i = 0, j;
3461 
3462     if (contig == NULL || contig->qual_scores == NULL || contig->num_qual_scores < 1 || out == NULL) {
3463         return;
3464     }
3465     fprintf (out, ">%s\n", contig->consensus_id);
3466 
3467     while (i < contig->num_qual_scores) {
3468         for (j = 0; j < 60 && i < contig->num_qual_scores; j++, i++) {
3469             fprintf (out, "%d ", contig->qual_scores[i]);
3470         }
3471         fprintf (out, "\n");
3472     }
3473     fprintf (out, "\n");
3474 }
3475 
3476 
3477 extern char
ProcessLargeACEFileForContigFastaAndQualScores(FReadLineFunction readfunc,void * userdata,char make_qual_scores,char * has_errors,ProcessContigFunc process_func,void * process_data)3478 ProcessLargeACEFileForContigFastaAndQualScores
3479 (FReadLineFunction    readfunc,
3480  void *               userdata,
3481  char                 make_qual_scores,
3482  char *               has_errors,
3483  ProcessContigFunc    process_func,
3484  void *               process_data)
3485 {
3486     char *              linestring;
3487     char *              cp;
3488     int                 contig_num = 0, read_num = 0;
3489     int                 num_reads_expected = 0;
3490     int                 num_contigs = 0;
3491     TContigPtr          contig = NULL;
3492     char                rval = 1;
3493 
3494     if (readfunc == NULL || process_func == NULL) {
3495         return 0;
3496     }
3497 
3498     linestring = readfunc (userdata);
3499 
3500     while (linestring != NULL  &&  linestring [0] != EOF) {
3501         if (linestring [0] == 'A' && linestring [1] == 'S' && isspace (linestring [2])) {
3502             if (num_reads_expected > 0) {
3503                 PrintACEFormatErrorXML ("Two file header lines!", NULL, has_errors);
3504                 return 0;
3505             }
3506             /* first line in file, number of contigs */
3507             cp = linestring + 3;
3508             num_contigs = atoi (cp);
3509             while (isdigit (*cp)) {
3510                 cp++;
3511             }
3512             num_reads_expected = atoi (cp);
3513             linestring = readfunc (userdata);
3514         } else if (linestring [0] == 'C' && linestring [1] == 'O' && isspace (linestring [2])) {
3515             if (contig_num >= num_contigs) {
3516                 PrintACEFormatErrorXML ("Too many contigs!", NULL, has_errors);
3517                 return 0;
3518             }
3519 
3520             contig = s_ReadContig (&linestring, readfunc, userdata, make_qual_scores, has_errors);
3521             if (contig == NULL) {
3522                 PrintACEFormatErrorXMLStart (NULL, has_errors);
3523                 printf ("Unable to read contig (%d)", contig_num);
3524                 PrintACEFormatErrorXMLEnd ();
3525                 return 0;
3526             }
3527             read_num += contig->num_reads;
3528             process_func (contig, process_data);
3529             ContigFree (contig);
3530             contig = NULL;
3531             contig_num++;
3532         } else if (s_UnexpectedLineBetweenContigs (linestring)) {
3533             PrintACEFormatErrorXMLStart (NULL, has_errors);
3534             printf ("Unexpected line after contig %d", read_num);
3535             PrintACEFormatErrorXMLEnd ();
3536             return 0;
3537         } else {
3538             linestring = readfunc (userdata);
3539         }
3540     }
3541     if (contig_num < num_contigs) {
3542         PrintACEFormatErrorXML ("Not enough contigs!", NULL, has_errors);
3543         rval = 0;
3544     } else if (read_num < num_reads_expected) {
3545         PrintACEFormatErrorXML ("Not enough reads!", NULL, has_errors);
3546         rval = 0;
3547     }
3548 
3549     return rval;
3550 }
3551