1 //------------------------------------------------------------------------------
2 // Programmer: Adam M Phillippy, The Institute for Genomic Research
3 // File: show-tiling.cc
4 // Date: 09 / 13 / 2002
5 //
6 // Usage: show-tiling [options] <deltafile>
7 // Try 'show-tiling -h' for more information
8 //
9 // Description: For use in conjunction with the MUMmer package.
10 // "show-tiling" predicts and displays a query to reference tiling
11 // path. Note that this program is most effective if the query
12 // sequences are small fragments of a large refence.
13 //
14 //------------------------------------------------------------------------------
15
16 #include <mummer/delta.hh>
17 #include <mummer/tigrinc.hh>
18 #include <mummer/redirect_to_pager.hpp>
19 #include <vector>
20 #include <algorithm>
21 using namespace std;
22
23 //------------------------------------------------------------- Constants ----//
24 const int CHARS_PER_LINE = 60;
25
26 const long int DEFAULT_MIN_CONTIG_LENGTH = 1;
27
28 const long int DEFAULT_NUCMER_MAX_GAP_SIZE = 1000;
29 const long int DEFAULT_PROMER_MAX_GAP_SIZE = -1;
30
31 const float DEFAULT_NUCMER_MIN_COVERAGE = 95.0;
32 const float DEFAULT_PROMER_MIN_COVERAGE = 50.0;
33
34 const float DEFAULT_NUCMER_MIN_COVERAGE_DIFF = 10.0;
35 const float DEFAULT_PROMER_MIN_COVERAGE_DIFF = 30.0;
36
37 const float DEFAULT_NUCMER_MIN_PIDY = 90.0;
38 const float DEFAULT_PROMER_MIN_PIDY = 55.0;
39
40 const char FORWARD_CHAR = '+';
41 const char REVERSE_CHAR = '-';
42
43 const float REPEAT_PIDY_DIFF = 0.25;
44
45 const int UNUSABLE_TILE_LEVEL = -2;
46 const int IGNORE_TILE_LEVEL = -1;
47 const int UNUSED_TILE_LEVEL = 0;
48 const int USED_TILE_LEVEL = 1;
49
50 char NULL_STRING[1] = "";
51
52
53
54 //------------------------------------------------------ Type Definitions ----//
55 struct AlignStats
56 //-- Alignment statistics data structure
57 {
58 float Idy; // percent identity [0.0, 100.0]
59 float Sim; // percent similarity [0.0, 100.0]
60 float Stp; // precent stop codon [0.0, 100.0]
61
62 char DirQ; // contig orientation (relative to ref)
63
64 long int sQ, eQ, sR, eR; // start and end in Query and Reference
65 // relative to the directional strand
66
67 char * IdR; // FASTA Id of the mapping reference
68 long int SeqLenR; // length of the reference
69
70 bool isTiled; // is the alignment be tiled?
71 };
72
73
74
75 struct QueryContig
76 //-- Query sequence tiling contig data structure
77 {
78 char * IdQ; // FASTA Id of the query
79 long int SeqLenQ; // length of the query
80 char * SeqQ; // query sequence
81
82 vector<AlignStats> Aligns; // alignments for this contig
83
84 int TileLevel; // describes tiling status of query contig
85
86 //-- Things to be filled in once the contig is tiled
87 char DirQ; // orientation of the contig
88 char * IdR; // Id of the mapping reference
89 long int SeqLenR; // sequence length of the reference
90 long int StartR, EndR; // contig -> reference mapping coords
91
92 long int LoTrim, HiTrim; // lo and hi trim lengths
93
94 vector<QueryContig>::iterator linksTo; // who does this contig link to?
95 bool isLinkHead; // is the head of the linked list?
96 };
97
98
99
100 struct IdR_StartR_Sort
101 //-- For sorting QueryContigs by IdR, then StartR coordinates
102 {
operator ( )IdR_StartR_Sort103 bool operator( ) (const QueryContig & pA, const QueryContig & pB)
104 {
105 int cmp = strcmp (pA.IdR, pB.IdR);
106
107 //-- sort IdR
108 if ( cmp < 0 )
109 return true;
110 else if ( cmp > 0 )
111 return false;
112
113 //-- sort StartR
114 else if ( pA.StartR < pB.StartR )
115 return true;
116 else if ( pA.StartR > pB.StartR )
117 return false;
118
119 //-- sort SeqLenQ
120 else if ( pA.SeqLenQ > pB.SeqLenQ )
121 return true;
122 else
123 return false;
124 }
125 };
126
127
128
129 struct IdR_StartRTrimmed_Sort
130 //-- For sorting contig output by trim offset
131 {
operator ( )IdR_StartRTrimmed_Sort132 bool operator( ) (const QueryContig & pA, const QueryContig & pB)
133 {
134 int cmp = strcmp (pA.IdR, pB.IdR);
135
136 //-- sort IdR
137 if ( cmp < 0 )
138 return true;
139 else if ( cmp > 0 )
140 return false;
141
142 //-- sort StartR
143 else if ( pA.StartR + pA.LoTrim < pB.StartR + pA.LoTrim )
144 return true;
145 else
146 return false;
147 }
148 };
149
150
151
152 struct IdQ_Sort
153 //-- For sorting QueryContigs by IdQ
154 {
operator ( )IdQ_Sort155 bool operator( ) (const QueryContig & pA, const QueryContig & pB)
156 {
157 //-- sort IdQ
158 if ( strcmp (pA.IdQ, pB.IdQ) < 0 )
159 return true;
160 else
161 return false;
162 }
163 };
164
165
166
167 struct IdR_sQ_Sort
168 //-- For sorting AlignStats by IdR, then sQ
169 {
operator ( )IdR_sQ_Sort170 bool operator( ) (const AlignStats & pA, const AlignStats & pB)
171 {
172 int cmp = strcmp (pA.IdR, pB.IdR);
173
174 //-- sort IdR
175 if ( cmp < 0 )
176 return true;
177 else if ( cmp > 0 )
178 return false;
179
180 //-- sort sQ
181 else if ( pA.sQ < pB.sQ )
182 return true;
183 else
184 return false;
185 }
186 };
187
188
189
190
191 //------------------------------------------------------ Global Variables ----//
192 bool isOutputContigs = false; // set by -t option
193 bool isOutputPseudoMolecule = false; // set by -p option
194 bool isOutputUnusable = false; // set by -u option
195 bool isPrintAlignments = false; // set by -a option
196 bool isPrintXML = false; // set by -x option
197 bool isCircularReference = false; // set by -c option
198 bool isRandomRepeats = false; // set by -R option
199
200 long int MIN_CONTIG_LENGTH = DEFAULT_MIN_CONTIG_LENGTH;
201
202 long int MAX_GAP_SIZE = 0;
203 float MIN_COVERAGE = 0;
204 float MIN_COVERAGE_DIFF = 0;
205 float MIN_PIDY = 0;
206
207 bool isdef_MAX_GAP_SIZE = false;
208 bool isdef_MIN_COVERAGE = false;
209 bool isdef_MIN_COVERAGE_DIFF = false;
210 bool isdef_MIN_PIDY = false;
211
212 int DATA_TYPE = NUCMER_DATA; // set by .delta header
213
214 char InputFileName [MAX_LINE];
215 char RefFileName [MAX_LINE], QryFileName [MAX_LINE];
216
217
218
219 //------------------------------------------------- Function Declarations ----//
220 float getAlignmentQryCoverage
221 (vector<AlignStats>::iterator Ap, long int SeqLenQ);
222
223 float getSubsetQryCoverage
224 (vector<AlignStats>::iterator begin,
225 vector<AlignStats>::iterator end, long int SeqLenQ);
226
227 float getSubsetQrySyntenyCoverage
228 (vector<AlignStats>::iterator begin,
229 vector<AlignStats>::iterator end, long int SeqLenQ);
230
231 float getSubsetIdentity
232 (vector<AlignStats>::iterator begin,
233 vector<AlignStats>::iterator end);
234
235 void linkContigs
236 (vector<QueryContig> & Contigs);
237
238 long int longestConsistentSubset
239 (vector<AlignStats>::iterator begin,
240 vector<AlignStats>::iterator end);
241
242 void outputContigs
243 (vector<QueryContig> Contigs, FILE * Output);
244
245 void outputPseudoMolecule
246 (vector<QueryContig> & Contigs, FILE * QryFile, FILE * Output);
247
248 void outputUnusable
249 (vector<QueryContig> Contigs, FILE * Output);
250
251 void parseDelta
252 (vector<QueryContig> & Contigs);
253
254 void placeContig
255 (vector<QueryContig>::iterator Cp);
256
257 void printAlignment
258 (vector<QueryContig>::iterator Cp, vector<AlignStats>::iterator Ap,
259 FILE * Output);
260
261 void printTilingAlignments
262 (vector<QueryContig> & Contigs);
263
264 void printTilingPath
265 (vector<QueryContig> & Contigs);
266
267 void printTilingXML
268 (vector<QueryContig> & Contigs, char * QryFileName,
269 int argc, char ** argv);
270
271 void printHelp
272 (const char * s);
273
274 void printUsage
275 (const char * s);
276
277 inline long int revC
278 (long int Coord, long int Len);
279
280 void tileContigs
281 (vector<QueryContig> & Contigs);
282
283
284
285
286 //-------------------------------------------------- Function Definitions ----//
main(int argc,char ** argv)287 int main
288 (int argc, char ** argv)
289 {
290 char ContigsFileName [MAX_LINE];
291 char PseudoMoleculeFileName [MAX_LINE];
292 char UnusableFileName [MAX_LINE];
293
294 FILE * ContigsFile = NULL;
295 FILE * PseudoMoleculeFile = NULL;
296 FILE * UnusableFile = NULL;
297 FILE * QryFile = NULL;
298
299 vector<QueryContig> Contigs;
300
301 //-- Parse the command line arguments
302 {
303 int ch, errflg = 0;
304 optarg = NULL;
305
306 while ( !errflg && ((ch = getopt
307 (argc, argv, "achg:i:l:p:Rt:u:v:V:x")) != EOF) )
308 switch (ch)
309 {
310 case 'a' :
311 isPrintAlignments = true;
312 break;
313
314 case 'c' :
315 isCircularReference = true;
316 break;
317
318 case 'h' :
319 printHelp (argv[0]);
320 exit (EXIT_SUCCESS);
321 break;
322
323 case 'g' :
324 MAX_GAP_SIZE = atoi (optarg);
325 isdef_MAX_GAP_SIZE = true;
326 break;
327
328 case 'i' :
329 MIN_PIDY = atof (optarg);
330 isdef_MIN_PIDY = true;
331 break;
332
333 case 'l' :
334 MIN_CONTIG_LENGTH = atoi (optarg);
335 break;
336
337 case 'p' :
338 strcpy ( PseudoMoleculeFileName, optarg );
339 isOutputPseudoMolecule = true;
340 break;
341
342 case 'R' :
343 isRandomRepeats = true;
344 break;
345
346 case 't' :
347 strcpy ( ContigsFileName, optarg );
348 isOutputContigs = true;
349 break;
350
351 case 'u' :
352 strcpy ( UnusableFileName, optarg );
353 isOutputUnusable = true;
354 break;
355
356 case 'v' :
357 MIN_COVERAGE = atof (optarg);
358 isdef_MIN_COVERAGE = true;
359 break;
360
361 case 'V' :
362 MIN_COVERAGE_DIFF = atof (optarg);
363 isdef_MIN_COVERAGE_DIFF = true;
364 break;
365
366 case 'x' :
367 isPrintXML = true;
368 break;
369
370 default :
371 errflg ++;
372 }
373
374 if ( errflg > 0 || argc - optind != 1 )
375 {
376 printUsage (argv[0]);
377 exit (EXIT_FAILURE);
378 }
379
380 if ( MIN_PIDY < 0.0 || MIN_PIDY > 100.0 ||
381 MIN_COVERAGE < 0.0 || MIN_COVERAGE > 100.0 ||
382 MIN_COVERAGE_DIFF < 0.0 || MIN_COVERAGE_DIFF > 100.0 )
383 {
384 fprintf(stderr,
385 "\nERROR: Percentages must be within the range [0.0, 100.0]\n");
386 exit (EXIT_FAILURE);
387 }
388
389 if ( MIN_CONTIG_LENGTH < -1 || MAX_GAP_SIZE < -1 )
390 {
391 fprintf(stderr,
392 "\nERROR: Size values must be within the range [-1, oo]\n");
393 exit (EXIT_FAILURE);
394 }
395
396 if ( isRandomRepeats )
397 {
398 srand ( time(NULL) );
399 MIN_COVERAGE_DIFF = 0;
400 isdef_MIN_COVERAGE_DIFF = true;
401 }
402 }
403
404
405 //-- Parse the delta file
406 strcpy (InputFileName, argv[optind ++]);
407 parseDelta (Contigs);
408
409
410 //-- Try and open all the output files
411 if ( isOutputContigs )
412 ContigsFile = File_Open ( ContigsFileName, "w" );
413 if ( isOutputPseudoMolecule )
414 PseudoMoleculeFile = File_Open ( PseudoMoleculeFileName, "w" );
415 if ( isOutputPseudoMolecule )
416 QryFile = File_Open ( QryFileName, "r" );
417 if ( isOutputUnusable )
418 UnusableFile = File_Open ( UnusableFileName, "w" );
419
420
421 //-- Find each contig's mapping and sort by reference
422 tileContigs (Contigs);
423
424
425 if ( isOutputContigs )
426 {
427 outputContigs (Contigs, ContigsFile);
428 fclose (ContigsFile);
429 }
430
431
432 //-- Print the unusable contigs w/ alignments to the user specified file
433 if ( isOutputUnusable )
434 {
435 outputUnusable (Contigs, UnusableFile);
436 fclose (UnusableFile);
437 }
438
439
440 //-- Link the contigs to form the tiling path
441 linkContigs (Contigs);
442
443
444 //-- Print the pseudo molecule to the user specifed file
445 if ( isOutputPseudoMolecule )
446 {
447 outputPseudoMolecule (Contigs, QryFile, PseudoMoleculeFile);
448 fclose (QryFile);
449 fclose (PseudoMoleculeFile);
450 }
451
452
453 //-- Print the tiling path to stdout
454 stdio_launch_pager redirect_to_pager;
455 if ( isPrintAlignments )
456 printTilingAlignments (Contigs);
457 else if ( isPrintXML )
458 printTilingXML (Contigs, QryFileName, argc, argv);
459 else
460 printTilingPath (Contigs);
461
462
463 return EXIT_SUCCESS;
464 }
465
466
467
468
getAlignmentQryCoverage(vector<AlignStats>::iterator Ap,long int SeqLenQ)469 float getAlignmentQryCoverage
470 (vector<AlignStats>::iterator Ap, long int SeqLenQ)
471 {
472 //-- Return query coverage for a single alignment
473 return (float)(Ap->eQ - Ap->sQ + 1) / (float)SeqLenQ * 100.0;
474 }
475
476
477
478
getSubsetQryCoverage(vector<AlignStats>::iterator begin,vector<AlignStats>::iterator end,long int SeqLenQ)479 float getSubsetQryCoverage
480 (vector<AlignStats>::iterator begin,
481 vector<AlignStats>::iterator end, long int SeqLenQ)
482 {
483 vector<AlignStats>::iterator Ap;
484 vector<AlignStats>::iterator preAp = end;
485
486 long int cov = 0;
487 long int olap;
488
489 //-- Get non-redundant query coverage for the subset of alignments
490 for ( Ap = begin; Ap < end; Ap ++ )
491 {
492 if ( ! Ap->isTiled )
493 continue;
494
495 if ( preAp == end )
496 cov += Ap->eQ - Ap->sQ + 1;
497 else
498 {
499 //-- Alignments must be pre-sorted by sQ
500 assert ( Ap->sQ >= preAp->sQ );
501 olap = preAp->eQ - Ap->sQ + 1;
502 olap = olap > 0 ? olap : 0;
503 cov += Ap->eQ - Ap->sQ + 1 - olap;
504 }
505 preAp = Ap;
506 }
507
508 return (float)(cov) / (float)SeqLenQ * 100.0;
509 }
510
511
512
513
getSubsetQrySyntenyCoverage(vector<AlignStats>::iterator begin,vector<AlignStats>::iterator end,long int SeqLenQ)514 float getSubsetQrySyntenyCoverage
515 (vector<AlignStats>::iterator begin,
516 vector<AlignStats>::iterator end, long int SeqLenQ)
517 {
518 vector<AlignStats>::iterator Ap;
519 long int cov = 0;
520 long int low, high;
521
522 low = LONG_MAX;
523 high = -LONG_MAX;
524 for ( Ap = begin; Ap < end; Ap ++ )
525 if ( Ap->isTiled )
526 {
527 if ( Ap->sQ < low )
528 low = Ap->sQ;
529 if ( Ap->eQ > high )
530 high = Ap->eQ;
531 }
532 cov = high - low + 1;
533
534 return (float)(cov) / (float)SeqLenQ * 100.0;
535 }
536
537
538
539
getSubsetIdentity(vector<AlignStats>::iterator begin,vector<AlignStats>::iterator end)540 float getSubsetIdentity
541 (vector<AlignStats>::iterator begin,
542 vector<AlignStats>::iterator end)
543 {
544 vector<AlignStats>::iterator Ap;
545
546 long int len;
547 long int N = 0;
548 float tot = 0;
549
550 //-- Get weighted (by length) average identity for the subset of alignments
551 for ( Ap = begin; Ap < end; Ap ++ )
552 {
553 if ( ! Ap->isTiled )
554 continue;
555
556 len = Ap->eQ - Ap->sQ + 1;
557 N += len;
558 tot += Ap->Idy * len;
559 }
560
561 return tot / (float)N;
562 }
563
564
565
566
linkContigs(vector<QueryContig> & Contigs)567 void linkContigs
568 (vector<QueryContig> & Contigs)
569 {
570 vector<QueryContig>::iterator Cp;
571 vector<QueryContig>::iterator Cip;
572 vector<QueryContig>::iterator nxtCp;
573 vector<QueryContig>::iterator firstCp;
574 long int best_end;
575
576 //-- Ignore shadowed contigs and generate the tiling path that uses
577 // the minimum number of contigs while covering the maximum area
578 firstCp = Contigs.end( );
579 for ( Cp = Contigs.begin( ); Cp < Contigs.end( ); Cp ++ )
580 {
581 if ( Cp->TileLevel != USED_TILE_LEVEL )
582 continue;
583
584 if ( firstCp == Contigs.end( ) )
585 {
586 firstCp = Cp;
587 firstCp->isLinkHead = true;
588 }
589
590 //-- If several contigs overlap Cp, pick the best one...
591 // or if none overlap, pick the next used tile
592 nxtCp = Contigs.end( );
593 best_end = -(LONG_MAX);
594 for ( Cip = Cp + 1; Cip < Contigs.end( ); Cip ++ )
595 {
596 if ( Cip->TileLevel != USED_TILE_LEVEL )
597 continue;
598
599 if ( strcmp ( Cip->IdR, Cp->IdR ) != 0 )
600 break;
601
602 //-- If no overlap
603 if ( Cip->StartR > Cp->EndR )
604 {
605 if ( nxtCp == Contigs.end( ) )
606 nxtCp = Cip;
607 break;
608 }
609
610 //-- If overlap
611 Cip->TileLevel = IGNORE_TILE_LEVEL;
612 if ( Cip->EndR > Cp->EndR && Cip->EndR > best_end )
613 {
614 best_end = Cip->EndR;
615 nxtCp = Cip;
616 }
617 }
618
619
620 //-- Link Cp to nxtCp
621 if ( nxtCp != Contigs.end( ) )
622 {
623 //-- Link to the next valid tile
624 nxtCp->TileLevel = USED_TILE_LEVEL;
625 Cp->linksTo = nxtCp;
626 }
627 else
628 {
629 if ( isCircularReference )
630 {
631 if ( firstCp->StartR <= 0 )
632 {
633 nxtCp = Cp;
634 for (; Cp > firstCp; Cp -- )
635 if ( Cp->TileLevel == USED_TILE_LEVEL )
636 {
637 if ( firstCp->SeqLenR + firstCp->StartR <= Cp->EndR )
638 {
639 nxtCp = Cp;
640 Cp->TileLevel = IGNORE_TILE_LEVEL;
641 }
642 else
643 break;
644 }
645
646 if ( firstCp->SeqLenR + firstCp->StartR > nxtCp->StartR )
647 {
648 Cp = nxtCp;
649 Cp->TileLevel = USED_TILE_LEVEL;
650 }
651 }
652
653 //-- Circular link (note: this creates a circular linked list)
654 Cp->linksTo = firstCp;
655 }
656 else
657 Cp->linksTo = Contigs.end( );
658
659 firstCp = Contigs.end( );
660 }
661 }
662
663 return;
664 }
665
666
667
668
longestConsistentSubset(vector<AlignStats>::iterator begin,vector<AlignStats>::iterator end)669 long int longestConsistentSubset
670 (vector<AlignStats>::iterator begin, vector<AlignStats>::iterator end)
671 {
672 //-- For dynamic programming scoring and backtracking
673 struct node
674 {
675 vector<AlignStats>::iterator Ap;
676 long int Score;
677 long int From;
678 };
679
680 vector<AlignStats>::iterator Ap;
681 long int i, j, N, OlapR, OlapQ, Olap, Best;
682 node * A;
683
684 //-- Build the data array
685 N = 0;
686 A = (node *) Safe_malloc (sizeof(node) * (end - begin));
687 for ( Ap = begin; Ap < end; Ap ++ )
688 {
689 assert ( Ap->isTiled == false );
690 A[N].Ap = Ap;
691 A[N].Score = Ap->eQ - Ap->sQ + 1;
692 A[N].From = -1;
693 N ++;
694 }
695 assert ( N == (end - begin) );
696
697 //-- Isn't it dynamic?
698 for ( i = 0; i < N; i ++ )
699 for ( j = 0; j < i; j ++ )
700 {
701 assert ( strcmp (A[i].Ap->IdR, A[j].Ap->IdR) == 0 );
702 if ( A[i].Ap->DirQ != A[j].Ap->DirQ )
703 continue;
704
705 OlapR = A[j].Ap->eR - A[i].Ap->sR + 1;
706 if ( isCircularReference && OlapR > A[j].Ap->SeqLenR / 2 )
707 OlapR = (A[j].Ap->eR - A[j].Ap->SeqLenR) + (1 - A[i].Ap->sR);
708
709 if ( MAX_GAP_SIZE >= 0 && OlapR < -(MAX_GAP_SIZE) )
710 continue;
711 Olap = OlapR < 0 ? 0 : OlapR;
712 OlapQ = A[j].Ap->eQ - A[i].Ap->sQ + 1;
713 if ( MAX_GAP_SIZE >= 0 && OlapQ < -(MAX_GAP_SIZE) )
714 continue;
715 Olap = Olap > OlapQ ? Olap : OlapQ;
716
717 if ( A[j].Score + (A[i].Ap->eQ - A[i].Ap->sQ + 1) - Olap > A[i].Score )
718 {
719 A[i].From = j;
720 A[i].Score = A[j].Score + (A[i].Ap->eQ - A[i].Ap->sQ + 1) - Olap;
721 }
722 }
723
724 //-- Find the highest score and backtrack to extract the subset
725 Best = 0;
726 for ( i = 1; i < N; i ++ )
727 {
728 if ( A[i].Score > A[Best].Score )
729 Best = i;
730 else if ( A[i].Score == A[Best].Score &&
731 A[i].Ap->Idy > A[Best].Ap->Idy )
732 Best = i;
733 }
734
735 if ( isRandomRepeats )
736 {
737 vector<long int> ties;
738 for ( i = 0; i < N; i ++ )
739 if ( A[i].Score == A[Best].Score &&
740 A[i].Ap->Idy >= A[Best].Ap->Idy - REPEAT_PIDY_DIFF )
741 ties . push_back (i);
742
743 if ( ties.size( ) > 1 )
744 Best = ties
745 [(long int)((double)ties.size( ) * rand( ) / (RAND_MAX + 1.0))];
746 }
747
748 for ( i = Best; i >= 0; i = A[i].From )
749 A[i].Ap->isTiled = true;
750
751 Best = A[Best].Score;
752
753 free ( A );
754
755 return Best;
756 }
757
758
759
760
outputContigs(vector<QueryContig> Contigs,FILE * Output)761 void outputContigs
762 (vector<QueryContig> Contigs, FILE * Output)
763 {
764 vector<QueryContig>::iterator Cp;
765 vector<QueryContig>::iterator endCp;
766
767 //-- Sort by reference trim offset
768 sort ( Contigs.begin( ), Contigs.end( ), IdR_StartRTrimmed_Sort( ) );
769
770 long int seqs;
771
772 Cp = Contigs.begin( );
773 while ( Cp < Contigs.end( ) )
774 {
775 if ( Cp->TileLevel != USED_TILE_LEVEL )
776 {
777 Cp ++;
778 continue;
779 }
780
781 seqs = 0;
782 for ( endCp = Cp; endCp < Contigs.end( ); endCp ++ )
783 {
784 if ( endCp->TileLevel != USED_TILE_LEVEL )
785 continue;
786
787 if ( strcmp (Cp->IdR, endCp->IdR) != 0 )
788 break;
789 seqs ++;
790 }
791
792 fprintf(Output,
793 "##%s %ld %ld bases, 00000000 checksum.\n",
794 Cp->IdR, seqs, Cp->SeqLenR);
795
796 for ( ; Cp < endCp; Cp ++ )
797 {
798 if ( Cp->TileLevel != USED_TILE_LEVEL )
799 continue;
800
801 if ( Cp->DirQ == FORWARD_CHAR )
802 fprintf(Output,
803 "#%s(%ld) [%s] %ld bases, 00000000 checksum. {%ld %ld} <%ld %ld>\n",
804 Cp->IdQ, Cp->StartR + Cp->LoTrim - 1, "", Cp->SeqLenQ,
805 Cp->LoTrim + 1, Cp->SeqLenQ - Cp->HiTrim,
806 Cp->StartR + Cp->LoTrim, Cp->EndR - Cp->HiTrim);
807 else
808 fprintf(Output,
809 "#%s(%ld) [%s] %ld bases, 00000000 checksum. {%ld %ld} <%ld %ld>\n",
810 Cp->IdQ, Cp->StartR + Cp->LoTrim - 1, "RC", Cp->SeqLenQ,
811 Cp->SeqLenQ - Cp->LoTrim, Cp->HiTrim + 1,
812 Cp->StartR + Cp->LoTrim, Cp->EndR - Cp->HiTrim);
813 }
814 }
815
816 return;
817 }
818
819
820
821
outputPseudoMolecule(vector<QueryContig> & Contigs,FILE * QryFile,FILE * Output)822 void outputPseudoMolecule
823 (vector<QueryContig> & Contigs, FILE * QryFile, FILE * Output)
824 {
825 vector<QueryContig>::iterator Cp;
826 vector<QueryContig>::iterator beginCp;
827
828 vector<AlignStats>::iterator Apb;
829 vector<AlignStats>::reverse_iterator Ape;
830
831 long int ct = 0;
832 long int start, end, gap, i, endR, startR;
833
834 char * A;
835 long int InitSize = INIT_SIZE;
836 char Line [MAX_LINE];
837
838
839 //-- Read in the needed query contig sequences
840 A = (char *) Safe_malloc ( sizeof(char) * InitSize );
841 while ( Read_String (QryFile, A, InitSize, Line, false) )
842 {
843 for ( Cp = Contigs.begin( ); Cp < Contigs.end( ); Cp ++ )
844 if ( Cp->TileLevel == USED_TILE_LEVEL )
845 if ( strcmp ( Line, Cp->IdQ ) == 0 )
846 break;
847
848 if ( Cp < Contigs.end( ) )
849 {
850 assert ( (long int)strlen(A+1) == Cp->SeqLenQ );
851 Cp->SeqQ = (char *) Safe_malloc
852 ( sizeof(char) * (Cp->SeqLenQ + 2) );
853 Cp->SeqQ[0] = '\0';
854 strcpy ( Cp->SeqQ + 1, A + 1 );
855 if ( Cp->DirQ == REVERSE_CHAR )
856 Reverse_Complement (Cp->SeqQ, 1, Cp->SeqLenQ);
857 }
858 }
859 free ( A );
860
861 //-- For all contigs, create pseudo
862 for ( beginCp = Contigs.begin( ); beginCp < Contigs.end( ); beginCp ++ )
863 {
864 if ( ! beginCp->isLinkHead )
865 continue;
866
867 //-- New Reference sequence
868 start = 1;
869 Cp = beginCp;
870 fprintf (Output, ">pseudo_used_%s\n", Cp->IdR);
871
872 //-- For all contigs mapping to this Reference
873 while ( Cp != Contigs.end( ) )
874 {
875 gap = 0;
876 end = Cp->SeqLenQ;
877 if ( Cp->linksTo != Contigs.end( ) )
878 {
879 if ( ! Cp->linksTo->isLinkHead )
880 {
881 //-- Internal sequential link, gap N's needed if necessary
882 startR = Cp->linksTo->StartR;
883 endR = Cp->EndR;
884
885 gap = startR - endR - 1;
886 if ( gap < 0 )
887 {
888 for ( Apb = Cp->linksTo->Aligns.begin( );
889 Apb < Cp->linksTo->Aligns.end( ) && !Apb->isTiled;
890 Apb ++ )
891 {}
892 assert ( Apb < Cp->linksTo->Aligns.end( ) );
893
894 for ( Ape = Cp->Aligns.rbegin( );
895 Ape < Cp->Aligns.rend( ) && !Ape->isTiled;
896 Ape ++ )
897 {}
898 assert ( Ape < Cp->Aligns.rend( ) );
899
900 //-- Use the contig with the least 'junk' sequence
901 if ( Cp->SeqLenQ - Ape->eQ > Apb->sQ - 1 ||
902 ( Cp->SeqLenQ - Ape->eQ == Apb->sQ - 1 &&
903 getSubsetIdentity (Cp->Aligns.begin( ),
904 Cp->Aligns.end( )) <
905 getSubsetIdentity (Cp->linksTo->Aligns.begin( ),
906 Cp->linksTo->Aligns.end( )) ) )
907 end += gap;
908 }
909 }
910 }
911
912 if ( Cp->SeqQ == NULL )
913 {
914 fprintf (stderr,
915 "\nERROR: Sequence \"%s\" was not found in the query file.\n"
916 " Please check the validity of the query file listed\n"
917 " at the top of the .delta input file and rerun.\n",
918 Cp->IdQ);
919 exit (EXIT_FAILURE);
920 }
921
922 //-- Print the sequence
923 for ( i = start; i <= end; i ++ )
924 {
925 fputc (toupper(Cp->SeqQ[i]), Output);
926 if ( ++ ct == CHARS_PER_LINE )
927 {
928 ct = 0;
929 fputc ('\n', Output);
930 }
931 }
932 free (Cp->SeqQ);
933
934 //-- Print the gap
935 for ( i = 1; i <= gap; i ++ )
936 {
937 fputc ('N', Output);
938 if ( ++ ct == CHARS_PER_LINE )
939 {
940 ct = 0;
941 fputc ('\n', Output);
942 }
943 }
944
945 //-- Adjust the next start if this contig was used for overlap
946 if ( gap < 0 && end == Cp->SeqLenQ )
947 start = 1 + -(gap);
948
949 //-- Walk the pointer down the list of contigs
950 if ( Cp->linksTo == Contigs.end( ) || Cp->linksTo->isLinkHead )
951 Cp = Contigs.end( );
952 else
953 Cp = Cp->linksTo;
954 }
955
956 if ( ct != 0 )
957 fputc ('\n', Output);
958 ct = 0;
959 }
960
961 return;
962 }
963
964
965
966
outputUnusable(vector<QueryContig> Contigs,FILE * Output)967 void outputUnusable
968 (vector<QueryContig> Contigs, FILE * Output)
969 {
970 vector<QueryContig>::iterator Cp;
971 vector<AlignStats>::iterator Ap;
972
973 sort ( Contigs.begin( ), Contigs.end( ), IdQ_Sort( ) );
974
975 for ( Cp = Contigs.begin( ); Cp < Contigs.end( ); Cp ++ )
976 {
977 if ( Cp->TileLevel != UNUSABLE_TILE_LEVEL &&
978 Cp->TileLevel != UNUSED_TILE_LEVEL )
979 continue;
980
981 for ( Ap = Cp->Aligns.begin( ); Ap < Cp->Aligns.end( ); Ap ++ )
982 printAlignment (Cp, Ap, Output);
983 }
984
985 return;
986 }
987
988
989
990
parseDelta(vector<QueryContig> & Contigs)991 void parseDelta
992 (vector<QueryContig> & Contigs)
993
994 // parse the delta file
995
996 {
997 vector<QueryContig>::iterator Cp;
998
999 char * CurrIdQ; // the current contig Id
1000 long int temp;
1001
1002 QueryContig aContig; // single query contig
1003 AlignStats aStats; // single alignment region
1004
1005 DeltaReader_t dr;
1006 dr.open (InputFileName);
1007 DATA_TYPE = dr.getDataType( ) == NUCMER_STRING ?
1008 NUCMER_DATA : PROMER_DATA;
1009 strcpy (RefFileName, dr.getReferencePath( ).c_str( ));
1010 strcpy (QryFileName, dr.getQueryPath( ).c_str( ));
1011
1012
1013 Contigs.clear( );
1014 CurrIdQ = NULL_STRING;
1015 aContig.SeqQ = NULL;
1016 aContig.DirQ = '*';
1017 aContig.IdR = NULL_STRING;
1018 aContig.SeqLenR = 0;
1019 aContig.Aligns.clear( );
1020 aContig.StartR = aContig.EndR = 0;
1021 aContig.LoTrim = aContig.HiTrim = 0;
1022 aContig.isLinkHead = false;
1023
1024
1025 if ( DATA_TYPE == NUCMER_DATA )
1026 {
1027 if ( !isdef_MAX_GAP_SIZE )
1028 MAX_GAP_SIZE = DEFAULT_NUCMER_MAX_GAP_SIZE;
1029 if ( !isdef_MIN_COVERAGE )
1030 MIN_COVERAGE = DEFAULT_NUCMER_MIN_COVERAGE;
1031 if ( !isdef_MIN_COVERAGE_DIFF )
1032 MIN_COVERAGE_DIFF = DEFAULT_NUCMER_MIN_COVERAGE_DIFF;
1033 if ( !isdef_MIN_PIDY )
1034 MIN_PIDY = DEFAULT_NUCMER_MIN_PIDY;
1035 }
1036 else
1037 {
1038 if ( !isdef_MAX_GAP_SIZE )
1039 MAX_GAP_SIZE = DEFAULT_PROMER_MAX_GAP_SIZE;
1040 if ( !isdef_MIN_COVERAGE )
1041 MIN_COVERAGE = DEFAULT_PROMER_MIN_COVERAGE;
1042 if ( !isdef_MIN_COVERAGE_DIFF )
1043 MIN_COVERAGE_DIFF = DEFAULT_PROMER_MIN_COVERAGE_DIFF;
1044 if ( !isdef_MIN_PIDY )
1045 MIN_PIDY = DEFAULT_PROMER_MIN_PIDY;
1046 }
1047
1048
1049 //-- Process the delta input file
1050 while ( dr.readNext( ) )
1051 {
1052 aStats.SeqLenR = dr.getRecord( ).lenR;
1053 aContig.SeqLenQ = dr.getRecord( ).lenQ;
1054
1055 aStats.IdR = (char *) Safe_malloc (dr.getRecord( ).idR.length( ) + 1);
1056 aContig.IdQ = (char *) Safe_malloc (dr.getRecord( ).idQ.length( ) + 1);
1057 strcpy (aStats.IdR, dr.getRecord( ).idR.c_str( ));
1058 strcpy (aContig.IdQ, dr.getRecord( ).idQ.c_str( ));
1059
1060 if ( strcmp (CurrIdQ, aContig.IdQ) )
1061 {
1062 CurrIdQ = aContig.IdQ;
1063 aContig.TileLevel = aContig.SeqLenQ < MIN_CONTIG_LENGTH ?
1064 IGNORE_TILE_LEVEL : UNUSED_TILE_LEVEL;
1065 Contigs.push_back (aContig);
1066 }
1067
1068 for ( unsigned int i = 0; i < dr.getRecord( ).aligns.size( ); i ++ )
1069 {
1070 aStats.sR = dr.getRecord( ).aligns[i].sR;
1071 aStats.eR = dr.getRecord( ).aligns[i].eR;
1072 aStats.sQ = dr.getRecord( ).aligns[i].sQ;
1073 aStats.eQ = dr.getRecord( ).aligns[i].eQ;
1074
1075 //-- Check match orientation
1076 if ( (aStats.sR <= aStats.eR && aStats.sQ <= aStats.eQ) ||
1077 (aStats.sR > aStats.eR && aStats.sQ > aStats.eQ) )
1078 aStats.DirQ = FORWARD_CHAR;
1079 else
1080 aStats.DirQ = REVERSE_CHAR;
1081
1082 //-- Force ascending coordinates
1083 if ( aStats.sR > aStats.eR )
1084 {
1085 temp = aStats.sR;
1086 aStats.sR = aStats.eR;
1087 aStats.eR = temp;
1088 }
1089 if ( aStats.sQ > aStats.eQ )
1090 {
1091 temp = aStats.sQ;
1092 aStats.sQ = aStats.eQ;
1093 aStats.eQ = temp;
1094 }
1095
1096 //-- If flipped orientation, reverse query coordinates
1097 if ( aStats.DirQ == REVERSE_CHAR )
1098 {
1099 temp = aStats.sQ;
1100 aStats.sQ = revC (aStats.eQ, aContig.SeqLenQ);
1101 aStats.eQ = revC (temp, aContig.SeqLenQ);
1102 }
1103
1104 //-- Set the statistics for this alignment region
1105 aStats.Idy = dr.getRecord( ).aligns[i].idy;
1106 aStats.Sim = dr.getRecord( ).aligns[i].sim;
1107 aStats.Stp = dr.getRecord( ).aligns[i].stp;
1108 aStats.isTiled = false;
1109
1110 //-- Add the alignment region
1111 Contigs.rbegin( )->Aligns.push_back (aStats);
1112 }
1113 }
1114 dr.close( );
1115
1116 for ( Cp = Contigs.begin( ); Cp < Contigs.end( ); Cp ++ )
1117 sort ( Cp->Aligns.begin( ), Cp->Aligns.end( ), IdR_sQ_Sort( ) );
1118
1119 return;
1120 }
1121
1122
1123
1124
placeContig(vector<QueryContig>::iterator Cp)1125 void placeContig
1126 (vector<QueryContig>::iterator Cp)
1127 {
1128 vector<AlignStats>::iterator Aip, Ap;
1129 float max_cov, cov;
1130 long int start;
1131
1132 assert ( Cp->TileLevel == USED_TILE_LEVEL );
1133
1134 //-- Find the 'representative' alignment
1135 Ap = Cp->Aligns.end( );
1136 max_cov = -(FLT_MAX);
1137 for ( Aip = Cp->Aligns.begin( ); Aip < Cp->Aligns.end( ); Aip ++ )
1138 if ( Aip->isTiled )
1139 {
1140 //-- Set trim values
1141 if ( Ap == Cp->Aligns.end( ) )
1142 Cp->LoTrim = Aip->sQ - 1;
1143 Cp->HiTrim = Cp->SeqLenQ - Aip->eQ;
1144
1145 cov = getAlignmentQryCoverage (Aip, Cp->SeqLenQ);
1146 if ( cov > max_cov )
1147 {
1148 max_cov = cov;
1149 Ap = Aip;
1150 }
1151 }
1152
1153 //-- Set mapping reference data
1154 assert ( Ap != Cp->Aligns.end( ) );
1155 Cp->DirQ = Ap->DirQ;
1156 Cp->IdR = Ap->IdR;
1157 Cp->SeqLenR = Ap->SeqLenR;
1158
1159 //-- Position the contig
1160 start = Ap->sQ;
1161 Cp->StartR = Ap->sR - (start - 1);
1162 Cp->EndR = Cp->StartR + Cp->SeqLenQ - 1;
1163
1164 //-- Force negative offset if circular, else just let it overhang
1165 if ( isCircularReference && Cp->EndR > Cp->SeqLenR )
1166 {
1167 Cp->StartR = Cp->StartR - Cp->SeqLenR;
1168 Cp->EndR = Cp->EndR - Cp->SeqLenR;
1169 }
1170 assert ( Cp->StartR <= Cp->EndR );
1171 assert ( Cp->LoTrim >= 0 && Cp->HiTrim >= 0 );
1172
1173 return;
1174 }
1175
1176
1177
1178
printAlignment(vector<QueryContig>::iterator Cp,vector<AlignStats>::iterator Ap,FILE * Output)1179 void printAlignment
1180 (vector<QueryContig>::iterator Cp, vector<AlignStats>::iterator Ap,
1181 FILE * Output)
1182 {
1183 long int len1, len2;
1184 float covA, covB;
1185
1186 len1 = Ap->eR - Ap->sR + 1;
1187 len2 = Ap->eQ - Ap->sQ + 1;
1188 covA = (float)len1 / (float)Ap->SeqLenR * 100.0;
1189 covB = getAlignmentQryCoverage ( Ap, Cp->SeqLenQ );
1190
1191 //-- Output the statistics for this alignment region
1192 fprintf(Output,"%ld\t%ld\t", Ap->sR, Ap->eR);
1193 if ( Ap->DirQ == FORWARD_CHAR )
1194 fprintf(Output,"%ld\t%ld\t", Ap->sQ, Ap->eQ);
1195 else
1196 fprintf(Output,"%ld\t%ld\t", revC(Ap->sQ, Cp->SeqLenQ),
1197 revC(Ap->eQ, Cp->SeqLenQ));
1198 fprintf(Output,"%ld\t%ld\t", len1, len2);
1199 fprintf(Output,"%.2f\t", Ap->Idy);
1200 fprintf(Output,"%ld\t%ld\t", Ap->SeqLenR, Cp->SeqLenQ);
1201 fprintf(Output,"%.2f\t%.2f\t", covA, covB);
1202 fprintf(Output,"%s\t%s", Ap->IdR, Cp->IdQ);
1203 fprintf(Output,"\n");
1204
1205 return;
1206 }
1207
1208
1209
1210
printTilingAlignments(vector<QueryContig> & Contigs)1211 void printTilingAlignments
1212 (vector<QueryContig> & Contigs)
1213 {
1214 vector<QueryContig>::iterator beginCp;
1215 vector<QueryContig>::iterator Cp;
1216 vector<AlignStats>::iterator Ap;
1217
1218 for ( beginCp = Contigs.begin( ); beginCp < Contigs.end( ); beginCp ++ )
1219 {
1220 if ( ! beginCp->isLinkHead )
1221 continue;
1222
1223 //-- A new Reference sequence
1224 Cp = beginCp;
1225
1226 //-- For all contigs mapping to this reference
1227 while ( Cp != Contigs.end( ) )
1228 {
1229 for ( Ap = Cp->Aligns.begin( ); Ap < Cp->Aligns.end( ); Ap ++ )
1230 if ( ! Ap->isTiled )
1231 continue;
1232 else
1233 printAlignment(Cp,Ap,stdout);
1234
1235 //-- Walk the pointer down the list of contigs
1236 if ( Cp->linksTo == Contigs.end( ) || Cp->linksTo->isLinkHead )
1237 Cp = Contigs.end( );
1238 else
1239 Cp = Cp->linksTo;
1240 }
1241 }
1242
1243 return;
1244 }
1245
1246
1247
1248
printTilingPath(vector<QueryContig> & Contigs)1249 void printTilingPath
1250 (vector<QueryContig> & Contigs)
1251 {
1252 vector<QueryContig>::iterator beginCp;
1253 vector<QueryContig>::iterator Cp;
1254
1255 long int len, gap;
1256 float pcov, pidy;
1257
1258
1259 for ( beginCp = Contigs.begin( ); beginCp < Contigs.end( ); beginCp ++ )
1260 {
1261 if ( ! beginCp->isLinkHead )
1262 continue;
1263
1264 //-- A new Reference sequence
1265 Cp = beginCp;
1266 printf (">%s %ld bases\n", Cp->IdR, Cp->SeqLenR);
1267
1268 //-- For all contigs mapping to this reference
1269 while ( Cp != Contigs.end( ) )
1270 {
1271 len = Cp->SeqLenQ;
1272 pcov = getSubsetQryCoverage
1273 ( Cp->Aligns.begin( ), Cp->Aligns.end( ), Cp->SeqLenQ );
1274 pidy = getSubsetIdentity
1275 ( Cp->Aligns.begin( ), Cp->Aligns.end( ) );
1276
1277 //-- Calculate the gap size between this and the next contig
1278 if ( Cp->linksTo == Contigs.end( ) )
1279 gap = Cp->SeqLenR - Cp->EndR;
1280 else
1281 {
1282 if ( Cp->linksTo->isLinkHead )
1283 gap = (Cp->SeqLenR + Cp->linksTo->StartR) - Cp->EndR - 1;
1284 else
1285 gap = Cp->linksTo->StartR - Cp->EndR - 1;
1286 }
1287
1288 //-- Print the data
1289 printf ("%ld\t%ld\t%ld\t%ld\t%.2f\t%.2f\t%c\t%s\n",
1290 Cp->StartR, Cp->EndR, gap, len, pcov,
1291 pidy, Cp->DirQ, Cp->IdQ);
1292
1293 //-- Walk the pointer down the list of contigs
1294 if ( Cp->linksTo == Contigs.end( ) || Cp->linksTo->isLinkHead )
1295 Cp = Contigs.end( );
1296 else
1297 Cp = Cp->linksTo;
1298 }
1299 }
1300
1301 return;
1302 }
1303
1304
1305
1306
printTilingXML(vector<QueryContig> & Contigs,char * QryFileName,int argc,char ** argv)1307 void printTilingXML
1308 (vector<QueryContig> & Contigs, char * QryFileName, int argc, char ** argv)
1309 {
1310 vector<AlignStats>::iterator Ap;
1311 vector<QueryContig>::iterator Cp;
1312 vector<QueryContig>::iterator beginCp;
1313
1314 int i;
1315 long int gap;
1316 long int ct = 0;
1317
1318 time_t tt = time(NULL);
1319 char * time_str;
1320 time_str = ctime(&tt);
1321 time_str[strlen(time_str)-1] = '\0';
1322
1323 printf ("<?xml version = \"1.0\" ?>\n");
1324
1325 printf ("<EVIDENCE ID = \"project_1\"\n");
1326 printf (" DATE = \"%s\"\n", time_str);
1327 printf (" PROJECT = \"%s\"\n", QryFileName);
1328 printf (" PARAMETERS = \"");
1329 for ( i = 0; i < argc; i ++ ) printf ("%s%s", i == 0 ? "" : " ", argv[i]);
1330 printf ("\">\n");
1331
1332 //-- Print the used contig information
1333 for ( beginCp = Contigs.begin( ); beginCp < Contigs.end( ); beginCp ++ )
1334 {
1335 if ( ! beginCp->isLinkHead )
1336 continue;
1337
1338 //-- A new Reference sequence
1339 Cp = beginCp;
1340
1341 //-- For all contigs mapping to this reference
1342 while ( Cp != Contigs.end( ) )
1343 {
1344 printf
1345 ("\t<CONTIG ID = \"contig_%s\" NAME = \"%s\" LEN = \"%ld\"/>\n",
1346 Cp->IdQ, Cp->IdQ, Cp->SeqLenQ);
1347
1348 //-- Walk the pointer down the list of contigs
1349 if ( Cp->linksTo == Contigs.end( ) || Cp->linksTo->isLinkHead )
1350 Cp = Contigs.end( );
1351 else
1352 Cp = Cp->linksTo;
1353 }
1354 }
1355 printf("\n");
1356
1357
1358 //-- Print the linking information
1359 for ( beginCp = Contigs.begin( ); beginCp < Contigs.end( ); beginCp ++ )
1360 {
1361 if ( ! beginCp->isLinkHead )
1362 continue;
1363
1364 //-- A new Reference sequence
1365 Cp = beginCp;
1366
1367 //-- For all contigs mapping to this reference
1368 while ( Cp != Contigs.end( ) )
1369 {
1370 if ( Cp->linksTo == Contigs.end( ) )
1371 break;
1372
1373 //-- Calculate the gap size between this and the next contig
1374 if ( Cp->linksTo->isLinkHead )
1375 gap = (Cp->SeqLenR + Cp->linksTo->StartR) - Cp->EndR - 1;
1376 else
1377 gap = Cp->linksTo->StartR - Cp->EndR - 1;
1378
1379 printf
1380 ("\t<LINK ID = \"link_%ld\" SIZE = \"%ld\" TYPE = \"MUMmer\">\n",
1381 ++ ct, gap);
1382 printf("\t\t<CONTIG ID = \"contig_%s\" ORI = \"%s\">\n",
1383 Cp->IdQ, Cp->DirQ == FORWARD_CHAR ? "BE" : "EB");
1384
1385 for ( Ap = Cp->Aligns.begin( );
1386 Ap < Cp->Aligns.end( ); Ap ++ )
1387 if ( ! Ap->isTiled )
1388 continue;
1389 else
1390 {
1391 printf("\t\t");
1392 printAlignment(Cp,Ap,stdout);
1393 }
1394
1395 printf("\t\t</CONTIG>\n");
1396 printf("\t\t<CONTIG ID = \"contig_%s\" ORI = \"%s\">\n",
1397 Cp->linksTo->IdQ,
1398 Cp->linksTo->DirQ == FORWARD_CHAR ? "BE" : "EB");
1399
1400 for ( Ap = Cp->linksTo->Aligns.begin( );
1401 Ap < Cp->linksTo->Aligns.end( ); Ap ++ )
1402 if ( ! Ap->isTiled )
1403 continue;
1404 else
1405 {
1406 printf("\t\t");
1407 printAlignment(Cp->linksTo,Ap,stdout);
1408 }
1409
1410 printf("\t\t</CONTIG>\n");
1411 printf("\t</LINK>\n");
1412
1413 //-- Walk the pointer down the list of contigs
1414 if ( Cp->linksTo == Contigs.end( ) || Cp->linksTo->isLinkHead )
1415 Cp = Contigs.end( );
1416 else
1417 Cp = Cp->linksTo;
1418 }
1419 }
1420
1421 printf ("</EVIDENCE>\n");
1422
1423 return;
1424 }
1425
1426
1427
1428
printHelp(const char * s)1429 void printHelp
1430 (const char * s)
1431
1432 // Display the program's help information to stderr.
1433
1434 {
1435 fprintf (stderr,
1436 "\nUSAGE: %s [options] <deltafile>\n\n", s);
1437 fprintf (stderr,
1438 "-a Describe the tiling path by printing the tab-delimited\n"
1439 " alignment region coordinates to stdout\n"
1440 "-c Assume the reference sequences are circular, and allow\n"
1441 " tiled contigs to span the origin\n"
1442 "-h Display help information\n"
1443 "-g int Set maximum gap between clustered alignments [-1, INT_MAX]\n"
1444 " A value of -1 will represent infinity\n"
1445 " (nucmer default = %ld)\n"
1446 " (promer default = %ld)\n"
1447 "-i float Set minimum percent identity to tile [0.0, 100.0]\n"
1448 " (nucmer default = %.1f)\n"
1449 " (promer default = %.1f)\n"
1450 "-l int Set minimum length contig to report [-1, INT_MAX]\n"
1451 " A value of -1 will represent infinity\n"
1452 " (common default = %ld)\n"
1453 "-p file Output a pseudo molecule of the query contigs to 'file'\n"
1454 "-R Deal with repetitive contigs by randomly placing them\n"
1455 " in one of their copy locations (implies -V 0)\n"
1456 "-t file Output a TIGR style contig list of each query sequence\n"
1457 " that sufficiently matches the reference (non-circular)\n"
1458 "-u file Output the tab-delimited alignment region coordinates\n"
1459 " of the unusable contigs to 'file'\n"
1460 "-v float Set minimum contig coverage to tile [0.0, 100.0]\n"
1461 " (nucmer default = %.1f) sum of individual alignments\n"
1462 " (promer default = %.1f) extent of syntenic region\n"
1463 "-V float Set minimum contig coverage difference [0.0, 100.0]\n"
1464 " i.e. the difference needed to determine one alignment\n"
1465 " is 'better' than another alignment\n"
1466 " (nucmer default = %.1f) sum of individual alignments\n"
1467 " (promer default = %.1f) extent of syntenic region\n"
1468 "-x Describe the tiling path by printing the XML contig\n"
1469 " linking information to stdout\n\n",
1470 DEFAULT_NUCMER_MAX_GAP_SIZE, DEFAULT_PROMER_MAX_GAP_SIZE,
1471 DEFAULT_NUCMER_MIN_PIDY, DEFAULT_PROMER_MIN_PIDY,
1472 DEFAULT_MIN_CONTIG_LENGTH,
1473 DEFAULT_NUCMER_MIN_COVERAGE, DEFAULT_PROMER_MIN_COVERAGE,
1474 DEFAULT_NUCMER_MIN_COVERAGE_DIFF, DEFAULT_PROMER_MIN_COVERAGE_DIFF);
1475 fprintf (stderr,
1476 " Input is the .delta output of the nucmer program, run on very\n"
1477 "similar sequence data, or the .delta output of the promer program,\n"
1478 "run on divergent sequence data.\n"
1479 " Output is to stdout, and consists of the predicted location of\n"
1480 "each aligning query contig as mapped to the reference sequences.\n"
1481 "These coordinates reference the extent of the entire query contig,\n"
1482 "even when only a certain percentage of the contig was actually\n"
1483 "aligned (unless the -a option is used). Columns are, start in ref,\n"
1484 "end in ref, distance to next contig, length of this contig, alignment\n"
1485 "coverage, identity, orientation, and ID respectively.\n\n");
1486 return;
1487 }
1488
1489
1490
1491
printUsage(const char * s)1492 void printUsage
1493 (const char * s)
1494
1495 // Display the program's usage information to stderr.
1496
1497 {
1498 fprintf (stderr,
1499 "\nUSAGE: %s [options] <deltafile>\n\n", s);
1500 fprintf (stderr, "Try '%s -h' for more information.\n", s);
1501 return;
1502 }
1503
1504
1505
1506
revC(long int Coord,long int Len)1507 inline long int revC
1508 (long int Coord, long int Len)
1509
1510 // Reverse complement the given coordinate for the given length.
1511
1512 {
1513 assert (Len - Coord + 1 > 0);
1514 return (Len - Coord + 1);
1515 }
1516
1517
1518
1519
tileContigs(vector<QueryContig> & Contigs)1520 void tileContigs
1521 (vector<QueryContig> & Contigs)
1522 {
1523 vector<QueryContig>::iterator Cp;
1524 vector<AlignStats>::iterator Ap, Aip, Atemp;
1525
1526 long int start, end, hang;
1527
1528 char * IdR = NULL;
1529 char * IdRhang = NULL;
1530
1531 float cov, covhang;
1532 float max_cov, maxx_cov;
1533 float idy;
1534 float max_covhang, maxx_covhang;
1535 float idyhang, tmpidy;
1536
1537 for ( Cp = Contigs.begin( ); Cp < Contigs.end( ); Cp ++ )
1538 {
1539 if ( Cp->TileLevel != UNUSED_TILE_LEVEL )
1540 continue;
1541
1542 max_cov = maxx_cov = idy = -(FLT_MAX);
1543 max_covhang = maxx_covhang = idyhang = -(FLT_MAX);
1544 Ap = Cp->Aligns.begin( );
1545 while ( Ap < Cp->Aligns.end( ) )
1546 {
1547 //-- For a single reference
1548 for ( Aip = Ap + 1; Aip < Cp->Aligns.end( ); Aip ++ )
1549 if ( strcmp (Ap->IdR, Aip->IdR) != 0 )
1550 break;
1551
1552 //-- Cluster the alignments
1553 longestConsistentSubset (Ap, Aip);
1554
1555 //-- Get the extents and overhang of the best subset
1556 hang = 0;
1557 end = start = -1;
1558 for ( Atemp = Ap; Atemp < Aip; Atemp ++ )
1559 if ( Atemp->isTiled )
1560 {
1561 if ( Atemp->sQ - Atemp->sR > 0 )
1562 hang += Atemp->sQ - Atemp->sR;
1563 start = Atemp->sR;
1564 break;
1565 }
1566 for ( Atemp = Aip - 1; Atemp >= Ap; Atemp -- )
1567 if ( Atemp->isTiled )
1568 {
1569 if ( Atemp->eR + (Cp->SeqLenQ - Atemp->eQ) > Atemp->SeqLenR )
1570 hang += (Atemp->eR + (Cp->SeqLenQ - Atemp->eQ)) -
1571 Atemp->SeqLenR;
1572 end = Atemp->eR;
1573 break;
1574 }
1575 assert ( end > 0 && start > 0 );
1576
1577 //-- Check the query coverage for mapping to this reference
1578 if ( DATA_TYPE == NUCMER_DATA )
1579 cov = getSubsetQryCoverage (Ap, Aip, Cp->SeqLenQ);
1580 else // PROMER_DATA
1581 cov = getSubsetQrySyntenyCoverage (Ap, Aip, Cp->SeqLenQ);
1582 if ( cov >= max_cov )
1583 {
1584 tmpidy = getSubsetIdentity (Ap, Aip);
1585 if ( cov > max_cov || (cov == max_cov && tmpidy > idy) )
1586 {
1587 IdR = Ap->IdR;
1588 maxx_cov = max_cov;
1589 max_cov = cov;
1590 idy = tmpidy;
1591 }
1592 }
1593 else if ( cov > maxx_cov )
1594 maxx_cov = cov;
1595
1596 if ( !isCircularReference )
1597 {
1598 //-- Check the query coverage for mapping to this reference
1599 if ( DATA_TYPE == NUCMER_DATA )
1600 covhang = getSubsetQryCoverage
1601 (Ap, Aip, Cp->SeqLenQ - hang);
1602 else // PROMER_DATA
1603 covhang = getSubsetQrySyntenyCoverage
1604 (Ap, Aip, Cp->SeqLenQ - hang);
1605 if ( covhang >= max_covhang )
1606 {
1607 tmpidy = getSubsetIdentity (Ap, Aip);
1608 if ( covhang > max_covhang || (covhang == max_covhang &&
1609 tmpidy > idyhang) )
1610 {
1611 IdRhang = Ap->IdR;
1612 maxx_covhang = max_covhang;
1613 max_covhang = covhang;
1614 idyhang = tmpidy;
1615 }
1616 }
1617 else if ( covhang > maxx_covhang )
1618 maxx_covhang = covhang;
1619 }
1620
1621 for ( ; Ap < Aip; Ap ++ )
1622 if ( !Ap->isTiled )
1623 {
1624 //-- Don't worry about the single if it overlaps
1625 if (start <= end && (Ap->sR <= end && Ap->eR >= start))
1626 continue;
1627 else if (start > end && (Ap->eR >= start || Ap->sR <= end))
1628 continue;
1629
1630 //-- Look for competing single alignments
1631 cov = getAlignmentQryCoverage (Ap, Cp->SeqLenQ);
1632 if ( cov > maxx_cov )
1633 maxx_cov = cov;
1634
1635 if ( !isCircularReference )
1636 {
1637 hang = 0;
1638 if ( Ap->sQ - Ap->sR > 0 )
1639 hang += Ap->sQ - Ap->sR;
1640 if ( Ap->eR + (Cp->SeqLenQ - Ap->eQ) >
1641 Ap->SeqLenR )
1642 hang += (Ap->eR + (Cp->SeqLenQ - Ap->eQ)) -
1643 Ap->SeqLenR;
1644
1645 covhang = getAlignmentQryCoverage (Ap, Cp->SeqLenQ - hang);
1646 if ( covhang > maxx_covhang )
1647 maxx_covhang = covhang;
1648 }
1649 }
1650 // Ap now equals Aip
1651 }
1652
1653 //-- If clustered coverage is...
1654 if ( max_cov - maxx_cov >= MIN_COVERAGE_DIFF &&
1655 max_cov >= MIN_COVERAGE && idy >= MIN_PIDY )
1656 {
1657 for ( Aip = Cp->Aligns.begin( ); Aip < Cp->Aligns.end( ); Aip ++ )
1658 if ( Aip->isTiled && strcmp (Aip->IdR, IdR) != 0 )
1659 Aip->isTiled = false;
1660
1661 //-- Tile the contig
1662 Cp->TileLevel = USED_TILE_LEVEL;
1663 placeContig ( Cp );
1664 }
1665 else if ( !isCircularReference &&
1666 max_covhang - maxx_covhang >= MIN_COVERAGE_DIFF &&
1667 max_covhang >= MIN_COVERAGE && idyhang >= MIN_PIDY )
1668 {
1669 for ( Aip = Cp->Aligns.begin( ); Aip < Cp->Aligns.end( ); Aip ++ )
1670 if ( Aip->isTiled && strcmp (Aip->IdR, IdRhang) != 0 )
1671 Aip->isTiled = false;
1672
1673 //-- Tile the contig
1674 Cp->TileLevel = USED_TILE_LEVEL;
1675 placeContig ( Cp );
1676 }
1677 else
1678 Cp->TileLevel = UNUSABLE_TILE_LEVEL;
1679 }
1680
1681 //-- Sort by reference mapping location
1682 sort ( Contigs.begin( ), Contigs.end( ), IdR_StartR_Sort( ) );
1683
1684 return;
1685 }
1686