1 //------------------------------------------------------------------------------
2 //   Programmer: Adam M Phillippy, The Institute for Genomic Research
3 //         File: show-tiling.cc
4 //         Date: 09 / 13 / 2002
5 //
6 //        Usage: show-tiling [options] <deltafile>
7 //               Try 'show-tiling -h' for more information
8 //
9 //  Description: For use in conjunction with the MUMmer package.
10 //              "show-tiling" predicts and displays a query to reference tiling
11 //             path. Note that this program is most effective if the query
12 //            sequences are small fragments of a large refence.
13 //
14 //------------------------------------------------------------------------------
15 
16 #include <mummer/delta.hh>
17 #include <mummer/tigrinc.hh>
18 #include <mummer/redirect_to_pager.hpp>
19 #include <vector>
20 #include <algorithm>
21 using namespace std;
22 
23 //------------------------------------------------------------- Constants ----//
24 const int CHARS_PER_LINE = 60;
25 
26 const long int DEFAULT_MIN_CONTIG_LENGTH        =     1;
27 
28 const long int DEFAULT_NUCMER_MAX_GAP_SIZE      =  1000;
29 const long int DEFAULT_PROMER_MAX_GAP_SIZE      =    -1;
30 
31 const float DEFAULT_NUCMER_MIN_COVERAGE         =  95.0;
32 const float DEFAULT_PROMER_MIN_COVERAGE         =  50.0;
33 
34 const float DEFAULT_NUCMER_MIN_COVERAGE_DIFF    =  10.0;
35 const float DEFAULT_PROMER_MIN_COVERAGE_DIFF    =  30.0;
36 
37 const float DEFAULT_NUCMER_MIN_PIDY             =  90.0;
38 const float DEFAULT_PROMER_MIN_PIDY             =  55.0;
39 
40 const char FORWARD_CHAR  =  '+';
41 const char REVERSE_CHAR  =  '-';
42 
43 const float REPEAT_PIDY_DIFF = 0.25;
44 
45 const int UNUSABLE_TILE_LEVEL  =  -2;
46 const int IGNORE_TILE_LEVEL    =  -1;
47 const int UNUSED_TILE_LEVEL    =   0;
48 const int USED_TILE_LEVEL      =   1;
49 
50 char NULL_STRING[1] = "";
51 
52 
53 
54 //------------------------------------------------------ Type Definitions ----//
55 struct AlignStats
56      //-- Alignment statistics data structure
57 {
58   float Idy;                            // percent identity   [0.0, 100.0]
59   float Sim;                            // percent similarity [0.0, 100.0]
60   float Stp;                            // precent stop codon [0.0, 100.0]
61 
62   char DirQ;                            // contig orientation (relative to ref)
63 
64   long int sQ, eQ, sR, eR;              // start and end in Query and Reference
65                                         // relative to the directional strand
66 
67   char * IdR;                           // FASTA Id of the mapping reference
68   long int SeqLenR;                     // length of the reference
69 
70   bool isTiled;                          // is the alignment be tiled?
71 };
72 
73 
74 
75 struct QueryContig
76      //-- Query sequence tiling contig data structure
77 {
78   char * IdQ;                              // FASTA Id of the query
79   long int SeqLenQ;                        // length of the query
80   char * SeqQ;                             // query sequence
81 
82   vector<AlignStats> Aligns;               // alignments for this contig
83 
84   int TileLevel;                      // describes tiling status of query contig
85 
86   //-- Things to be filled in once the contig is tiled
87   char DirQ;                               // orientation of the contig
88   char * IdR;                              // Id of the mapping reference
89   long int SeqLenR;                        // sequence length of the reference
90   long int StartR, EndR;                   // contig -> reference mapping coords
91 
92   long int LoTrim, HiTrim;                 // lo and hi trim lengths
93 
94   vector<QueryContig>::iterator linksTo;   // who does this contig link to?
95   bool isLinkHead;                         // is the head of the linked list?
96 };
97 
98 
99 
100 struct IdR_StartR_Sort
101 //-- For sorting QueryContigs by IdR, then StartR coordinates
102 {
operator ( )IdR_StartR_Sort103   bool operator( ) (const QueryContig & pA, const QueryContig & pB)
104   {
105     int cmp = strcmp (pA.IdR, pB.IdR);
106 
107     //-- sort IdR
108     if ( cmp < 0 )
109       return true;
110     else if ( cmp > 0 )
111       return false;
112 
113     //-- sort StartR
114     else if ( pA.StartR < pB.StartR )
115       return true;
116     else if ( pA.StartR > pB.StartR )
117       return false;
118 
119     //-- sort SeqLenQ
120     else if ( pA.SeqLenQ > pB.SeqLenQ )
121       return true;
122     else
123       return false;
124   }
125 };
126 
127 
128 
129 struct IdR_StartRTrimmed_Sort
130 //-- For sorting contig output by trim offset
131 {
operator ( )IdR_StartRTrimmed_Sort132   bool operator( ) (const QueryContig & pA, const QueryContig & pB)
133   {
134     int cmp = strcmp (pA.IdR, pB.IdR);
135 
136     //-- sort IdR
137     if ( cmp < 0 )
138       return true;
139     else if ( cmp > 0 )
140       return false;
141 
142     //-- sort StartR
143     else if ( pA.StartR + pA.LoTrim < pB.StartR + pA.LoTrim )
144       return true;
145     else
146       return false;
147   }
148 };
149 
150 
151 
152 struct IdQ_Sort
153 //-- For sorting QueryContigs by IdQ
154 {
operator ( )IdQ_Sort155   bool operator( ) (const QueryContig & pA, const QueryContig & pB)
156   {
157     //-- sort IdQ
158     if ( strcmp (pA.IdQ, pB.IdQ) < 0 )
159       return true;
160     else
161       return false;
162   }
163 };
164 
165 
166 
167 struct IdR_sQ_Sort
168 //-- For sorting AlignStats by IdR, then sQ
169 {
operator ( )IdR_sQ_Sort170   bool operator( ) (const AlignStats & pA, const AlignStats & pB)
171   {
172     int cmp = strcmp (pA.IdR, pB.IdR);
173 
174     //-- sort IdR
175     if ( cmp < 0 )
176       return true;
177     else if ( cmp > 0 )
178       return false;
179 
180     //-- sort sQ
181     else if ( pA.sQ < pB.sQ )
182       return true;
183     else
184       return false;
185   }
186 };
187 
188 
189 
190 
191 //------------------------------------------------------ Global Variables ----//
192 bool isOutputContigs = false;            // set by -t option
193 bool isOutputPseudoMolecule = false;     // set by -p option
194 bool isOutputUnusable = false;           // set by -u option
195 bool isPrintAlignments = false;          // set by -a option
196 bool isPrintXML = false;                 // set by -x option
197 bool isCircularReference = false;        // set by -c option
198 bool isRandomRepeats = false;            // set by -R option
199 
200 long int MIN_CONTIG_LENGTH = DEFAULT_MIN_CONTIG_LENGTH;
201 
202 long int MAX_GAP_SIZE    = 0;
203 float MIN_COVERAGE       = 0;
204 float MIN_COVERAGE_DIFF  = 0;
205 float MIN_PIDY           = 0;
206 
207 bool isdef_MAX_GAP_SIZE       = false;
208 bool isdef_MIN_COVERAGE       = false;
209 bool isdef_MIN_COVERAGE_DIFF  = false;
210 bool isdef_MIN_PIDY           = false;
211 
212 int   DATA_TYPE = NUCMER_DATA;         // set by .delta header
213 
214 char InputFileName [MAX_LINE];
215 char RefFileName [MAX_LINE], QryFileName [MAX_LINE];
216 
217 
218 
219 //------------------------------------------------- Function Declarations ----//
220 float getAlignmentQryCoverage
221      (vector<AlignStats>::iterator Ap, long int SeqLenQ);
222 
223 float getSubsetQryCoverage
224      (vector<AlignStats>::iterator begin,
225       vector<AlignStats>::iterator end, long int SeqLenQ);
226 
227 float getSubsetQrySyntenyCoverage
228      (vector<AlignStats>::iterator begin,
229       vector<AlignStats>::iterator end, long int SeqLenQ);
230 
231 float getSubsetIdentity
232      (vector<AlignStats>::iterator begin,
233       vector<AlignStats>::iterator end);
234 
235 void linkContigs
236      (vector<QueryContig> & Contigs);
237 
238 long int longestConsistentSubset
239      (vector<AlignStats>::iterator begin,
240       vector<AlignStats>::iterator end);
241 
242 void outputContigs
243      (vector<QueryContig> Contigs, FILE * Output);
244 
245 void outputPseudoMolecule
246      (vector<QueryContig> & Contigs, FILE * QryFile, FILE * Output);
247 
248 void outputUnusable
249      (vector<QueryContig> Contigs, FILE * Output);
250 
251 void parseDelta
252      (vector<QueryContig> & Contigs);
253 
254 void placeContig
255      (vector<QueryContig>::iterator Cp);
256 
257 void printAlignment
258      (vector<QueryContig>::iterator Cp, vector<AlignStats>::iterator Ap,
259       FILE * Output);
260 
261 void printTilingAlignments
262      (vector<QueryContig> & Contigs);
263 
264 void printTilingPath
265      (vector<QueryContig> & Contigs);
266 
267 void printTilingXML
268      (vector<QueryContig> & Contigs, char * QryFileName,
269       int argc, char ** argv);
270 
271 void printHelp
272      (const char * s);
273 
274 void printUsage
275      (const char * s);
276 
277 inline long int revC
278      (long int Coord, long int Len);
279 
280 void tileContigs
281      (vector<QueryContig> & Contigs);
282 
283 
284 
285 
286 //-------------------------------------------------- Function Definitions ----//
main(int argc,char ** argv)287 int main
288      (int argc, char ** argv)
289 {
290   char ContigsFileName [MAX_LINE];
291   char PseudoMoleculeFileName [MAX_LINE];
292   char UnusableFileName [MAX_LINE];
293 
294   FILE * ContigsFile = NULL;
295   FILE * PseudoMoleculeFile = NULL;
296   FILE * UnusableFile = NULL;
297   FILE * QryFile = NULL;
298 
299   vector<QueryContig> Contigs;
300 
301   //-- Parse the command line arguments
302   {
303     int ch, errflg = 0;
304     optarg = NULL;
305 
306     while ( !errflg  &&  ((ch = getopt
307 			   (argc, argv, "achg:i:l:p:Rt:u:v:V:x")) != EOF) )
308       switch (ch)
309         {
310 	case 'a' :
311 	  isPrintAlignments = true;
312 	  break;
313 
314 	case 'c' :
315 	  isCircularReference = true;
316 	  break;
317 
318         case 'h' :
319           printHelp (argv[0]);
320           exit (EXIT_SUCCESS);
321           break;
322 
323 	case 'g' :
324 	  MAX_GAP_SIZE = atoi (optarg);
325 	  isdef_MAX_GAP_SIZE = true;
326 	  break;
327 
328 	case 'i' :
329 	  MIN_PIDY = atof (optarg);
330 	  isdef_MIN_PIDY = true;
331 	  break;
332 
333 	case 'l' :
334 	  MIN_CONTIG_LENGTH = atoi (optarg);
335 	  break;
336 
337 	case 'p' :
338 	  strcpy ( PseudoMoleculeFileName, optarg );
339 	  isOutputPseudoMolecule = true;
340 	  break;
341 
342 	case 'R' :
343 	  isRandomRepeats = true;
344 	  break;
345 
346 	case 't' :
347 	  strcpy ( ContigsFileName, optarg );
348 	  isOutputContigs = true;
349 	  break;
350 
351 	case 'u' :
352 	  strcpy ( UnusableFileName, optarg );
353 	  isOutputUnusable = true;
354 	  break;
355 
356 	case 'v' :
357 	  MIN_COVERAGE = atof (optarg);
358 	  isdef_MIN_COVERAGE = true;
359 	  break;
360 
361 	case 'V' :
362 	  MIN_COVERAGE_DIFF = atof (optarg);
363 	  isdef_MIN_COVERAGE_DIFF = true;
364 	  break;
365 
366 	case 'x' :
367 	  isPrintXML = true;
368 	  break;
369 
370         default :
371           errflg ++;
372         }
373 
374     if ( errflg > 0  ||  argc - optind != 1 )
375       {
376         printUsage (argv[0]);
377         exit (EXIT_FAILURE);
378       }
379 
380     if ( MIN_PIDY < 0.0 || MIN_PIDY > 100.0 ||
381 	 MIN_COVERAGE < 0.0 || MIN_COVERAGE > 100.0 ||
382 	 MIN_COVERAGE_DIFF < 0.0 || MIN_COVERAGE_DIFF > 100.0 )
383       {
384 	fprintf(stderr,
385 		"\nERROR: Percentages must be within the range [0.0, 100.0]\n");
386 	exit (EXIT_FAILURE);
387       }
388 
389     if ( MIN_CONTIG_LENGTH < -1 || MAX_GAP_SIZE < -1 )
390       {
391 	fprintf(stderr,
392 		"\nERROR: Size values must be within the range [-1, oo]\n");
393 	exit (EXIT_FAILURE);
394       }
395 
396     if ( isRandomRepeats )
397       {
398 	srand ( time(NULL) );
399 	MIN_COVERAGE_DIFF = 0;
400 	isdef_MIN_COVERAGE_DIFF = true;
401       }
402   }
403 
404 
405   //-- Parse the delta file
406   strcpy (InputFileName, argv[optind ++]);
407   parseDelta (Contigs);
408 
409 
410   //-- Try and open all the output files
411   if ( isOutputContigs )
412     ContigsFile = File_Open ( ContigsFileName, "w" );
413   if ( isOutputPseudoMolecule )
414     PseudoMoleculeFile = File_Open ( PseudoMoleculeFileName, "w" );
415   if ( isOutputPseudoMolecule )
416     QryFile = File_Open ( QryFileName, "r" );
417   if ( isOutputUnusable )
418     UnusableFile = File_Open ( UnusableFileName, "w" );
419 
420 
421   //-- Find each contig's mapping and sort by reference
422   tileContigs (Contigs);
423 
424 
425   if ( isOutputContigs )
426     {
427       outputContigs (Contigs, ContigsFile);
428       fclose (ContigsFile);
429     }
430 
431 
432   //-- Print the unusable contigs w/ alignments to the user specified file
433   if ( isOutputUnusable )
434     {
435       outputUnusable (Contigs, UnusableFile);
436       fclose (UnusableFile);
437     }
438 
439 
440   //-- Link the contigs to form the tiling path
441   linkContigs (Contigs);
442 
443 
444   //-- Print the pseudo molecule to the user specifed file
445   if ( isOutputPseudoMolecule )
446     {
447       outputPseudoMolecule (Contigs, QryFile, PseudoMoleculeFile);
448       fclose (QryFile);
449       fclose (PseudoMoleculeFile);
450     }
451 
452 
453   //-- Print the tiling path to stdout
454   stdio_launch_pager redirect_to_pager;
455   if ( isPrintAlignments )
456     printTilingAlignments (Contigs);
457   else if ( isPrintXML )
458     printTilingXML (Contigs, QryFileName, argc, argv);
459   else
460     printTilingPath (Contigs);
461 
462 
463   return EXIT_SUCCESS;
464 }
465 
466 
467 
468 
getAlignmentQryCoverage(vector<AlignStats>::iterator Ap,long int SeqLenQ)469 float getAlignmentQryCoverage
470      (vector<AlignStats>::iterator Ap, long int SeqLenQ)
471 {
472   //-- Return query coverage for a single alignment
473   return (float)(Ap->eQ - Ap->sQ + 1) / (float)SeqLenQ * 100.0;
474 }
475 
476 
477 
478 
getSubsetQryCoverage(vector<AlignStats>::iterator begin,vector<AlignStats>::iterator end,long int SeqLenQ)479 float getSubsetQryCoverage
480      (vector<AlignStats>::iterator begin,
481       vector<AlignStats>::iterator end, long int SeqLenQ)
482 {
483   vector<AlignStats>::iterator Ap;
484   vector<AlignStats>::iterator preAp = end;
485 
486   long int cov = 0;
487   long int olap;
488 
489   //-- Get non-redundant query coverage for the subset of alignments
490   for ( Ap = begin; Ap < end; Ap ++ )
491     {
492       if ( ! Ap->isTiled )
493 	continue;
494 
495       if ( preAp == end )
496 	cov += Ap->eQ - Ap->sQ + 1;
497       else
498 	{
499 	  //-- Alignments must be pre-sorted by sQ
500 	  assert ( Ap->sQ >= preAp->sQ );
501 	  olap = preAp->eQ - Ap->sQ + 1;
502 	  olap = olap > 0 ? olap : 0;
503 	  cov += Ap->eQ - Ap->sQ + 1 - olap;
504 	}
505       preAp = Ap;
506     }
507 
508   return (float)(cov) / (float)SeqLenQ * 100.0;
509 }
510 
511 
512 
513 
getSubsetQrySyntenyCoverage(vector<AlignStats>::iterator begin,vector<AlignStats>::iterator end,long int SeqLenQ)514 float getSubsetQrySyntenyCoverage
515      (vector<AlignStats>::iterator begin,
516       vector<AlignStats>::iterator end, long int SeqLenQ)
517 {
518   vector<AlignStats>::iterator Ap;
519   long int cov = 0;
520   long int low, high;
521 
522   low = LONG_MAX;
523   high = -LONG_MAX;
524   for ( Ap = begin; Ap < end; Ap ++ )
525     if ( Ap->isTiled )
526       {
527 	if ( Ap->sQ < low )
528 	  low = Ap->sQ;
529 	if ( Ap->eQ > high )
530 	  high = Ap->eQ;
531       }
532   cov = high - low + 1;
533 
534   return (float)(cov) / (float)SeqLenQ * 100.0;
535 }
536 
537 
538 
539 
getSubsetIdentity(vector<AlignStats>::iterator begin,vector<AlignStats>::iterator end)540 float getSubsetIdentity
541      (vector<AlignStats>::iterator begin,
542       vector<AlignStats>::iterator end)
543 {
544   vector<AlignStats>::iterator Ap;
545 
546   long int len;
547   long int N = 0;
548   float tot = 0;
549 
550   //-- Get weighted (by length) average identity for the subset of alignments
551   for ( Ap = begin; Ap < end; Ap ++ )
552     {
553       if ( ! Ap->isTiled )
554 	continue;
555 
556       len = Ap->eQ - Ap->sQ + 1;
557       N += len;
558       tot += Ap->Idy * len;
559     }
560 
561   return tot / (float)N;
562 }
563 
564 
565 
566 
linkContigs(vector<QueryContig> & Contigs)567 void linkContigs
568      (vector<QueryContig> & Contigs)
569 {
570   vector<QueryContig>::iterator Cp;
571   vector<QueryContig>::iterator Cip;
572   vector<QueryContig>::iterator nxtCp;
573   vector<QueryContig>::iterator firstCp;
574   long int best_end;
575 
576   //-- Ignore shadowed contigs and generate the tiling path that uses
577   //   the minimum number of contigs while covering the maximum area
578   firstCp = Contigs.end( );
579   for ( Cp = Contigs.begin( ); Cp < Contigs.end( ); Cp ++ )
580     {
581       if ( Cp->TileLevel != USED_TILE_LEVEL )
582 	continue;
583 
584       if ( firstCp == Contigs.end( ) )
585 	{
586 	  firstCp = Cp;
587 	  firstCp->isLinkHead = true;
588 	}
589 
590       //-- If several contigs overlap Cp, pick the best one...
591       //   or if none overlap, pick the next used tile
592       nxtCp = Contigs.end( );
593       best_end = -(LONG_MAX);
594       for ( Cip = Cp + 1; Cip < Contigs.end( ); Cip ++ )
595 	{
596 	  if ( Cip->TileLevel != USED_TILE_LEVEL )
597 	    continue;
598 
599 	  if ( strcmp ( Cip->IdR, Cp->IdR ) != 0 )
600 	    break;
601 
602 	  //-- If no overlap
603 	  if ( Cip->StartR > Cp->EndR )
604 	    {
605 	      if ( nxtCp == Contigs.end( ) )
606 		nxtCp = Cip;
607 	      break;
608 	    }
609 
610 	  //-- If overlap
611 	  Cip->TileLevel = IGNORE_TILE_LEVEL;
612 	  if ( Cip->EndR > Cp->EndR  &&  Cip->EndR > best_end )
613 	    {
614 	      best_end = Cip->EndR;
615 	      nxtCp = Cip;
616 	    }
617 	}
618 
619 
620       //-- Link Cp to nxtCp
621       if ( nxtCp != Contigs.end( ) )
622 	{
623 	  //-- Link to the next valid tile
624 	  nxtCp->TileLevel = USED_TILE_LEVEL;
625 	  Cp->linksTo = nxtCp;
626 	}
627       else
628 	{
629 	  if ( isCircularReference )
630 	    {
631 	      if ( firstCp->StartR <= 0 )
632 		{
633 		  nxtCp = Cp;
634 		  for (; Cp > firstCp; Cp -- )
635 		    if ( Cp->TileLevel == USED_TILE_LEVEL )
636 		      {
637 			if ( firstCp->SeqLenR + firstCp->StartR <= Cp->EndR )
638 			  {
639 			    nxtCp = Cp;
640 			    Cp->TileLevel = IGNORE_TILE_LEVEL;
641 			  }
642 			else
643 			  break;
644 		      }
645 
646 		  if ( firstCp->SeqLenR + firstCp->StartR > nxtCp->StartR )
647 		    {
648 		      Cp = nxtCp;
649 		      Cp->TileLevel = USED_TILE_LEVEL;
650 		    }
651 		}
652 
653 	      //-- Circular link (note: this creates a circular linked list)
654 	      Cp->linksTo = firstCp;
655 	    }
656 	  else
657 	    Cp->linksTo = Contigs.end( );
658 
659 	  firstCp = Contigs.end( );
660 	}
661     }
662 
663   return;
664 }
665 
666 
667 
668 
longestConsistentSubset(vector<AlignStats>::iterator begin,vector<AlignStats>::iterator end)669 long int longestConsistentSubset
670      (vector<AlignStats>::iterator begin, vector<AlignStats>::iterator end)
671 {
672   //-- For dynamic programming scoring and backtracking
673   struct node
674   {
675     vector<AlignStats>::iterator Ap;
676     long int Score;
677     long int From;
678   };
679 
680   vector<AlignStats>::iterator Ap;
681   long int i, j, N, OlapR, OlapQ, Olap, Best;
682   node * A;
683 
684   //-- Build the data array
685   N = 0;
686   A = (node *) Safe_malloc (sizeof(node) * (end - begin));
687   for ( Ap = begin; Ap < end; Ap ++ )
688     {
689       assert ( Ap->isTiled == false );
690       A[N].Ap = Ap;
691       A[N].Score = Ap->eQ - Ap->sQ + 1;
692       A[N].From = -1;
693       N ++;
694     }
695   assert ( N == (end - begin) );
696 
697   //-- Isn't it dynamic?
698   for ( i = 0; i < N; i ++ )
699     for ( j = 0; j < i; j ++ )
700       {
701 	assert ( strcmp (A[i].Ap->IdR, A[j].Ap->IdR) == 0 );
702 	if ( A[i].Ap->DirQ != A[j].Ap->DirQ )
703 	  continue;
704 
705 	OlapR = A[j].Ap->eR - A[i].Ap->sR + 1;
706 	if ( isCircularReference  &&  OlapR > A[j].Ap->SeqLenR / 2 )
707 	  OlapR = (A[j].Ap->eR - A[j].Ap->SeqLenR) + (1 - A[i].Ap->sR);
708 
709 	if ( MAX_GAP_SIZE >= 0  &&  OlapR < -(MAX_GAP_SIZE) )
710 	  continue;
711 	Olap = OlapR < 0 ? 0 : OlapR;
712 	OlapQ = A[j].Ap->eQ - A[i].Ap->sQ + 1;
713 	if ( MAX_GAP_SIZE >= 0  &&  OlapQ < -(MAX_GAP_SIZE) )
714 	  continue;
715 	Olap = Olap > OlapQ ? Olap : OlapQ;
716 
717 	if ( A[j].Score + (A[i].Ap->eQ - A[i].Ap->sQ + 1) - Olap > A[i].Score )
718 	  {
719 	    A[i].From = j;
720 	    A[i].Score = A[j].Score + (A[i].Ap->eQ - A[i].Ap->sQ + 1) - Olap;
721 	  }
722       }
723 
724   //-- Find the highest score and backtrack to extract the subset
725   Best = 0;
726   for ( i = 1; i < N; i ++ )
727     {
728       if ( A[i].Score > A[Best].Score )
729 	Best = i;
730       else if ( A[i].Score == A[Best].Score &&
731 		A[i].Ap->Idy > A[Best].Ap->Idy )
732 	Best = i;
733     }
734 
735   if ( isRandomRepeats )
736     {
737       vector<long int> ties;
738       for ( i = 0; i < N; i ++ )
739 	if ( A[i].Score == A[Best].Score &&
740 	     A[i].Ap->Idy >= A[Best].Ap->Idy - REPEAT_PIDY_DIFF )
741 	  ties . push_back (i);
742 
743       if ( ties.size( ) > 1 )
744 	Best = ties
745 	  [(long int)((double)ties.size( ) * rand( ) / (RAND_MAX + 1.0))];
746     }
747 
748   for ( i = Best; i >= 0; i = A[i].From )
749     A[i].Ap->isTiled = true;
750 
751   Best = A[Best].Score;
752 
753   free ( A );
754 
755   return Best;
756 }
757 
758 
759 
760 
outputContigs(vector<QueryContig> Contigs,FILE * Output)761 void outputContigs
762      (vector<QueryContig> Contigs, FILE * Output)
763 {
764   vector<QueryContig>::iterator Cp;
765   vector<QueryContig>::iterator endCp;
766 
767   //-- Sort by reference trim offset
768   sort ( Contigs.begin( ), Contigs.end( ), IdR_StartRTrimmed_Sort( ) );
769 
770   long int seqs;
771 
772   Cp = Contigs.begin( );
773   while ( Cp < Contigs.end( ) )
774     {
775       if ( Cp->TileLevel != USED_TILE_LEVEL )
776 	{
777 	  Cp ++;
778 	  continue;
779 	}
780 
781       seqs = 0;
782       for ( endCp = Cp; endCp < Contigs.end( ); endCp ++ )
783 	{
784 	  if ( endCp->TileLevel != USED_TILE_LEVEL )
785 	    continue;
786 
787 	  if ( strcmp (Cp->IdR, endCp->IdR) != 0 )
788 	    break;
789 	  seqs ++;
790 	}
791 
792       fprintf(Output,
793       "##%s %ld %ld bases, 00000000 checksum.\n",
794 	      Cp->IdR, seqs, Cp->SeqLenR);
795 
796       for ( ; Cp < endCp; Cp ++ )
797 	{
798 	  if ( Cp->TileLevel != USED_TILE_LEVEL )
799 	    continue;
800 
801 	  if ( Cp->DirQ == FORWARD_CHAR )
802 	    fprintf(Output,
803             "#%s(%ld) [%s] %ld bases, 00000000 checksum. {%ld %ld} <%ld %ld>\n",
804 		    Cp->IdQ, Cp->StartR + Cp->LoTrim - 1, "", Cp->SeqLenQ,
805 		    Cp->LoTrim + 1, Cp->SeqLenQ - Cp->HiTrim,
806 		    Cp->StartR + Cp->LoTrim, Cp->EndR - Cp->HiTrim);
807 	  else
808 	    fprintf(Output,
809             "#%s(%ld) [%s] %ld bases, 00000000 checksum. {%ld %ld} <%ld %ld>\n",
810 		    Cp->IdQ, Cp->StartR + Cp->LoTrim - 1, "RC", Cp->SeqLenQ,
811 		    Cp->SeqLenQ - Cp->LoTrim, Cp->HiTrim + 1,
812 		    Cp->StartR + Cp->LoTrim, Cp->EndR - Cp->HiTrim);
813 	}
814     }
815 
816   return;
817 }
818 
819 
820 
821 
outputPseudoMolecule(vector<QueryContig> & Contigs,FILE * QryFile,FILE * Output)822 void outputPseudoMolecule
823      (vector<QueryContig> & Contigs, FILE * QryFile, FILE * Output)
824 {
825   vector<QueryContig>::iterator Cp;
826   vector<QueryContig>::iterator beginCp;
827 
828   vector<AlignStats>::iterator Apb;
829   vector<AlignStats>::reverse_iterator Ape;
830 
831   long int ct = 0;
832   long int start, end, gap, i, endR, startR;
833 
834   char * A;
835   long int InitSize = INIT_SIZE;
836   char Line [MAX_LINE];
837 
838 
839   //-- Read in the needed query contig sequences
840   A = (char *) Safe_malloc ( sizeof(char) * InitSize );
841   while ( Read_String (QryFile, A, InitSize, Line, false) )
842     {
843       for ( Cp = Contigs.begin( ); Cp < Contigs.end( ); Cp ++ )
844 	if ( Cp->TileLevel == USED_TILE_LEVEL )
845 	  if ( strcmp ( Line, Cp->IdQ ) == 0 )
846 	    break;
847 
848       if ( Cp < Contigs.end( ) )
849 	{
850 	  assert ( (long int)strlen(A+1) == Cp->SeqLenQ );
851 	  Cp->SeqQ = (char *) Safe_malloc
852 	    ( sizeof(char) * (Cp->SeqLenQ + 2) );
853 	  Cp->SeqQ[0] = '\0';
854 	  strcpy ( Cp->SeqQ + 1, A + 1 );
855 	  if ( Cp->DirQ == REVERSE_CHAR )
856 	    Reverse_Complement (Cp->SeqQ, 1, Cp->SeqLenQ);
857 	}
858     }
859   free ( A );
860 
861   //-- For all contigs, create pseudo
862   for ( beginCp = Contigs.begin( ); beginCp < Contigs.end( ); beginCp ++ )
863     {
864       if ( ! beginCp->isLinkHead )
865 	continue;
866 
867       //-- New Reference sequence
868       start = 1;
869       Cp = beginCp;
870       fprintf (Output, ">pseudo_used_%s\n", Cp->IdR);
871 
872       //-- For all contigs mapping to this Reference
873       while ( Cp != Contigs.end( ) )
874 	{
875 	  gap = 0;
876 	  end = Cp->SeqLenQ;
877 	  if ( Cp->linksTo != Contigs.end( ) )
878 	    {
879 	      if ( ! Cp->linksTo->isLinkHead )
880 		{
881 		  //-- Internal sequential link, gap N's needed if necessary
882 		  startR = Cp->linksTo->StartR;
883 		  endR = Cp->EndR;
884 
885 		  gap = startR - endR - 1;
886 		  if ( gap < 0 )
887 		    {
888 		      for ( Apb = Cp->linksTo->Aligns.begin( );
889 			    Apb < Cp->linksTo->Aligns.end( )  &&  !Apb->isTiled;
890 			    Apb ++ )
891 			{}
892 		      assert ( Apb < Cp->linksTo->Aligns.end( ) );
893 
894 		      for ( Ape = Cp->Aligns.rbegin( );
895 			    Ape < Cp->Aligns.rend( )  &&  !Ape->isTiled;
896 			    Ape ++ )
897 			{}
898 		      assert ( Ape < Cp->Aligns.rend( ) );
899 
900 		      //-- Use the contig with the least 'junk' sequence
901 		      if ( Cp->SeqLenQ - Ape->eQ > Apb->sQ - 1  ||
902 			   ( Cp->SeqLenQ - Ape->eQ == Apb->sQ - 1  &&
903 			     getSubsetIdentity (Cp->Aligns.begin( ),
904 						Cp->Aligns.end( ))  <
905 			     getSubsetIdentity (Cp->linksTo->Aligns.begin( ),
906 						Cp->linksTo->Aligns.end( )) ) )
907 			end += gap;
908 		    }
909 		}
910 	    }
911 
912 	  if ( Cp->SeqQ == NULL )
913 	    {
914 	      fprintf (stderr,
915 		 "\nERROR: Sequence \"%s\" was not found in the query file.\n"
916 		 "       Please check the validity of the query file listed\n"
917 		 "       at the top of the .delta input file and rerun.\n",
918 		       Cp->IdQ);
919 	      exit (EXIT_FAILURE);
920 	    }
921 
922 	  //-- Print the sequence
923 	  for ( i = start; i <= end; i ++ )
924 	    {
925 	      fputc (toupper(Cp->SeqQ[i]), Output);
926 	      if ( ++ ct == CHARS_PER_LINE )
927 		{
928 		  ct = 0;
929 		  fputc ('\n', Output);
930 		}
931 	    }
932 	  free (Cp->SeqQ);
933 
934 	  //-- Print the gap
935 	  for ( i = 1; i <= gap; i ++ )
936 	    {
937 	      fputc ('N', Output);
938 	      if ( ++ ct == CHARS_PER_LINE )
939 		{
940 		  ct = 0;
941 		  fputc ('\n', Output);
942 		}
943 	    }
944 
945 	  //-- Adjust the next start if this contig was used for overlap
946 	  if ( gap < 0  &&  end == Cp->SeqLenQ )
947 	    start = 1 + -(gap);
948 
949 	  //-- Walk the pointer down the list of contigs
950 	  if ( Cp->linksTo == Contigs.end( )  ||  Cp->linksTo->isLinkHead )
951 	    Cp = Contigs.end( );
952 	  else
953 	    Cp = Cp->linksTo;
954 	}
955 
956       if ( ct != 0 )
957 	fputc ('\n', Output);
958       ct = 0;
959     }
960 
961   return;
962 }
963 
964 
965 
966 
outputUnusable(vector<QueryContig> Contigs,FILE * Output)967 void outputUnusable
968      (vector<QueryContig> Contigs, FILE * Output)
969 {
970   vector<QueryContig>::iterator Cp;
971   vector<AlignStats>::iterator Ap;
972 
973   sort ( Contigs.begin( ), Contigs.end( ), IdQ_Sort( ) );
974 
975   for ( Cp = Contigs.begin( ); Cp < Contigs.end( ); Cp ++ )
976     {
977       if ( Cp->TileLevel != UNUSABLE_TILE_LEVEL  &&
978 	   Cp->TileLevel != UNUSED_TILE_LEVEL )
979 	continue;
980 
981       for ( Ap = Cp->Aligns.begin( ); Ap < Cp->Aligns.end( ); Ap ++ )
982 	printAlignment (Cp, Ap, Output);
983     }
984 
985   return;
986 }
987 
988 
989 
990 
parseDelta(vector<QueryContig> & Contigs)991 void parseDelta
992      (vector<QueryContig> & Contigs)
993 
994      //  parse the delta file
995 
996 {
997   vector<QueryContig>::iterator Cp;
998 
999   char * CurrIdQ;                        // the current contig Id
1000   long int temp;
1001 
1002   QueryContig aContig;                   //  single query contig
1003   AlignStats aStats;                     //  single alignment region
1004 
1005   DeltaReader_t dr;
1006   dr.open (InputFileName);
1007   DATA_TYPE = dr.getDataType( ) == NUCMER_STRING ?
1008     NUCMER_DATA : PROMER_DATA;
1009   strcpy (RefFileName, dr.getReferencePath( ).c_str( ));
1010   strcpy (QryFileName, dr.getQueryPath( ).c_str( ));
1011 
1012 
1013   Contigs.clear( );
1014   CurrIdQ = NULL_STRING;
1015   aContig.SeqQ = NULL;
1016   aContig.DirQ = '*';
1017   aContig.IdR = NULL_STRING;
1018   aContig.SeqLenR = 0;
1019   aContig.Aligns.clear( );
1020   aContig.StartR = aContig.EndR = 0;
1021   aContig.LoTrim = aContig.HiTrim = 0;
1022   aContig.isLinkHead = false;
1023 
1024 
1025   if ( DATA_TYPE == NUCMER_DATA )
1026     {
1027       if ( !isdef_MAX_GAP_SIZE )
1028 	MAX_GAP_SIZE = DEFAULT_NUCMER_MAX_GAP_SIZE;
1029       if ( !isdef_MIN_COVERAGE )
1030 	MIN_COVERAGE = DEFAULT_NUCMER_MIN_COVERAGE;
1031       if ( !isdef_MIN_COVERAGE_DIFF )
1032 	MIN_COVERAGE_DIFF = DEFAULT_NUCMER_MIN_COVERAGE_DIFF;
1033       if ( !isdef_MIN_PIDY )
1034 	MIN_PIDY = DEFAULT_NUCMER_MIN_PIDY;
1035     }
1036   else
1037     {
1038       if ( !isdef_MAX_GAP_SIZE )
1039 	MAX_GAP_SIZE = DEFAULT_PROMER_MAX_GAP_SIZE;
1040       if ( !isdef_MIN_COVERAGE )
1041 	MIN_COVERAGE = DEFAULT_PROMER_MIN_COVERAGE;
1042       if ( !isdef_MIN_COVERAGE_DIFF )
1043 	MIN_COVERAGE_DIFF = DEFAULT_PROMER_MIN_COVERAGE_DIFF;
1044       if ( !isdef_MIN_PIDY )
1045 	MIN_PIDY = DEFAULT_PROMER_MIN_PIDY;
1046     }
1047 
1048 
1049   //-- Process the delta input file
1050   while ( dr.readNext( ) )
1051     {
1052       aStats.SeqLenR = dr.getRecord( ).lenR;
1053       aContig.SeqLenQ = dr.getRecord( ).lenQ;
1054 
1055       aStats.IdR = (char *) Safe_malloc (dr.getRecord( ).idR.length( ) + 1);
1056       aContig.IdQ = (char *) Safe_malloc (dr.getRecord( ).idQ.length( ) + 1);
1057       strcpy (aStats.IdR, dr.getRecord( ).idR.c_str( ));
1058       strcpy (aContig.IdQ, dr.getRecord( ).idQ.c_str( ));
1059 
1060       if ( strcmp (CurrIdQ, aContig.IdQ) )
1061 	{
1062 	  CurrIdQ = aContig.IdQ;
1063 	  aContig.TileLevel = aContig.SeqLenQ < MIN_CONTIG_LENGTH ?
1064 	    IGNORE_TILE_LEVEL : UNUSED_TILE_LEVEL;
1065 	  Contigs.push_back (aContig);
1066 	}
1067 
1068       for ( unsigned int i = 0; i < dr.getRecord( ).aligns.size( ); i ++ )
1069 	{
1070 	  aStats.sR = dr.getRecord( ).aligns[i].sR;
1071 	  aStats.eR = dr.getRecord( ).aligns[i].eR;
1072 	  aStats.sQ = dr.getRecord( ).aligns[i].sQ;
1073 	  aStats.eQ = dr.getRecord( ).aligns[i].eQ;
1074 
1075           //-- Check match orientation
1076 	  if ( (aStats.sR <= aStats.eR  &&  aStats.sQ <= aStats.eQ) ||
1077 	       (aStats.sR > aStats.eR  &&  aStats.sQ > aStats.eQ) )
1078 	    aStats.DirQ = FORWARD_CHAR;
1079           else
1080 	    aStats.DirQ = REVERSE_CHAR;
1081 
1082 	  //-- Force ascending coordinates
1083 	  if ( aStats.sR > aStats.eR )
1084 	    {
1085 	      temp = aStats.sR;
1086 	      aStats.sR = aStats.eR;
1087 	      aStats.eR = temp;
1088 	    }
1089 	  if ( aStats.sQ > aStats.eQ )
1090 	    {
1091 	      temp = aStats.sQ;
1092 	      aStats.sQ = aStats.eQ;
1093 	      aStats.eQ = temp;
1094 	    }
1095 
1096 	  //-- If flipped orientation, reverse query coordinates
1097 	  if ( aStats.DirQ == REVERSE_CHAR )
1098 	    {
1099 	      temp = aStats.sQ;
1100 	      aStats.sQ = revC (aStats.eQ, aContig.SeqLenQ);
1101 	      aStats.eQ = revC (temp, aContig.SeqLenQ);
1102 	    }
1103 
1104 	  //-- Set the statistics for this alignment region
1105 	  aStats.Idy = dr.getRecord( ).aligns[i].idy;
1106 	  aStats.Sim = dr.getRecord( ).aligns[i].sim;
1107 	  aStats.Stp = dr.getRecord( ).aligns[i].stp;
1108 	  aStats.isTiled = false;
1109 
1110 	  //-- Add the alignment region
1111 	  Contigs.rbegin( )->Aligns.push_back (aStats);
1112         }
1113     }
1114   dr.close( );
1115 
1116   for ( Cp = Contigs.begin( ); Cp < Contigs.end( ); Cp ++ )
1117     sort ( Cp->Aligns.begin( ), Cp->Aligns.end( ), IdR_sQ_Sort( ) );
1118 
1119   return;
1120 }
1121 
1122 
1123 
1124 
placeContig(vector<QueryContig>::iterator Cp)1125 void placeContig
1126      (vector<QueryContig>::iterator Cp)
1127 {
1128   vector<AlignStats>::iterator Aip, Ap;
1129   float max_cov, cov;
1130   long int start;
1131 
1132   assert ( Cp->TileLevel == USED_TILE_LEVEL );
1133 
1134   //-- Find the 'representative' alignment
1135   Ap = Cp->Aligns.end( );
1136   max_cov = -(FLT_MAX);
1137   for ( Aip = Cp->Aligns.begin( ); Aip < Cp->Aligns.end( ); Aip ++ )
1138     if ( Aip->isTiled )
1139       {
1140 	//-- Set trim values
1141 	if ( Ap == Cp->Aligns.end( ) )
1142 	  Cp->LoTrim = Aip->sQ - 1;
1143 	Cp->HiTrim = Cp->SeqLenQ - Aip->eQ;
1144 
1145 	cov = getAlignmentQryCoverage (Aip, Cp->SeqLenQ);
1146 	if ( cov > max_cov )
1147 	  {
1148 	    max_cov = cov;
1149 	    Ap = Aip;
1150 	  }
1151       }
1152 
1153   //-- Set mapping reference data
1154   assert ( Ap != Cp->Aligns.end( ) );
1155   Cp->DirQ = Ap->DirQ;
1156   Cp->IdR = Ap->IdR;
1157   Cp->SeqLenR = Ap->SeqLenR;
1158 
1159   //-- Position the contig
1160   start = Ap->sQ;
1161   Cp->StartR = Ap->sR - (start - 1);
1162   Cp->EndR = Cp->StartR + Cp->SeqLenQ - 1;
1163 
1164   //-- Force negative offset if circular, else just let it overhang
1165   if ( isCircularReference  &&  Cp->EndR > Cp->SeqLenR )
1166     {
1167       Cp->StartR = Cp->StartR - Cp->SeqLenR;
1168       Cp->EndR = Cp->EndR - Cp->SeqLenR;
1169     }
1170   assert ( Cp->StartR <= Cp->EndR );
1171   assert ( Cp->LoTrim >= 0  &&  Cp->HiTrim >= 0 );
1172 
1173   return;
1174 }
1175 
1176 
1177 
1178 
printAlignment(vector<QueryContig>::iterator Cp,vector<AlignStats>::iterator Ap,FILE * Output)1179 void printAlignment
1180      (vector<QueryContig>::iterator Cp, vector<AlignStats>::iterator Ap,
1181       FILE * Output)
1182 {
1183   long int len1, len2;
1184   float covA, covB;
1185 
1186   len1 = Ap->eR - Ap->sR + 1;
1187   len2 = Ap->eQ - Ap->sQ + 1;
1188   covA = (float)len1 / (float)Ap->SeqLenR * 100.0;
1189   covB = getAlignmentQryCoverage ( Ap, Cp->SeqLenQ );
1190 
1191   //-- Output the statistics for this alignment region
1192   fprintf(Output,"%ld\t%ld\t", Ap->sR, Ap->eR);
1193   if ( Ap->DirQ == FORWARD_CHAR )
1194     fprintf(Output,"%ld\t%ld\t", Ap->sQ, Ap->eQ);
1195   else
1196     fprintf(Output,"%ld\t%ld\t", revC(Ap->sQ, Cp->SeqLenQ),
1197 	                         revC(Ap->eQ, Cp->SeqLenQ));
1198   fprintf(Output,"%ld\t%ld\t", len1, len2);
1199   fprintf(Output,"%.2f\t", Ap->Idy);
1200   fprintf(Output,"%ld\t%ld\t", Ap->SeqLenR, Cp->SeqLenQ);
1201   fprintf(Output,"%.2f\t%.2f\t", covA, covB);
1202   fprintf(Output,"%s\t%s", Ap->IdR, Cp->IdQ);
1203   fprintf(Output,"\n");
1204 
1205   return;
1206 }
1207 
1208 
1209 
1210 
printTilingAlignments(vector<QueryContig> & Contigs)1211 void printTilingAlignments
1212      (vector<QueryContig> & Contigs)
1213 {
1214   vector<QueryContig>::iterator beginCp;
1215   vector<QueryContig>::iterator Cp;
1216   vector<AlignStats>::iterator Ap;
1217 
1218   for ( beginCp = Contigs.begin( ); beginCp < Contigs.end( ); beginCp ++ )
1219     {
1220       if ( ! beginCp->isLinkHead )
1221 	continue;
1222 
1223       //-- A new Reference sequence
1224       Cp = beginCp;
1225 
1226       //-- For all contigs mapping to this reference
1227       while ( Cp != Contigs.end( ) )
1228 	{
1229 	  for ( Ap = Cp->Aligns.begin( ); Ap < Cp->Aligns.end( ); Ap ++ )
1230 	    if ( ! Ap->isTiled )
1231 	      continue;
1232 	    else
1233 	      printAlignment(Cp,Ap,stdout);
1234 
1235 	  //-- Walk the pointer down the list of contigs
1236 	  if ( Cp->linksTo == Contigs.end( )  ||  Cp->linksTo->isLinkHead )
1237 	    Cp = Contigs.end( );
1238 	  else
1239 	    Cp = Cp->linksTo;
1240 	}
1241     }
1242 
1243   return;
1244 }
1245 
1246 
1247 
1248 
printTilingPath(vector<QueryContig> & Contigs)1249 void printTilingPath
1250      (vector<QueryContig> & Contigs)
1251 {
1252   vector<QueryContig>::iterator beginCp;
1253   vector<QueryContig>::iterator Cp;
1254 
1255   long int len, gap;
1256   float pcov, pidy;
1257 
1258 
1259   for ( beginCp = Contigs.begin( ); beginCp < Contigs.end( ); beginCp ++ )
1260     {
1261       if ( ! beginCp->isLinkHead )
1262 	continue;
1263 
1264       //-- A new Reference sequence
1265       Cp = beginCp;
1266       printf (">%s %ld bases\n", Cp->IdR, Cp->SeqLenR);
1267 
1268       //-- For all contigs mapping to this reference
1269       while ( Cp != Contigs.end( ) )
1270 	{
1271 	  len = Cp->SeqLenQ;
1272 	  pcov = getSubsetQryCoverage
1273 	    ( Cp->Aligns.begin( ), Cp->Aligns.end( ), Cp->SeqLenQ );
1274 	  pidy = getSubsetIdentity
1275 	    ( Cp->Aligns.begin( ), Cp->Aligns.end( ) );
1276 
1277 	  //-- Calculate the gap size between this and the next contig
1278 	  if ( Cp->linksTo == Contigs.end( ) )
1279 	    gap = Cp->SeqLenR - Cp->EndR;
1280 	  else
1281 	    {
1282 	      if ( Cp->linksTo->isLinkHead )
1283 		gap = (Cp->SeqLenR + Cp->linksTo->StartR) - Cp->EndR - 1;
1284 	      else
1285 		gap = Cp->linksTo->StartR - Cp->EndR - 1;
1286 	    }
1287 
1288 	  //-- Print the data
1289 	  printf ("%ld\t%ld\t%ld\t%ld\t%.2f\t%.2f\t%c\t%s\n",
1290 		  Cp->StartR, Cp->EndR, gap, len, pcov,
1291 		  pidy, Cp->DirQ, Cp->IdQ);
1292 
1293 	  //-- Walk the pointer down the list of contigs
1294 	  if ( Cp->linksTo == Contigs.end( )  ||  Cp->linksTo->isLinkHead )
1295 	    Cp = Contigs.end( );
1296 	  else
1297 	    Cp = Cp->linksTo;
1298 	}
1299     }
1300 
1301   return;
1302 }
1303 
1304 
1305 
1306 
printTilingXML(vector<QueryContig> & Contigs,char * QryFileName,int argc,char ** argv)1307 void printTilingXML
1308      (vector<QueryContig> & Contigs, char * QryFileName, int argc, char ** argv)
1309 {
1310   vector<AlignStats>::iterator Ap;
1311   vector<QueryContig>::iterator Cp;
1312   vector<QueryContig>::iterator beginCp;
1313 
1314   int i;
1315   long int gap;
1316   long int ct = 0;
1317 
1318   time_t tt = time(NULL);
1319   char * time_str;
1320   time_str = ctime(&tt);
1321   time_str[strlen(time_str)-1] = '\0';
1322 
1323   printf ("<?xml version = \"1.0\" ?>\n");
1324 
1325   printf ("<EVIDENCE ID = \"project_1\"\n");
1326   printf ("        DATE = \"%s\"\n", time_str);
1327   printf ("     PROJECT = \"%s\"\n", QryFileName);
1328   printf ("  PARAMETERS = \"");
1329   for ( i = 0; i < argc; i ++ ) printf ("%s%s", i == 0 ? "" : " ", argv[i]);
1330   printf ("\">\n");
1331 
1332   //-- Print the used contig information
1333   for ( beginCp = Contigs.begin( ); beginCp < Contigs.end( ); beginCp ++ )
1334     {
1335       if ( ! beginCp->isLinkHead )
1336 	continue;
1337 
1338       //-- A new Reference sequence
1339       Cp = beginCp;
1340 
1341       //-- For all contigs mapping to this reference
1342       while ( Cp != Contigs.end( ) )
1343 	{
1344 	  printf
1345 	    ("\t<CONTIG ID = \"contig_%s\" NAME = \"%s\" LEN = \"%ld\"/>\n",
1346 	     Cp->IdQ, Cp->IdQ, Cp->SeqLenQ);
1347 
1348 	  //-- Walk the pointer down the list of contigs
1349 	  if ( Cp->linksTo == Contigs.end( )  ||  Cp->linksTo->isLinkHead )
1350 	    Cp = Contigs.end( );
1351 	  else
1352 	    Cp = Cp->linksTo;
1353 	}
1354     }
1355   printf("\n");
1356 
1357 
1358   //-- Print the linking information
1359   for ( beginCp = Contigs.begin( ); beginCp < Contigs.end( ); beginCp ++ )
1360     {
1361       if ( ! beginCp->isLinkHead )
1362 	continue;
1363 
1364       //-- A new Reference sequence
1365       Cp = beginCp;
1366 
1367       //-- For all contigs mapping to this reference
1368       while ( Cp != Contigs.end( ) )
1369 	{
1370 	  if ( Cp->linksTo == Contigs.end( ) )
1371 	    break;
1372 
1373 	  //-- Calculate the gap size between this and the next contig
1374 	  if ( Cp->linksTo->isLinkHead )
1375 	    gap = (Cp->SeqLenR + Cp->linksTo->StartR) - Cp->EndR - 1;
1376 	  else
1377 	    gap = Cp->linksTo->StartR - Cp->EndR - 1;
1378 
1379 	  printf
1380 	    ("\t<LINK ID = \"link_%ld\" SIZE = \"%ld\" TYPE = \"MUMmer\">\n",
1381 	     ++ ct, gap);
1382 	  printf("\t\t<CONTIG ID = \"contig_%s\" ORI = \"%s\">\n",
1383 		 Cp->IdQ, Cp->DirQ == FORWARD_CHAR ? "BE" : "EB");
1384 
1385 	  for ( Ap = Cp->Aligns.begin( );
1386 		Ap < Cp->Aligns.end( ); Ap ++ )
1387 	    if ( ! Ap->isTiled )
1388 	      continue;
1389 	    else
1390 	      {
1391 		printf("\t\t");
1392 		printAlignment(Cp,Ap,stdout);
1393 	      }
1394 
1395 	  printf("\t\t</CONTIG>\n");
1396 	  printf("\t\t<CONTIG ID = \"contig_%s\" ORI = \"%s\">\n",
1397 		 Cp->linksTo->IdQ,
1398 		 Cp->linksTo->DirQ == FORWARD_CHAR ? "BE" : "EB");
1399 
1400 	  for ( Ap = Cp->linksTo->Aligns.begin( );
1401 		Ap < Cp->linksTo->Aligns.end( ); Ap ++ )
1402 	    if ( ! Ap->isTiled )
1403 	      continue;
1404 	    else
1405 	      {
1406 		printf("\t\t");
1407 		printAlignment(Cp->linksTo,Ap,stdout);
1408 	      }
1409 
1410 	  printf("\t\t</CONTIG>\n");
1411 	  printf("\t</LINK>\n");
1412 
1413 	  //-- Walk the pointer down the list of contigs
1414 	  if ( Cp->linksTo == Contigs.end( )  ||  Cp->linksTo->isLinkHead )
1415 	    Cp = Contigs.end( );
1416 	  else
1417 	    Cp = Cp->linksTo;
1418 	}
1419     }
1420 
1421   printf ("</EVIDENCE>\n");
1422 
1423   return;
1424 }
1425 
1426 
1427 
1428 
printHelp(const char * s)1429 void printHelp
1430      (const char * s)
1431 
1432      //  Display the program's help information to stderr.
1433 
1434 {
1435   fprintf (stderr,
1436            "\nUSAGE: %s  [options]  <deltafile>\n\n", s);
1437   fprintf (stderr,
1438     "-a            Describe the tiling path by printing the tab-delimited\n"
1439     "              alignment region coordinates to stdout\n"
1440     "-c            Assume the reference sequences are circular, and allow\n"
1441     "              tiled contigs to span the origin\n"
1442     "-h            Display help information\n"
1443     "-g int        Set maximum gap between clustered alignments [-1, INT_MAX]\n"
1444     "              A value of -1 will represent infinity\n"
1445     "              (nucmer default = %ld)\n"
1446     "              (promer default = %ld)\n"
1447     "-i float      Set minimum percent identity to tile [0.0, 100.0]\n"
1448     "              (nucmer default = %.1f)\n"
1449     "              (promer default = %.1f)\n"
1450     "-l int        Set minimum length contig to report [-1, INT_MAX]\n"
1451     "              A value of -1 will represent infinity\n"
1452     "              (common default = %ld)\n"
1453     "-p file       Output a pseudo molecule of the query contigs to 'file'\n"
1454     "-R            Deal with repetitive contigs by randomly placing them\n"
1455     "              in one of their copy locations (implies -V 0)\n"
1456     "-t file       Output a TIGR style contig list of each query sequence\n"
1457     "              that sufficiently matches the reference (non-circular)\n"
1458     "-u file       Output the tab-delimited alignment region coordinates\n"
1459     "              of the unusable contigs to 'file'\n"
1460     "-v float      Set minimum contig coverage to tile [0.0, 100.0]\n"
1461     "              (nucmer default = %.1f) sum of individual alignments\n"
1462     "              (promer default = %.1f) extent of syntenic region\n"
1463     "-V float      Set minimum contig coverage difference [0.0, 100.0]\n"
1464     "              i.e. the difference needed to determine one alignment\n"
1465     "              is 'better' than another alignment\n"
1466     "              (nucmer default = %.1f) sum of individual alignments\n"
1467     "              (promer default = %.1f) extent of syntenic region\n"
1468     "-x            Describe the tiling path by printing the XML contig\n"
1469     "              linking information to stdout\n\n",
1470 	   DEFAULT_NUCMER_MAX_GAP_SIZE, DEFAULT_PROMER_MAX_GAP_SIZE,
1471 	   DEFAULT_NUCMER_MIN_PIDY, DEFAULT_PROMER_MIN_PIDY,
1472 	   DEFAULT_MIN_CONTIG_LENGTH,
1473 	   DEFAULT_NUCMER_MIN_COVERAGE, DEFAULT_PROMER_MIN_COVERAGE,
1474 	   DEFAULT_NUCMER_MIN_COVERAGE_DIFF, DEFAULT_PROMER_MIN_COVERAGE_DIFF);
1475   fprintf (stderr,
1476        "  Input is the .delta output of the nucmer program, run on very\n"
1477        "similar sequence data, or the .delta output of the promer program,\n"
1478        "run on divergent sequence data.\n"
1479        "  Output is to stdout, and consists of the predicted location of\n"
1480        "each aligning query contig as mapped to the reference sequences.\n"
1481        "These coordinates reference the extent of the entire query contig,\n"
1482        "even when only a certain percentage of the contig was actually\n"
1483        "aligned (unless the -a option is used). Columns are, start in ref,\n"
1484        "end in ref, distance to next contig, length of this contig, alignment\n"
1485        "coverage, identity, orientation, and ID respectively.\n\n");
1486   return;
1487 }
1488 
1489 
1490 
1491 
printUsage(const char * s)1492 void printUsage
1493      (const char * s)
1494 
1495      //  Display the program's usage information to stderr.
1496 
1497 {
1498   fprintf (stderr,
1499            "\nUSAGE: %s  [options]  <deltafile>\n\n", s);
1500   fprintf (stderr, "Try '%s -h' for more information.\n", s);
1501   return;
1502 }
1503 
1504 
1505 
1506 
revC(long int Coord,long int Len)1507 inline long int revC
1508      (long int Coord, long int Len)
1509 
1510      //  Reverse complement the given coordinate for the given length.
1511 
1512 {
1513   assert (Len - Coord + 1 > 0);
1514   return (Len - Coord + 1);
1515 }
1516 
1517 
1518 
1519 
tileContigs(vector<QueryContig> & Contigs)1520 void tileContigs
1521      (vector<QueryContig> & Contigs)
1522 {
1523   vector<QueryContig>::iterator Cp;
1524   vector<AlignStats>::iterator Ap, Aip, Atemp;
1525 
1526   long int start, end, hang;
1527 
1528   char * IdR = NULL;
1529   char * IdRhang = NULL;
1530 
1531   float cov, covhang;
1532   float max_cov, maxx_cov;
1533   float idy;
1534   float max_covhang, maxx_covhang;
1535   float idyhang, tmpidy;
1536 
1537   for ( Cp = Contigs.begin( ); Cp < Contigs.end( ); Cp ++ )
1538     {
1539       if ( Cp->TileLevel != UNUSED_TILE_LEVEL )
1540 	continue;
1541 
1542       max_cov = maxx_cov = idy = -(FLT_MAX);
1543       max_covhang = maxx_covhang = idyhang = -(FLT_MAX);
1544       Ap = Cp->Aligns.begin( );
1545       while ( Ap < Cp->Aligns.end( ) )
1546 	{
1547 	  //-- For a single reference
1548 	  for ( Aip = Ap + 1; Aip < Cp->Aligns.end( ); Aip ++ )
1549 	    if ( strcmp (Ap->IdR, Aip->IdR) != 0 )
1550 	      break;
1551 
1552 	  //-- Cluster the alignments
1553 	  longestConsistentSubset (Ap, Aip);
1554 
1555 	  //-- Get the extents and overhang of the best subset
1556 	  hang = 0;
1557 	  end = start = -1;
1558 	  for ( Atemp = Ap; Atemp < Aip; Atemp ++ )
1559 	    if ( Atemp->isTiled )
1560 	      {
1561 		if ( Atemp->sQ - Atemp->sR > 0 )
1562 		  hang += Atemp->sQ - Atemp->sR;
1563 		start = Atemp->sR;
1564 		break;
1565 	      }
1566 	  for ( Atemp = Aip - 1; Atemp >= Ap; Atemp -- )
1567 	    if ( Atemp->isTiled )
1568 	      {
1569 		if ( Atemp->eR + (Cp->SeqLenQ - Atemp->eQ) > Atemp->SeqLenR )
1570 		  hang += (Atemp->eR + (Cp->SeqLenQ - Atemp->eQ)) -
1571 		    Atemp->SeqLenR;
1572 		end = Atemp->eR;
1573 		break;
1574 	      }
1575 	  assert ( end > 0 && start > 0 );
1576 
1577 	  //-- Check the query coverage for mapping to this reference
1578 	  if ( DATA_TYPE == NUCMER_DATA )
1579 	    cov = getSubsetQryCoverage (Ap, Aip, Cp->SeqLenQ);
1580 	  else // PROMER_DATA
1581 	    cov = getSubsetQrySyntenyCoverage (Ap, Aip, Cp->SeqLenQ);
1582 	  if ( cov >= max_cov )
1583 	    {
1584 	      tmpidy = getSubsetIdentity (Ap, Aip);
1585 	      if ( cov > max_cov  ||  (cov == max_cov  &&  tmpidy > idy) )
1586 		{
1587 		  IdR = Ap->IdR;
1588 		  maxx_cov = max_cov;
1589 		  max_cov = cov;
1590 		  idy = tmpidy;
1591 		}
1592 	    }
1593 	  else if ( cov > maxx_cov )
1594 	    maxx_cov = cov;
1595 
1596 	  if ( !isCircularReference )
1597 	    {
1598 	      //-- Check the query coverage for mapping to this reference
1599 	      if ( DATA_TYPE == NUCMER_DATA )
1600 		covhang = getSubsetQryCoverage
1601 		  (Ap, Aip, Cp->SeqLenQ - hang);
1602 	      else // PROMER_DATA
1603 		covhang = getSubsetQrySyntenyCoverage
1604 		  (Ap, Aip, Cp->SeqLenQ - hang);
1605 	      if ( covhang >= max_covhang )
1606 		{
1607 		  tmpidy = getSubsetIdentity (Ap, Aip);
1608 		  if ( covhang > max_covhang  ||  (covhang == max_covhang  &&
1609 						   tmpidy > idyhang) )
1610 		    {
1611 		      IdRhang = Ap->IdR;
1612 		      maxx_covhang = max_covhang;
1613 		      max_covhang = covhang;
1614 		      idyhang = tmpidy;
1615 		    }
1616 		}
1617 	      else if ( covhang > maxx_covhang )
1618 		maxx_covhang = covhang;
1619 	    }
1620 
1621 	  for ( ; Ap < Aip; Ap ++ )
1622 	    if ( !Ap->isTiled )
1623 	      {
1624 		//-- Don't worry about the single if it overlaps
1625 		if (start <= end  &&  (Ap->sR <= end  &&  Ap->eR >= start))
1626 		  continue;
1627 		else if (start > end  &&  (Ap->eR >= start  ||  Ap->sR <= end))
1628 		  continue;
1629 
1630 		//-- Look for competing single alignments
1631 		cov = getAlignmentQryCoverage (Ap, Cp->SeqLenQ);
1632 		if ( cov > maxx_cov )
1633 		  maxx_cov = cov;
1634 
1635 		if ( !isCircularReference )
1636 		  {
1637 		    hang = 0;
1638 		    if ( Ap->sQ - Ap->sR > 0 )
1639 		      hang += Ap->sQ - Ap->sR;
1640 		    if ( Ap->eR + (Cp->SeqLenQ - Ap->eQ) >
1641 			 Ap->SeqLenR )
1642 		      hang += (Ap->eR + (Cp->SeqLenQ - Ap->eQ)) -
1643 			Ap->SeqLenR;
1644 
1645 		    covhang = getAlignmentQryCoverage (Ap, Cp->SeqLenQ - hang);
1646 		    if ( covhang > maxx_covhang )
1647 		      maxx_covhang = covhang;
1648 		  }
1649 	      }
1650 	  // Ap now equals Aip
1651 	}
1652 
1653       //-- If clustered coverage is...
1654       if ( max_cov - maxx_cov >= MIN_COVERAGE_DIFF  &&
1655 	   max_cov >= MIN_COVERAGE  &&  idy >= MIN_PIDY )
1656 	{
1657 	  for ( Aip = Cp->Aligns.begin( ); Aip < Cp->Aligns.end( ); Aip ++ )
1658 	    if ( Aip->isTiled  &&  strcmp (Aip->IdR, IdR) != 0 )
1659 	      Aip->isTiled = false;
1660 
1661 	  //-- Tile the contig
1662 	  Cp->TileLevel = USED_TILE_LEVEL;
1663 	  placeContig ( Cp );
1664 	}
1665       else if ( !isCircularReference  &&
1666 		max_covhang - maxx_covhang >= MIN_COVERAGE_DIFF  &&
1667 		max_covhang >= MIN_COVERAGE  &&  idyhang >= MIN_PIDY )
1668 	{
1669 	  for ( Aip = Cp->Aligns.begin( ); Aip < Cp->Aligns.end( ); Aip ++ )
1670 	    if ( Aip->isTiled  &&  strcmp (Aip->IdR, IdRhang) != 0 )
1671 	      Aip->isTiled = false;
1672 
1673 	  //-- Tile the contig
1674 	  Cp->TileLevel = USED_TILE_LEVEL;
1675 	  placeContig ( Cp );
1676 	}
1677       else
1678 	Cp->TileLevel = UNUSABLE_TILE_LEVEL;
1679     }
1680 
1681   //-- Sort by reference mapping location
1682   sort ( Contigs.begin( ), Contigs.end( ), IdR_StartR_Sort( ) );
1683 
1684   return;
1685 }
1686