1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "runtime.H"
19 #include "system.H"
20 
21 #include "sqStore.H"
22 #include "tgStore.H"
23 
24 #include "edlib.H"
25 
26 #include "strings.H"
27 #include "sequence.H"
28 
29 #include "gfa.H"
30 #include "bed.H"
31 
32 #define IS_GFA   1
33 #define IS_BED   2
34 
35 #define STEP_BP        500
36 
37 class sequence {
38 public:
sequence()39   sequence() {
40     seq = NULL;
41     len = 0;
42   };
~sequence()43   ~sequence() {
44     delete [] seq;
45   };
46 
set(tgTig * tig)47   void  set(tgTig *tig) {
48     len = tig->length();
49     seq = new char [len + 1];
50 
51     if (tig->consensusExists())
52        memcpy(seq, tig->bases(), len);
53     else
54        memset(seq, 0, len);
55 
56     seq[len] = 0;
57   };
58 
59   char   *seq;
60   uint32  len;
61 };
62 
63 
64 
65 class sequences {
66 public:
sequences(char * tigName,uint32 tigVers)67   sequences(char *tigName, uint32 tigVers) {
68     if (tigVers <= 0)
69        fprintf(stderr, "Error: invalid version of tigstore: %d, not supported\n", tigVers);
70     assert(tigVers > 0);
71 
72     tgStore *tigStore = new tgStore(tigName, tigVers);
73 
74     b    = 0;
75     e    = tigStore->numTigs();
76     seqs = new sequence [e+1];
77     used = new uint32   [e+1];
78 
79     for (uint32 ti=b; ti < e; ti++) {
80       tgTig *tig = tigStore->loadTig(ti);
81 
82       used[ti] = 0;
83 
84       if (tig == NULL)
85         continue;
86 
87       seqs[ti].set(tig);
88 
89       tigStore->unloadTig(ti);
90     }
91 
92     delete tigStore;
93   };
94 
~sequences()95   ~sequences() {
96     delete [] seqs;
97     delete [] used;
98   };
99 
100   sequence &operator[](uint32 xx) {
101     if (xx < e)
102       return(seqs[xx]);
103 
104     fprintf(stderr, "ERROR: sequence id %u out of range b=%u e=%u\n", xx, b, e);
105 
106     assert(xx < e);
107     return(seqs[0]);
108   };
109 
110   uint32    b;
111   uint32    e;
112   sequence *seqs;
113   uint32   *used;
114 };
115 
116 
117 
118 void
dotplot(uint32 Aid,bool Afwd,char * Aseq,uint32 Bid,bool Bfwd,char * Bseq)119 dotplot(uint32 Aid, bool Afwd, char *Aseq,
120         uint32 Bid, bool Bfwd, char *Bseq) {
121   char   Aname[16];
122   char   Bname[16];
123   char   Pname[64];
124   FILE  *F;
125 
126   sprintf(Aname, "tig%08u%c",  Aid, (Afwd) ? '+' : '-');
127   sprintf(Bname, "tig%08u%c",  Bid, (Bfwd) ? '+' : '-');
128   sprintf(Pname, "plot-%s-%s", Aname, Bname);
129 
130   F = AS_UTL_openOutputFile(Pname, '.', "sh");
131   fprintf(F, "#!/bin/sh\n");
132   fprintf(F, "\n");
133   fprintf(F, "nucmer --maxmatch --nosimplify -p %s %s.fasta %s.fasta\n", Pname, Aname, Bname);
134   fprintf(F, "show-coords -l -o -r -T %s.delta | expand -t 8 > %s.coords\n", Pname, Pname);
135   fprintf(F, "mummerplot --fat -t png -p %s %s.delta\n", Pname, Pname);
136   fprintf(F, "echo mummerplot --fat -p %s %s.delta\n", Pname, Pname);
137   AS_UTL_closeFile(F, Pname, '.', "sh");
138 
139   F = AS_UTL_openOutputFile(Aname, '.', "fasta");
140   fprintf(F, ">%s\n%s\n", Aname, Aseq);
141   AS_UTL_closeFile(F, Aname, '.', "fasta");
142 
143   F = AS_UTL_openOutputFile(Bname, '.', "fasta");
144   fprintf(F, ">%s\n%s\n", Bname, Bseq);
145   AS_UTL_closeFile(F, Bname, '.', "fasta");
146 
147   sprintf(Pname, "sh plot-%s-%s.sh", Aname, Bname);
148   system(Pname);
149 }
150 
151 
152 
153 bool
checkLink(gfaLink * link,sequences & seqs,sequences & seqs_orig,double erate,bool beVerbose,bool doPlot)154 checkLink(gfaLink   *link,
155           sequences &seqs,
156           sequences &seqs_orig,
157           double     erate,
158           bool       beVerbose,
159           bool       doPlot) {
160 
161   char   *Aseq = seqs[link->_Aid].seq, *Arev = NULL;
162   char   *Bseq = seqs[link->_Bid].seq, *Brev = NULL;
163 
164   int32  Abgn, Aend, Alen = seqs[link->_Aid].len;
165   int32  Bbgn, Bend, Blen = seqs[link->_Bid].len;
166 
167   double ratio = std::max((double)Alen/seqs_orig[link->_Aid].len, (double)Blen/seqs_orig[link->_Bid].len);
168 
169   EdlibAlignResult  result  = { 0, NULL, NULL, 0, NULL, 0, 0 };
170 
171   int32  AalignLen = 0;
172   int32  BalignLen = 0;
173   int32  editDist  = 0;
174   int32  alignLen  = 0;
175   int32  maxEdit   = 0;
176 
177   //  NOTE!  edlibAlign calls the 'A' sequence the 'query' and
178   //  the 'B' sequence the 'target' (aka, 'reference').
179 
180   link->alignmentLength(AalignLen, BalignLen, alignLen);
181 
182   //  Regardless of if we find an new alignment or not, remove the old one.
183   //  If we don't find a new one, we'll discard the link.
184 
185   delete [] link->_cigar;
186   link->_cigar = NULL;
187 
188   if (link->_Afwd == false)
189     Aseq = Arev = reverseComplementCopy(Aseq, Alen);
190   if (link->_Bfwd == false)
191     Bseq = Brev = reverseComplementCopy(Bseq, Blen);
192 
193   //  Ty to find the end coordinate on B.  Align the last bits of A to B.
194   //
195   //   -------(---------]     v--??
196   //             [------------)------
197   //
198   // if we fail to find the specified link, we will shrink A to see if we can find it that way and B will be trimmed to the right point
199   // no need to retry on the second time because we'll have the right location of B and will trim A to it
200 
201   Abgn = std::max(Alen - AalignLen, 0);
202   Aend = Alen;
203 
204   Bbgn = 0;
205   Bend = std::min(Blen, (int32)(1.10 * ratio * BalignLen));  //  Expand by whatever factor the contig grew during consensus plus 10%
206 
207   maxEdit = (int32)ceil(alignLen * erate);
208 
209   while (result.numLocations == 0) {
210     if (beVerbose)
211       fprintf(stderr, "LINK tig%08u %c %17s    tig%08u %c %17s   Aalign %6u Balign %6u align %6u\n",
212               link->_Aid, (link->_Afwd) ? '+' : '-', "",
213               link->_Bid, (link->_Bfwd) ? '+' : '-', "",
214               AalignLen, BalignLen, alignLen);
215 
216     if (beVerbose)
217       fprintf(stderr, "TEST tig%08u %c %8d-%-8d    tig%08u %c %8d-%-8d  maxEdit=%6d  (extend B)",
218               link->_Aid, (link->_Afwd) ? '+' : '-', Abgn, Aend,
219               link->_Bid, (link->_Bfwd) ? '+' : '-', Bbgn, Bend,
220               maxEdit);
221 
222     result = edlibAlign(Aseq + Abgn, Aend-Abgn,  //  The 'query'
223                         Bseq + Bbgn, Bend-Bbgn,  //  The 'target'
224                         edlibNewAlignConfig(maxEdit, EDLIB_MODE_HW, EDLIB_TASK_LOC));
225 
226     if (result.numLocations > 0) {
227       if (beVerbose)
228         fprintf(stderr, "\n");
229       Bend = Bbgn + result.endLocations[0] + 1;  // 0-based to space-based
230       edlibFreeAlignResult(result);
231       break; // found it, stop
232     } else {
233       if (beVerbose)
234         fprintf(stderr, " - FAILED\n");
235       if (Abgn + STEP_BP >= Alen) break; // we ran out of sequence, stop
236       Abgn += STEP_BP;
237     }
238   }
239 
240   //  Do the same for A.  Aend and Bbgn never change; Bend was set above.
241   //
242   //   ------(--------------]
243   //         ^--??  [-------]-----------
244   //
245 
246   Abgn = std::max(Alen - (int32)(1.10 * ratio * AalignLen), 0);  //  Expand by whatever factor the contig grew during consensus plus 10%
247   // update max edit by new length
248   maxEdit = (int32)ceil((Bend-Bbgn+1) * erate);
249 
250   if (beVerbose)
251     fprintf(stderr, "     tig%08u %c %8d-%-8d    tig%08u %c %8d-%-8d  maxEdit=%6d  (extend A)",
252             link->_Aid, (link->_Afwd) ? '+' : '-', Abgn, Aend,
253             link->_Bid, (link->_Bfwd) ? '+' : '-', Bbgn, Bend,
254             maxEdit);
255 
256   //  NEEDS to be MODE_HW because we need to find the suffix alignment.
257 
258   result = edlibAlign(Bseq + Bbgn, Bend-Bbgn,  //  The 'query'
259                       Aseq + Abgn, Aend-Abgn,  //  The 'target'
260                       edlibNewAlignConfig(maxEdit, EDLIB_MODE_HW, EDLIB_TASK_LOC));
261 
262   if (result.numLocations > 0) {
263     if (beVerbose)
264       fprintf(stderr, "\n");
265     Abgn = Abgn + result.startLocations[0];
266     edlibFreeAlignResult(result);
267   } else {
268     if (beVerbose)
269       fprintf(stderr, " - FAILED\n");
270   }
271 
272   //  One more alignment, this time, with feeling - notice EDLIB_MODE_MW and EDLIB_TASK_PATH.
273 
274   if (beVerbose)
275     fprintf(stderr, "     tig%08u %c %8d-%-8d    tig%08u %c %8d-%-8d  maxEdit=%6d  (final)",
276             link->_Aid, (link->_Afwd) ? '+' : '-', Abgn, Aend,
277             link->_Bid, (link->_Bfwd) ? '+' : '-', Bbgn, Bend,
278             maxEdit);
279 
280   result = edlibAlign(Aseq + Abgn, Aend-Abgn,
281                       Bseq + Bbgn, Bend-Bbgn,
282                       edlibNewAlignConfig(2 * maxEdit, EDLIB_MODE_NW, EDLIB_TASK_PATH));
283 
284 
285   bool   success = false;
286 
287   if (result.numLocations > 0) {
288     if (beVerbose)
289       fprintf(stderr, "\n");
290 
291     editDist = result.editDistance;
292     alignLen = ((Aend - Abgn) + (Bend - Bbgn) + (editDist)) / 2;
293     alignLen = result.alignmentLength;     //  Edlib 'alignmentLength' is populated only for TASK_PATH
294 
295     link->_cigar = edlibAlignmentToCigar(result.alignment,
296                                          result.alignmentLength, EDLIB_CIGAR_STANDARD);
297 
298     edlibFreeAlignResult(result);
299 
300     success = true;
301   } else {
302     if (beVerbose)
303       fprintf(stderr, " - FAILED\n");
304   }
305 
306   if (beVerbose)
307     fprintf(stderr, "     tig%08u %c %8d-%-8d    tig%08u %c %8d-%-8d   %.4f\n",
308             link->_Aid, (link->_Afwd) ? '+' : '-', Abgn, Aend,
309             link->_Bid, (link->_Bfwd) ? '+' : '-', Bbgn, Bend,
310             (double)editDist / alignLen);
311 
312   //  Make a plot.
313 
314   if ((success == false) && (doPlot == true))
315     dotplot(link->_Aid, link->_Afwd, Aseq,
316             link->_Bid, link->_Bfwd, Bseq);
317 
318   //  Cleanup for the next link.
319 
320   delete [] Arev;
321   delete [] Brev;
322 
323   if (beVerbose)
324     fprintf(stderr, "\n");
325 
326   return(success);
327 }
328 
329 
330 
331 //   Align all of B into A.  Extend A as needed to make the whole thing fit.
332 //   Abgn, Aend and score are updated with the alignment.
333 //
334 bool
checkRecord_align(char const * label,char const * Aname,char const * Aseq,int32 Alen,int32 & Abgn,int32 & Aend,char const * Bname,char const * Bseq,int32 Blen,int32 & score,bool beVerbose)335 checkRecord_align(char const *label,
336                   char const *Aname, char const *Aseq, int32 Alen, int32 &Abgn, int32 &Aend,
337                   char const *Bname, char const *Bseq, int32 Blen,
338                   int32 &score,
339                   bool   beVerbose) {
340 
341   EdlibAlignResult  result  = { 0, NULL, NULL, 0, NULL, 0, 0 };
342 
343   int32  editDist    = 0;
344   int32  alignLen    = 0;
345   int32  alignScore  = 0;
346   int32  maxEdit     = (int32)ceil(Blen * 0.03);  //  Should be the same sequence, but allow for a little difference.
347   int32  step        = (int32)ceil(Blen * 0.15);
348 
349   Aend = std::min(Aend + 2 * step, Alen);        //  Limit Aend to the actual length of the contig (consensus can shrink)
350   Abgn = std::max(Aend - Blen - 2 * step, 0);    //  Then push Abgn back to make space for the unitig.
351 
352  tryAgain:
353   if (beVerbose)
354     fprintf(stderr, "ALIGN %5s utg %s len=%7d to ctg %s %9d-%9d len=%9d",
355             label,
356             Bname, Blen,
357             Aname, Abgn, Aend, Alen);
358 
359 #if 0
360   char N[FILENAME_MAX+1];
361   FILE *F;
362 
363   char  ach = Aseq[Aend];   Aseq[Aend] = 0;
364   char  bch = Bseq[Bend];   Bseq[Bend] = 0;
365 
366   sprintf(N, "compare%04d-%04d-ctg%04d.fasta", record->_Aid, record->_Bid, record->_Aid);
367   F = AS_UTL_openOutputFile(N);
368   fprintf(F, ">ctg%04d\n%s\n", record->_Aid, Aseq + Abgn);
369   AS_UTL_closeFile(F, N);
370 
371   sprintf(N, "compare%04d-%04d-utg%04d.fasta", record->_Aid, record->_Bid, record->_Bid);
372   F = AS_UTL_openOutputFile(N);
373   fprintf(F, ">utg%04d\n%s\n", record->_Bid, Bseq + Bbgn);
374   AS_UTL_closeFile(F, N);
375 
376   Aseq[Aend] = ach;
377   Bseq[Bend] = bch;
378 #endif
379 
380   result = edlibAlign(Bseq,        Blen,       //  The 'query'   (unitig)
381                       Aseq + Abgn, Aend-Abgn,  //  The 'target'  (contig)
382                       edlibNewAlignConfig(maxEdit, EDLIB_MODE_HW, EDLIB_TASK_LOC));
383 
384   //  Got an alignment?  Process and report, and maybe try again.
385 
386   if (result.numLocations > 0) {
387     int32  nAbgn = Abgn + result.startLocations[0];
388     int32  nAend = Abgn + result.endLocations[0] + 1;  // 0-based to space-based
389     char  *cigar = NULL;
390 
391     editDist   = result.editDistance;
392     alignLen   = ((nAend - nAbgn) + (Blen) + (editDist)) / 2;
393     alignScore = 1000 - (int32)(1000.0 * editDist / alignLen);
394 
395     //  If there's an alignment, we can get a cigar string and better alignment length.
396     if ((result.alignment != NULL) && (result.alignmentLength > 0)) {
397       cigar      = edlibAlignmentToCigar(result.alignment, result.alignmentLength, EDLIB_CIGAR_STANDARD);
398       alignLen   = result.alignmentLength;
399     }
400 
401     edlibFreeAlignResult(result);
402 
403     if (beVerbose)
404       fprintf(stderr, " - POSITION from %9d-%-9d to %9d-%-9d score %5d/%9d = %4d%s%s\n",
405               Abgn, Aend,
406               nAbgn, nAend,
407               editDist, alignLen, alignScore,
408               (cigar != NULL) ? " align " : "",
409               (cigar != NULL) ? cigar     : "");
410 
411     delete [] cigar;
412 
413     //  If it's a full alignment -- if the A region was big enough to have unaligned bases -- then
414     //  we're done.  Update the result and get out of here.
415 
416     if (((Abgn  < nAbgn) || (Abgn == 0)) &&
417         ((nAend < Aend)  || (Aend == Alen))) {
418 
419       Abgn  = nAbgn;
420       Aend  = nAend;
421       score = alignScore;
422 
423       return(true);
424     }
425 
426     //  Otherwise, we ran out of A sequence to align to before we ran out of stuff to align.  Extend
427     //  the A region and try again.
428 
429     if (Abgn == nAbgn)
430       Abgn = std::max(Abgn - step, 0);
431 
432     if (Aend == nAend)
433       Aend = std::min(Aend + step, Alen);
434 
435     goto tryAgain;
436   }
437 
438   //  Didn't get a good alignment.
439 
440   //  We fail for one of two reasons - either not enough bases in the reference, or too high of
441   //  error.  Unitigs are supposed to be from the same sequence, but they might be lower coverage
442   //  and therefore higher error.  It's more likely they are misplaced.
443 
444 
445   if ((Aend - Abgn < 10 * Blen) &&
446       (maxEdit < Blen * 0.35)) {
447     if (beVerbose)
448       fprintf(stderr, " - FAILED, RELAX\n");
449 
450     Abgn = std::max(Abgn - step, 0);
451     Aend = std::min(Aend + step, Alen);
452     maxEdit *= 1.2;
453 
454     goto tryAgain;
455   }
456 
457   if (beVerbose)
458     fprintf(stderr, " - ABORT, ABORT, ABORT!\n");
459 
460   return(false);
461 }
462 
463 
464 
465 bool
checkRecord(bedRecord * record,sequences & ctgs,sequences & ctgs_orig,sequences & utgs,bool beVerbose,bool UNUSED (doPlot))466 checkRecord(bedRecord   *record,
467             sequences   &ctgs,
468             sequences   &ctgs_orig,
469             sequences   &utgs,
470             bool         beVerbose,
471             bool         UNUSED(doPlot)) {
472 
473   char   *Aseq = ctgs[record->_Aid].seq;
474   char   *Bseq = utgs[record->_Bid].seq, *Brev = NULL;
475 
476   int32  Alen = ctgs[record->_Aid].len;
477   int32  Blen = utgs[record->_Bid].len;
478 
479   double ratio = (double)Alen/ctgs_orig[record->_Aid].len;
480 
481   int32  Abgn  = std::max(0,    (int32)(floor(record->_bgn*ratio)));
482   int32  Aend  = std::min(Alen, (int32)(ceil (record->_end*ratio)));
483 
484   bool   success    = true;
485   int32  alignScore = 0;
486 
487   if (record->_Bfwd == false)
488     Bseq = Brev = reverseComplementCopy(Bseq, Blen);
489 
490   //  If Bseq (the unitig) is small, just align the full thing.
491 
492   if (Blen < 50000) {
493     success &= checkRecord_align("ALL",
494                                  record->_Aname, Aseq, Alen, Abgn, Aend,
495                                  record->_Bname, Bseq, Blen,
496                                  alignScore,
497                                  beVerbose);
498   }
499 
500   //  Otherwise, we need to try to align only the ends of the unitig.
501   //
502   //        -----------------------[AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA]------------------
503   //                                     BBBBB..............BBBBB
504   //
505 
506   else {
507     int32   AbgnL = Abgn,         AendL = Abgn + 50000;
508     int32   AbgnR = Aend - 50000, AendR = Aend;
509 
510     char   *BseqL = Bseq;
511     char   *BseqR = Bseq + Blen - 50000;
512 
513 #if 0
514     success &= checkRecord_align("ALL",
515                                  record->_Aname, Aseq, Alen, Abgn, Aend,
516                                  record->_Bname, Bseq, Blen,
517                                  alignScore,
518                                  beVerbose);
519 #endif
520 
521     success &= checkRecord_align("LEFT",
522                                  record->_Aname, Aseq,  Alen, AbgnL, AendL,
523                                  record->_Bname, BseqL, 50000,
524                                  alignScore,
525                                  beVerbose);
526 
527     success &= checkRecord_align("RIGHT",
528                                  record->_Aname, Aseq,  Alen, AbgnR, AendR,
529                                  record->_Bname, BseqR, 50000,
530                                  alignScore,
531                                  beVerbose);
532 
533     Abgn = AbgnL;
534     Aend = AendR;
535   }
536 
537   delete [] Brev;
538 
539   //  If successful, save the coordinates.  Because we're usually not aligning the whole
540   //  unitig to the contig, we can't save the score.
541 
542   if (success) {
543     record->_bgn   = Abgn;
544     record->_end   = Aend;
545     record->_score = 0;   //alignScore;
546   }
547 
548   return(success);
549 }
550 
551 
552 
553 //
554 //  Try to find an alignment for each link in the GFA file.  If found, output a new link
555 //  with correct CIGAR string.  If not found, discard the link.
556 //
557 void
processGFA(char * tigName,uint32 tigVers,char * inGFA,char * otGFA,double erate,uint32 verbosity)558 processGFA(char     *tigName,
559            uint32    tigVers,
560            char     *inGFA,
561            char     *otGFA,
562            double    erate,
563            uint32    verbosity) {
564 
565   //  Load the GFA file.
566 
567   fprintf(stderr, "-- Reading GFA '%s'.\n", inGFA);
568 
569   gfaFile   *gfa  = new gfaFile(inGFA);
570 
571   fprintf(stderr, "-- Loading sequences from tigStore '%s' version %u.\n", tigName, tigVers-1);
572 
573   sequences *seqs_origp = new sequences(tigName, tigVers-1);
574   sequences &seqs_orig  = *seqs_origp;
575 
576   fprintf(stderr, "-- Loading sequences from tigStore '%s' version %u.\n", tigName, tigVers);
577 
578   sequences *seqsp = new sequences(tigName, tigVers);
579   sequences &seqs  = *seqsp;
580 
581   //  Set GFA lengths based on the sequences we loaded.
582 
583   fprintf(stderr, "-- Resetting sequence lengths.\n");
584 
585   for (uint32 ii=0; ii<gfa->_sequences.size(); ii++)
586     gfa->_sequences[ii]->_length = seqs[gfa->_sequences[ii]->_id].len;
587 
588   //  Align!
589 
590   uint32  passCircular = 0;
591   uint32  failCircular = 0;
592 
593   uint32  passNormal = 0;
594   uint32  failNormal = 0;
595 
596   uint32  iiLimit      = gfa->_links.size();
597   uint32  iiNumThreads = getNumThreads();
598   uint32  iiBlockSize  = (iiLimit < 1000 * iiNumThreads) ? iiNumThreads : iiLimit / 999;
599 
600   fprintf(stderr, "-- Aligning " F_U32 " links using " F_U32 " threads and %.2f error rate.\n", iiLimit, iiNumThreads, erate*100);
601 
602 #pragma omp parallel for schedule(dynamic, iiBlockSize)
603   for (uint32 ii=0; ii<iiLimit; ii++) {
604     gfaLink *link = gfa->_links[ii];
605 
606     if (link->_Aid == link->_Bid) {
607       if (verbosity > 0)
608         fprintf(stderr, "Processing circular link for tig %u\n", link->_Aid);
609 
610       if (link->_Afwd != link->_Bfwd)
611         fprintf(stderr, "WARNING: %s %c %s %c -- circular to the same end!?\n",
612                 link->_Aname, link->_Afwd ? '+' : '-',
613                 link->_Bname, link->_Bfwd ? '+' : '-');
614 
615       bool  pN = checkLink(link, seqs, seqs_orig, erate, (verbosity > 0), false);
616 
617       if (pN == true)
618         passCircular++;
619       else
620         failCircular++;
621     }
622 
623     //  Now the usual case.
624 
625     else {
626       if (verbosity > 0)
627         fprintf(stderr, "Processing link between tig %u %s and tig %u %s\n",
628                 link->_Aid, link->_Afwd ? "-->" : "<--",
629                 link->_Bid, link->_Bfwd ? "-->" : "<--");
630 
631       bool  pN = checkLink(link, seqs, seqs_orig, erate, (verbosity > 0), false);
632 
633       if (pN == true)
634         passNormal++;
635       else
636         failNormal++;
637     }
638 
639     //  If the cigar exists, we found an alignment.  If not, delete the link.
640 
641     if (link->_cigar == NULL) {
642       if (verbosity > 0)
643         fprintf(stderr, "  Failed to find alignment.\n");
644       delete gfa->_links[ii];
645       gfa->_links[ii] = NULL;
646     }
647   }
648 
649   fprintf(stderr, "-- Writing GFA '%s'.\n", otGFA);
650 
651   gfa->saveFile(otGFA);
652 
653   fprintf(stderr, "-- Cleaning up.\n");
654 
655   delete seqsp;
656   delete gfa;
657 
658   fprintf(stderr, "-- Aligned %6u ciruclar tigs, failed %6u\n", passCircular, failCircular);
659   fprintf(stderr, "-- Aligned %6u   linear tigs, failed %6u\n", passNormal,   failNormal);
660 }
661 
662 
663 
664 //
665 //  Find an alignment between the unitig (the feature) and the contig (the 'chromosome').
666 //  Output updated coordiates.
667 //
668 void
processBED(char * tigName,uint32 tigVers,char * seqName,uint32 seqVers,char * inBED,char * otBED,uint32 verbosity)669 processBED(char   *tigName,
670            uint32  tigVers,
671            char   *seqName,
672            uint32  seqVers,
673            char   *inBED,
674            char   *otBED,
675            uint32  verbosity) {
676 
677   //  Load the BED file.
678 
679   fprintf(stderr, "-- Reading BED '%s'.\n", inBED);
680 
681   bedFile   *bed  = new bedFile(inBED);
682 
683   fprintf(stderr, "-- Loading sequences from tigStore '%s' version %u.\n", tigName, tigVers);
684 
685   sequences *utgsp = new sequences(tigName, tigVers);
686   sequences &utgs  = *utgsp;
687 
688   fprintf(stderr, "-- Loading sequences from tigStore '%s' version %u.\n", seqName, seqVers-1);
689 
690   sequences *ctgs_origp = new sequences(seqName, seqVers-1);
691   sequences &ctgs_orig  = *ctgs_origp;
692 
693   fprintf(stderr, "-- Loading sequences from tigStore '%s' version %u.\n", seqName, seqVers);
694 
695   sequences *ctgsp = new sequences(seqName, seqVers);
696   sequences &ctgs  = *ctgsp;
697 
698   //  Align!
699 
700   uint32  pass = 0;
701   uint32  fail = 0;
702 
703   uint32  iiLimit      = bed->_records.size();
704   uint32  iiNumThreads = getNumThreads();
705   uint32  iiBlockSize  = (iiLimit < 1000 * iiNumThreads) ? iiNumThreads : iiLimit / 999;
706 
707   fprintf(stderr, "-- Aligning " F_U32 " records using " F_U32 " threads.\n", iiLimit, iiNumThreads);
708 
709 #pragma omp parallel for schedule(dynamic, iiBlockSize)
710   for (uint32 ii=0; ii<iiLimit; ii++) {
711     bedRecord *record = bed->_records[ii];
712 
713     if (checkRecord(record, ctgs, ctgs_orig, utgs, (verbosity > 0), false)) {
714       pass++;
715     } else {
716       delete bed->_records[ii];
717       bed->_records[ii] = NULL;
718       fail++;
719     }
720   }
721 
722   fprintf(stderr, "-- Writing BED '%s'.\n", otBED);
723 
724   bed->saveFile(otBED);
725 
726   fprintf(stderr, "-- Cleaning up.\n");
727 
728   delete utgsp;
729   delete ctgsp;
730   delete bed;
731 
732   fprintf(stderr, "-- Aligned %6u unitigs to contigs, failed %6u\n", pass, fail);
733 }
734 
735 
736 
737 //
738 //  Infer a graph from the positions of unitigs (features) in contigs (chromosomes).  Generate a GFA
739 //  input and toss that up to processGFA.
740 //
741 void
processBEDtoGFA(char * tigName,uint32 tigVers,char * inBED,char * otGFA,double erate,uint32 verbosity)742 processBEDtoGFA(char   *tigName,
743                 uint32  tigVers,
744                 char   *inBED,
745                 char   *otGFA,
746                 double  erate,
747                 uint32  verbosity) {
748 
749   int32  minOlap = 100;
750 
751   //  We only really need the sequence lengths here, but eventually, we'll want to generate
752   //  alignments for all the overlaps, and so we'll need the sequences too.
753   fprintf(stderr, "-- Loading sequences from tigStore '%s' version %u.\n", tigName, tigVers-1);
754 
755   sequences *seqs_origp = new sequences(tigName, tigVers-1);
756   sequences &seqs_orig  = *seqs_origp;
757 
758   fprintf(stderr, "-- Loading sequences from tigStore '%s' version %u.\n", tigName, tigVers);
759 
760   sequences *seqsp = new sequences(tigName, tigVers);
761   sequences &seqs  = *seqsp;
762 
763   //  Load the BED file and allocate an output GFA.
764 
765   fprintf(stderr, "-- Reading BED '%s'.\n", inBED);
766 
767   bedFile   *bed  = new bedFile(inBED);
768   gfaFile   *gfa  = new gfaFile("H\tVN:Z:1.0");
769 
770   //  Iterate over sequences, looking for overlaps in contigs.  Stupid, O(n^2) but seems fast enough.
771 
772   uint32  iiLimit      = bed->_records.size();
773   uint32  iiNumThreads = getNumThreads();
774   uint32  iiBlockSize  = (iiLimit < 1000 * iiNumThreads) ? iiNumThreads : iiLimit / 999;
775 
776   fprintf(stderr, "-- Aligning " F_U32 " records using " F_U32 " threads.\n", iiLimit, iiNumThreads);
777 
778 #pragma omp parallel for schedule(dynamic, iiBlockSize)
779   for (uint64 ii=0; ii<bed->_records.size(); ii++) {
780     for (uint64 jj=ii+1; jj<bed->_records.size(); jj++) {
781 
782       if (bed->_records[ii]->_Aid != bed->_records[jj]->_Aid)                 //  Different contigs?
783         continue;                                                             //  No overlap.
784 
785       if ((bed->_records[ii]->_end < bed->_records[jj]->_bgn + minOlap) ||    //  No (thick) intersection?
786           (bed->_records[jj]->_end < bed->_records[ii]->_bgn + minOlap))      //
787         continue;                                                             //  No overlap.
788 
789       //  Overlap!
790 
791       //fprintf(stderr, "OVERLAP %s %d-%d - %s %d-%d\n",
792       //        bed->_records[ii]->_Bname, bed->_records[ii]->_bgn, bed->_records[ii]->_end,
793       //        bed->_records[jj]->_Bname, bed->_records[jj]->_bgn, bed->_records[jj]->_end);
794 
795       int32  olapLen = 0;
796 
797       if (bed->_records[ii]->_bgn < bed->_records[jj]->_end)
798         olapLen = bed->_records[ii]->_end - bed->_records[jj]->_bgn;
799 
800       if (bed->_records[jj]->_bgn < bed->_records[ii]->_end)
801         olapLen = bed->_records[jj]->_end - bed->_records[ii]->_bgn;
802 
803       assert(olapLen > 0);
804 
805       char   cigar[81];
806 
807       sprintf(cigar, "%dM", olapLen);
808 
809       gfaLink *link = new gfaLink(bed->_records[ii]->_Bname, bed->_records[ii]->_Bid, true,
810                                   bed->_records[jj]->_Bname, bed->_records[jj]->_Bid, true,
811                                   cigar);
812 
813       bool  pN = checkLink(link, seqs, seqs_orig, erate, (verbosity > 0), false);
814 
815 #pragma omp critical
816       {
817         if (pN)
818           gfa->_links.push_back(link);
819         else
820           gfa->_links.push_back(link);
821 
822         //  Remember sequences we've hit.
823 
824         seqs.used[bed->_records[ii]->_Bid]++;
825         seqs.used[bed->_records[jj]->_Bid]++;
826       }
827     }
828   }
829 
830   //  Add sequences.  We could have done this as we're running through making edges, but we then
831   //  need to figure out if we've seen a sequence already.
832 
833   char   seqName[80];
834 
835   for (uint32 ii=0; ii<seqs.e; ii++)
836     if (seqs.used[ii] > 0) {
837       sprintf(seqName, "utg%08u", ii);
838       gfa->_sequences.push_back(new gfaSequence(seqName, ii, seqs[ii].len));
839     }
840 
841   //  Write the file, cleanup, done!
842 
843   gfa->saveFile(otGFA);
844 
845   delete gfa;
846   delete bed;
847 
848   delete seqsp;
849 }
850 
851 
852 
853 int
main(int argc,char ** argv)854 main (int argc, char **argv) {
855   char     *tigName         = NULL;         //  For GFA and BED, the source of the tigs
856   uint32    tigVers         = UINT32_MAX;
857 
858   char     *seqName         = NULL;         //  For BED, the source of the 'chromosomes'
859   uint32    seqVers         = UINT32_MAX;   //  The -C option (either chromosome or container)
860 
861   char     *inGraph         = NULL;
862   char     *otGraph         = NULL;
863 
864   uint32    graphType       = IS_GFA;
865 
866   uint32    verbosity      = 0;
867 
868   // not used for bed alignments, that is fixed at a lower value
869   double    erate          = 0.10;
870 
871   argc = AS_configure(argc, argv);
872 
873   int arg=1;
874   int err=0;
875   while (arg < argc) {
876     if        (strcmp(argv[arg], "-T") == 0) {
877       tigName = argv[++arg];
878       tigVers = atoi(argv[++arg]);
879 
880     } else if (strcmp(argv[arg], "-C") == 0) {
881       seqName = argv[++arg];
882       seqVers = atoi(argv[++arg]);
883 
884     } else if (strcmp(argv[arg], "-gfa") == 0) {
885       graphType = IS_GFA;
886     } else if (strcmp(argv[arg], "-bed") == 0) {
887       graphType = IS_BED;
888 
889     } else if (strcmp(argv[arg], "-i") == 0) {
890       inGraph = argv[++arg];
891     } else if (strcmp(argv[arg], "-o") == 0) {
892       otGraph = argv[++arg];
893 
894     } else if (strcmp(argv[arg], "-V") == 0) {
895       verbosity++;
896 
897     } else if (strcmp(argv[arg], "-t") == 0) {
898       setNumThreads(argv[++arg]);
899 
900     } else if (strcmp(argv[arg], "-e") == 0) {
901       erate = atof(argv[++arg]);
902 
903     } else {
904       fprintf(stderr, "%s: Unknown option '%s'\n", argv[0], argv[arg]);
905       err++;
906     }
907 
908     arg++;
909   }
910 
911   if (tigName == NULL)
912     err++;
913   if (inGraph == NULL)
914     err++;
915   if (otGraph == NULL)
916     err++;
917 
918   if ((tigName) && (tigVers == 0))
919     err++;
920   if ((seqName) && (seqVers == 0))
921     err++;
922 
923   if (err) {
924     fprintf(stderr, "usage: %s [opts]\n", argv[0]);
925     fprintf(stderr, "  Validates a GFA by generating alignments.\n");
926     fprintf(stderr, "  Optionally writes new GFA with updated CIGAR string (NOT IMPLEMENTED).\n");
927     fprintf(stderr, "\n");
928     fprintf(stderr, "    -T t v         Load tigs from tgStore 't', version 'v'.\n");
929     fprintf(stderr, "    -C t v         For BED format, the source of the 'chromosomes'.  Similar to -T.\n");
930     fprintf(stderr, "                     Consensus sequence must exist for -T and -C (usually in v=2)\n");
931     fprintf(stderr, "\n");
932     fprintf(stderr, "    -i input       Input graph.\n");
933     fprintf(stderr, "    -o output      Output graph.\n");
934     fprintf(stderr, "                     Graph are either GFA (v1) or BED format.\n");
935     fprintf(stderr, "\n");
936     fprintf(stderr, "    -gfa           The input and output graphs are in GFA (v1) format.\n");
937     fprintf(stderr, "    -bed           The input graph is in BED format.  If -C is supplied, the\n");
938     fprintf(stderr, "                   output will also be BED, and will have updated positions.\n");
939     fprintf(stderr, "                   If -C is not supplied, the output will be GFA (v1) of the\n");
940     fprintf(stderr, "                   overlaps inferred from the BED positions.\n");
941     fprintf(stderr, "\n");
942     fprintf(stderr, "    -V             Increase chatter.\n");
943     fprintf(stderr, "\n");
944     fprintf(stderr, "    -t threads     Use 'threads' computational threads.\n");
945     fprintf(stderr, "\n");
946 
947     if (tigName == NULL)
948       fprintf(stderr, "ERROR: no tigStore (-T) supplied.\n");
949     if (inGraph == NULL)
950       fprintf(stderr, "ERROR: no input GFA (-i) supplied.\n");
951     if (otGraph == NULL)
952       fprintf(stderr, "ERROR: no output GFA (-o) supplied.\n");
953 
954     if ((tigName) && (tigVers == 0))
955       fprintf(stderr, "ERROR: invalid tigStore version (-T) supplied.\n");
956     if ((seqName) && (seqVers == 0))
957       fprintf(stderr, "ERROR: invalid tigStore version (-C) supplied.\n");
958 
959     exit(1);
960   }
961 
962   if (graphType == IS_GFA)
963     processGFA(tigName, tigVers, inGraph, otGraph, erate, verbosity);
964 
965   if ((graphType == IS_BED) && (seqName != NULL))
966     processBED(tigName, tigVers, seqName, seqVers, inGraph, otGraph, verbosity);
967 
968   if ((graphType == IS_BED) && (seqName == NULL))
969     processBEDtoGFA(tigName, tigVers, inGraph, otGraph, erate, verbosity);
970 
971   fprintf(stderr, "Bye.\n");
972 
973   exit(0);
974 }
975