1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "runtime.H"
19 #include "system.H"
20
21 #include "sqStore.H"
22 #include "tgStore.H"
23
24 #include "edlib.H"
25
26 #include "strings.H"
27 #include "sequence.H"
28
29 #include "gfa.H"
30 #include "bed.H"
31
32 #define IS_GFA 1
33 #define IS_BED 2
34
35 #define STEP_BP 500
36
37 class sequence {
38 public:
sequence()39 sequence() {
40 seq = NULL;
41 len = 0;
42 };
~sequence()43 ~sequence() {
44 delete [] seq;
45 };
46
set(tgTig * tig)47 void set(tgTig *tig) {
48 len = tig->length();
49 seq = new char [len + 1];
50
51 if (tig->consensusExists())
52 memcpy(seq, tig->bases(), len);
53 else
54 memset(seq, 0, len);
55
56 seq[len] = 0;
57 };
58
59 char *seq;
60 uint32 len;
61 };
62
63
64
65 class sequences {
66 public:
sequences(char * tigName,uint32 tigVers)67 sequences(char *tigName, uint32 tigVers) {
68 if (tigVers <= 0)
69 fprintf(stderr, "Error: invalid version of tigstore: %d, not supported\n", tigVers);
70 assert(tigVers > 0);
71
72 tgStore *tigStore = new tgStore(tigName, tigVers);
73
74 b = 0;
75 e = tigStore->numTigs();
76 seqs = new sequence [e+1];
77 used = new uint32 [e+1];
78
79 for (uint32 ti=b; ti < e; ti++) {
80 tgTig *tig = tigStore->loadTig(ti);
81
82 used[ti] = 0;
83
84 if (tig == NULL)
85 continue;
86
87 seqs[ti].set(tig);
88
89 tigStore->unloadTig(ti);
90 }
91
92 delete tigStore;
93 };
94
~sequences()95 ~sequences() {
96 delete [] seqs;
97 delete [] used;
98 };
99
100 sequence &operator[](uint32 xx) {
101 if (xx < e)
102 return(seqs[xx]);
103
104 fprintf(stderr, "ERROR: sequence id %u out of range b=%u e=%u\n", xx, b, e);
105
106 assert(xx < e);
107 return(seqs[0]);
108 };
109
110 uint32 b;
111 uint32 e;
112 sequence *seqs;
113 uint32 *used;
114 };
115
116
117
118 void
dotplot(uint32 Aid,bool Afwd,char * Aseq,uint32 Bid,bool Bfwd,char * Bseq)119 dotplot(uint32 Aid, bool Afwd, char *Aseq,
120 uint32 Bid, bool Bfwd, char *Bseq) {
121 char Aname[16];
122 char Bname[16];
123 char Pname[64];
124 FILE *F;
125
126 sprintf(Aname, "tig%08u%c", Aid, (Afwd) ? '+' : '-');
127 sprintf(Bname, "tig%08u%c", Bid, (Bfwd) ? '+' : '-');
128 sprintf(Pname, "plot-%s-%s", Aname, Bname);
129
130 F = AS_UTL_openOutputFile(Pname, '.', "sh");
131 fprintf(F, "#!/bin/sh\n");
132 fprintf(F, "\n");
133 fprintf(F, "nucmer --maxmatch --nosimplify -p %s %s.fasta %s.fasta\n", Pname, Aname, Bname);
134 fprintf(F, "show-coords -l -o -r -T %s.delta | expand -t 8 > %s.coords\n", Pname, Pname);
135 fprintf(F, "mummerplot --fat -t png -p %s %s.delta\n", Pname, Pname);
136 fprintf(F, "echo mummerplot --fat -p %s %s.delta\n", Pname, Pname);
137 AS_UTL_closeFile(F, Pname, '.', "sh");
138
139 F = AS_UTL_openOutputFile(Aname, '.', "fasta");
140 fprintf(F, ">%s\n%s\n", Aname, Aseq);
141 AS_UTL_closeFile(F, Aname, '.', "fasta");
142
143 F = AS_UTL_openOutputFile(Bname, '.', "fasta");
144 fprintf(F, ">%s\n%s\n", Bname, Bseq);
145 AS_UTL_closeFile(F, Bname, '.', "fasta");
146
147 sprintf(Pname, "sh plot-%s-%s.sh", Aname, Bname);
148 system(Pname);
149 }
150
151
152
153 bool
checkLink(gfaLink * link,sequences & seqs,sequences & seqs_orig,double erate,bool beVerbose,bool doPlot)154 checkLink(gfaLink *link,
155 sequences &seqs,
156 sequences &seqs_orig,
157 double erate,
158 bool beVerbose,
159 bool doPlot) {
160
161 char *Aseq = seqs[link->_Aid].seq, *Arev = NULL;
162 char *Bseq = seqs[link->_Bid].seq, *Brev = NULL;
163
164 int32 Abgn, Aend, Alen = seqs[link->_Aid].len;
165 int32 Bbgn, Bend, Blen = seqs[link->_Bid].len;
166
167 double ratio = std::max((double)Alen/seqs_orig[link->_Aid].len, (double)Blen/seqs_orig[link->_Bid].len);
168
169 EdlibAlignResult result = { 0, NULL, NULL, 0, NULL, 0, 0 };
170
171 int32 AalignLen = 0;
172 int32 BalignLen = 0;
173 int32 editDist = 0;
174 int32 alignLen = 0;
175 int32 maxEdit = 0;
176
177 // NOTE! edlibAlign calls the 'A' sequence the 'query' and
178 // the 'B' sequence the 'target' (aka, 'reference').
179
180 link->alignmentLength(AalignLen, BalignLen, alignLen);
181
182 // Regardless of if we find an new alignment or not, remove the old one.
183 // If we don't find a new one, we'll discard the link.
184
185 delete [] link->_cigar;
186 link->_cigar = NULL;
187
188 if (link->_Afwd == false)
189 Aseq = Arev = reverseComplementCopy(Aseq, Alen);
190 if (link->_Bfwd == false)
191 Bseq = Brev = reverseComplementCopy(Bseq, Blen);
192
193 // Ty to find the end coordinate on B. Align the last bits of A to B.
194 //
195 // -------(---------] v--??
196 // [------------)------
197 //
198 // if we fail to find the specified link, we will shrink A to see if we can find it that way and B will be trimmed to the right point
199 // no need to retry on the second time because we'll have the right location of B and will trim A to it
200
201 Abgn = std::max(Alen - AalignLen, 0);
202 Aend = Alen;
203
204 Bbgn = 0;
205 Bend = std::min(Blen, (int32)(1.10 * ratio * BalignLen)); // Expand by whatever factor the contig grew during consensus plus 10%
206
207 maxEdit = (int32)ceil(alignLen * erate);
208
209 while (result.numLocations == 0) {
210 if (beVerbose)
211 fprintf(stderr, "LINK tig%08u %c %17s tig%08u %c %17s Aalign %6u Balign %6u align %6u\n",
212 link->_Aid, (link->_Afwd) ? '+' : '-', "",
213 link->_Bid, (link->_Bfwd) ? '+' : '-', "",
214 AalignLen, BalignLen, alignLen);
215
216 if (beVerbose)
217 fprintf(stderr, "TEST tig%08u %c %8d-%-8d tig%08u %c %8d-%-8d maxEdit=%6d (extend B)",
218 link->_Aid, (link->_Afwd) ? '+' : '-', Abgn, Aend,
219 link->_Bid, (link->_Bfwd) ? '+' : '-', Bbgn, Bend,
220 maxEdit);
221
222 result = edlibAlign(Aseq + Abgn, Aend-Abgn, // The 'query'
223 Bseq + Bbgn, Bend-Bbgn, // The 'target'
224 edlibNewAlignConfig(maxEdit, EDLIB_MODE_HW, EDLIB_TASK_LOC));
225
226 if (result.numLocations > 0) {
227 if (beVerbose)
228 fprintf(stderr, "\n");
229 Bend = Bbgn + result.endLocations[0] + 1; // 0-based to space-based
230 edlibFreeAlignResult(result);
231 break; // found it, stop
232 } else {
233 if (beVerbose)
234 fprintf(stderr, " - FAILED\n");
235 if (Abgn + STEP_BP >= Alen) break; // we ran out of sequence, stop
236 Abgn += STEP_BP;
237 }
238 }
239
240 // Do the same for A. Aend and Bbgn never change; Bend was set above.
241 //
242 // ------(--------------]
243 // ^--?? [-------]-----------
244 //
245
246 Abgn = std::max(Alen - (int32)(1.10 * ratio * AalignLen), 0); // Expand by whatever factor the contig grew during consensus plus 10%
247 // update max edit by new length
248 maxEdit = (int32)ceil((Bend-Bbgn+1) * erate);
249
250 if (beVerbose)
251 fprintf(stderr, " tig%08u %c %8d-%-8d tig%08u %c %8d-%-8d maxEdit=%6d (extend A)",
252 link->_Aid, (link->_Afwd) ? '+' : '-', Abgn, Aend,
253 link->_Bid, (link->_Bfwd) ? '+' : '-', Bbgn, Bend,
254 maxEdit);
255
256 // NEEDS to be MODE_HW because we need to find the suffix alignment.
257
258 result = edlibAlign(Bseq + Bbgn, Bend-Bbgn, // The 'query'
259 Aseq + Abgn, Aend-Abgn, // The 'target'
260 edlibNewAlignConfig(maxEdit, EDLIB_MODE_HW, EDLIB_TASK_LOC));
261
262 if (result.numLocations > 0) {
263 if (beVerbose)
264 fprintf(stderr, "\n");
265 Abgn = Abgn + result.startLocations[0];
266 edlibFreeAlignResult(result);
267 } else {
268 if (beVerbose)
269 fprintf(stderr, " - FAILED\n");
270 }
271
272 // One more alignment, this time, with feeling - notice EDLIB_MODE_MW and EDLIB_TASK_PATH.
273
274 if (beVerbose)
275 fprintf(stderr, " tig%08u %c %8d-%-8d tig%08u %c %8d-%-8d maxEdit=%6d (final)",
276 link->_Aid, (link->_Afwd) ? '+' : '-', Abgn, Aend,
277 link->_Bid, (link->_Bfwd) ? '+' : '-', Bbgn, Bend,
278 maxEdit);
279
280 result = edlibAlign(Aseq + Abgn, Aend-Abgn,
281 Bseq + Bbgn, Bend-Bbgn,
282 edlibNewAlignConfig(2 * maxEdit, EDLIB_MODE_NW, EDLIB_TASK_PATH));
283
284
285 bool success = false;
286
287 if (result.numLocations > 0) {
288 if (beVerbose)
289 fprintf(stderr, "\n");
290
291 editDist = result.editDistance;
292 alignLen = ((Aend - Abgn) + (Bend - Bbgn) + (editDist)) / 2;
293 alignLen = result.alignmentLength; // Edlib 'alignmentLength' is populated only for TASK_PATH
294
295 link->_cigar = edlibAlignmentToCigar(result.alignment,
296 result.alignmentLength, EDLIB_CIGAR_STANDARD);
297
298 edlibFreeAlignResult(result);
299
300 success = true;
301 } else {
302 if (beVerbose)
303 fprintf(stderr, " - FAILED\n");
304 }
305
306 if (beVerbose)
307 fprintf(stderr, " tig%08u %c %8d-%-8d tig%08u %c %8d-%-8d %.4f\n",
308 link->_Aid, (link->_Afwd) ? '+' : '-', Abgn, Aend,
309 link->_Bid, (link->_Bfwd) ? '+' : '-', Bbgn, Bend,
310 (double)editDist / alignLen);
311
312 // Make a plot.
313
314 if ((success == false) && (doPlot == true))
315 dotplot(link->_Aid, link->_Afwd, Aseq,
316 link->_Bid, link->_Bfwd, Bseq);
317
318 // Cleanup for the next link.
319
320 delete [] Arev;
321 delete [] Brev;
322
323 if (beVerbose)
324 fprintf(stderr, "\n");
325
326 return(success);
327 }
328
329
330
331 // Align all of B into A. Extend A as needed to make the whole thing fit.
332 // Abgn, Aend and score are updated with the alignment.
333 //
334 bool
checkRecord_align(char const * label,char const * Aname,char const * Aseq,int32 Alen,int32 & Abgn,int32 & Aend,char const * Bname,char const * Bseq,int32 Blen,int32 & score,bool beVerbose)335 checkRecord_align(char const *label,
336 char const *Aname, char const *Aseq, int32 Alen, int32 &Abgn, int32 &Aend,
337 char const *Bname, char const *Bseq, int32 Blen,
338 int32 &score,
339 bool beVerbose) {
340
341 EdlibAlignResult result = { 0, NULL, NULL, 0, NULL, 0, 0 };
342
343 int32 editDist = 0;
344 int32 alignLen = 0;
345 int32 alignScore = 0;
346 int32 maxEdit = (int32)ceil(Blen * 0.03); // Should be the same sequence, but allow for a little difference.
347 int32 step = (int32)ceil(Blen * 0.15);
348
349 Aend = std::min(Aend + 2 * step, Alen); // Limit Aend to the actual length of the contig (consensus can shrink)
350 Abgn = std::max(Aend - Blen - 2 * step, 0); // Then push Abgn back to make space for the unitig.
351
352 tryAgain:
353 if (beVerbose)
354 fprintf(stderr, "ALIGN %5s utg %s len=%7d to ctg %s %9d-%9d len=%9d",
355 label,
356 Bname, Blen,
357 Aname, Abgn, Aend, Alen);
358
359 #if 0
360 char N[FILENAME_MAX+1];
361 FILE *F;
362
363 char ach = Aseq[Aend]; Aseq[Aend] = 0;
364 char bch = Bseq[Bend]; Bseq[Bend] = 0;
365
366 sprintf(N, "compare%04d-%04d-ctg%04d.fasta", record->_Aid, record->_Bid, record->_Aid);
367 F = AS_UTL_openOutputFile(N);
368 fprintf(F, ">ctg%04d\n%s\n", record->_Aid, Aseq + Abgn);
369 AS_UTL_closeFile(F, N);
370
371 sprintf(N, "compare%04d-%04d-utg%04d.fasta", record->_Aid, record->_Bid, record->_Bid);
372 F = AS_UTL_openOutputFile(N);
373 fprintf(F, ">utg%04d\n%s\n", record->_Bid, Bseq + Bbgn);
374 AS_UTL_closeFile(F, N);
375
376 Aseq[Aend] = ach;
377 Bseq[Bend] = bch;
378 #endif
379
380 result = edlibAlign(Bseq, Blen, // The 'query' (unitig)
381 Aseq + Abgn, Aend-Abgn, // The 'target' (contig)
382 edlibNewAlignConfig(maxEdit, EDLIB_MODE_HW, EDLIB_TASK_LOC));
383
384 // Got an alignment? Process and report, and maybe try again.
385
386 if (result.numLocations > 0) {
387 int32 nAbgn = Abgn + result.startLocations[0];
388 int32 nAend = Abgn + result.endLocations[0] + 1; // 0-based to space-based
389 char *cigar = NULL;
390
391 editDist = result.editDistance;
392 alignLen = ((nAend - nAbgn) + (Blen) + (editDist)) / 2;
393 alignScore = 1000 - (int32)(1000.0 * editDist / alignLen);
394
395 // If there's an alignment, we can get a cigar string and better alignment length.
396 if ((result.alignment != NULL) && (result.alignmentLength > 0)) {
397 cigar = edlibAlignmentToCigar(result.alignment, result.alignmentLength, EDLIB_CIGAR_STANDARD);
398 alignLen = result.alignmentLength;
399 }
400
401 edlibFreeAlignResult(result);
402
403 if (beVerbose)
404 fprintf(stderr, " - POSITION from %9d-%-9d to %9d-%-9d score %5d/%9d = %4d%s%s\n",
405 Abgn, Aend,
406 nAbgn, nAend,
407 editDist, alignLen, alignScore,
408 (cigar != NULL) ? " align " : "",
409 (cigar != NULL) ? cigar : "");
410
411 delete [] cigar;
412
413 // If it's a full alignment -- if the A region was big enough to have unaligned bases -- then
414 // we're done. Update the result and get out of here.
415
416 if (((Abgn < nAbgn) || (Abgn == 0)) &&
417 ((nAend < Aend) || (Aend == Alen))) {
418
419 Abgn = nAbgn;
420 Aend = nAend;
421 score = alignScore;
422
423 return(true);
424 }
425
426 // Otherwise, we ran out of A sequence to align to before we ran out of stuff to align. Extend
427 // the A region and try again.
428
429 if (Abgn == nAbgn)
430 Abgn = std::max(Abgn - step, 0);
431
432 if (Aend == nAend)
433 Aend = std::min(Aend + step, Alen);
434
435 goto tryAgain;
436 }
437
438 // Didn't get a good alignment.
439
440 // We fail for one of two reasons - either not enough bases in the reference, or too high of
441 // error. Unitigs are supposed to be from the same sequence, but they might be lower coverage
442 // and therefore higher error. It's more likely they are misplaced.
443
444
445 if ((Aend - Abgn < 10 * Blen) &&
446 (maxEdit < Blen * 0.35)) {
447 if (beVerbose)
448 fprintf(stderr, " - FAILED, RELAX\n");
449
450 Abgn = std::max(Abgn - step, 0);
451 Aend = std::min(Aend + step, Alen);
452 maxEdit *= 1.2;
453
454 goto tryAgain;
455 }
456
457 if (beVerbose)
458 fprintf(stderr, " - ABORT, ABORT, ABORT!\n");
459
460 return(false);
461 }
462
463
464
465 bool
checkRecord(bedRecord * record,sequences & ctgs,sequences & ctgs_orig,sequences & utgs,bool beVerbose,bool UNUSED (doPlot))466 checkRecord(bedRecord *record,
467 sequences &ctgs,
468 sequences &ctgs_orig,
469 sequences &utgs,
470 bool beVerbose,
471 bool UNUSED(doPlot)) {
472
473 char *Aseq = ctgs[record->_Aid].seq;
474 char *Bseq = utgs[record->_Bid].seq, *Brev = NULL;
475
476 int32 Alen = ctgs[record->_Aid].len;
477 int32 Blen = utgs[record->_Bid].len;
478
479 double ratio = (double)Alen/ctgs_orig[record->_Aid].len;
480
481 int32 Abgn = std::max(0, (int32)(floor(record->_bgn*ratio)));
482 int32 Aend = std::min(Alen, (int32)(ceil (record->_end*ratio)));
483
484 bool success = true;
485 int32 alignScore = 0;
486
487 if (record->_Bfwd == false)
488 Bseq = Brev = reverseComplementCopy(Bseq, Blen);
489
490 // If Bseq (the unitig) is small, just align the full thing.
491
492 if (Blen < 50000) {
493 success &= checkRecord_align("ALL",
494 record->_Aname, Aseq, Alen, Abgn, Aend,
495 record->_Bname, Bseq, Blen,
496 alignScore,
497 beVerbose);
498 }
499
500 // Otherwise, we need to try to align only the ends of the unitig.
501 //
502 // -----------------------[AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA]------------------
503 // BBBBB..............BBBBB
504 //
505
506 else {
507 int32 AbgnL = Abgn, AendL = Abgn + 50000;
508 int32 AbgnR = Aend - 50000, AendR = Aend;
509
510 char *BseqL = Bseq;
511 char *BseqR = Bseq + Blen - 50000;
512
513 #if 0
514 success &= checkRecord_align("ALL",
515 record->_Aname, Aseq, Alen, Abgn, Aend,
516 record->_Bname, Bseq, Blen,
517 alignScore,
518 beVerbose);
519 #endif
520
521 success &= checkRecord_align("LEFT",
522 record->_Aname, Aseq, Alen, AbgnL, AendL,
523 record->_Bname, BseqL, 50000,
524 alignScore,
525 beVerbose);
526
527 success &= checkRecord_align("RIGHT",
528 record->_Aname, Aseq, Alen, AbgnR, AendR,
529 record->_Bname, BseqR, 50000,
530 alignScore,
531 beVerbose);
532
533 Abgn = AbgnL;
534 Aend = AendR;
535 }
536
537 delete [] Brev;
538
539 // If successful, save the coordinates. Because we're usually not aligning the whole
540 // unitig to the contig, we can't save the score.
541
542 if (success) {
543 record->_bgn = Abgn;
544 record->_end = Aend;
545 record->_score = 0; //alignScore;
546 }
547
548 return(success);
549 }
550
551
552
553 //
554 // Try to find an alignment for each link in the GFA file. If found, output a new link
555 // with correct CIGAR string. If not found, discard the link.
556 //
557 void
processGFA(char * tigName,uint32 tigVers,char * inGFA,char * otGFA,double erate,uint32 verbosity)558 processGFA(char *tigName,
559 uint32 tigVers,
560 char *inGFA,
561 char *otGFA,
562 double erate,
563 uint32 verbosity) {
564
565 // Load the GFA file.
566
567 fprintf(stderr, "-- Reading GFA '%s'.\n", inGFA);
568
569 gfaFile *gfa = new gfaFile(inGFA);
570
571 fprintf(stderr, "-- Loading sequences from tigStore '%s' version %u.\n", tigName, tigVers-1);
572
573 sequences *seqs_origp = new sequences(tigName, tigVers-1);
574 sequences &seqs_orig = *seqs_origp;
575
576 fprintf(stderr, "-- Loading sequences from tigStore '%s' version %u.\n", tigName, tigVers);
577
578 sequences *seqsp = new sequences(tigName, tigVers);
579 sequences &seqs = *seqsp;
580
581 // Set GFA lengths based on the sequences we loaded.
582
583 fprintf(stderr, "-- Resetting sequence lengths.\n");
584
585 for (uint32 ii=0; ii<gfa->_sequences.size(); ii++)
586 gfa->_sequences[ii]->_length = seqs[gfa->_sequences[ii]->_id].len;
587
588 // Align!
589
590 uint32 passCircular = 0;
591 uint32 failCircular = 0;
592
593 uint32 passNormal = 0;
594 uint32 failNormal = 0;
595
596 uint32 iiLimit = gfa->_links.size();
597 uint32 iiNumThreads = getNumThreads();
598 uint32 iiBlockSize = (iiLimit < 1000 * iiNumThreads) ? iiNumThreads : iiLimit / 999;
599
600 fprintf(stderr, "-- Aligning " F_U32 " links using " F_U32 " threads and %.2f error rate.\n", iiLimit, iiNumThreads, erate*100);
601
602 #pragma omp parallel for schedule(dynamic, iiBlockSize)
603 for (uint32 ii=0; ii<iiLimit; ii++) {
604 gfaLink *link = gfa->_links[ii];
605
606 if (link->_Aid == link->_Bid) {
607 if (verbosity > 0)
608 fprintf(stderr, "Processing circular link for tig %u\n", link->_Aid);
609
610 if (link->_Afwd != link->_Bfwd)
611 fprintf(stderr, "WARNING: %s %c %s %c -- circular to the same end!?\n",
612 link->_Aname, link->_Afwd ? '+' : '-',
613 link->_Bname, link->_Bfwd ? '+' : '-');
614
615 bool pN = checkLink(link, seqs, seqs_orig, erate, (verbosity > 0), false);
616
617 if (pN == true)
618 passCircular++;
619 else
620 failCircular++;
621 }
622
623 // Now the usual case.
624
625 else {
626 if (verbosity > 0)
627 fprintf(stderr, "Processing link between tig %u %s and tig %u %s\n",
628 link->_Aid, link->_Afwd ? "-->" : "<--",
629 link->_Bid, link->_Bfwd ? "-->" : "<--");
630
631 bool pN = checkLink(link, seqs, seqs_orig, erate, (verbosity > 0), false);
632
633 if (pN == true)
634 passNormal++;
635 else
636 failNormal++;
637 }
638
639 // If the cigar exists, we found an alignment. If not, delete the link.
640
641 if (link->_cigar == NULL) {
642 if (verbosity > 0)
643 fprintf(stderr, " Failed to find alignment.\n");
644 delete gfa->_links[ii];
645 gfa->_links[ii] = NULL;
646 }
647 }
648
649 fprintf(stderr, "-- Writing GFA '%s'.\n", otGFA);
650
651 gfa->saveFile(otGFA);
652
653 fprintf(stderr, "-- Cleaning up.\n");
654
655 delete seqsp;
656 delete gfa;
657
658 fprintf(stderr, "-- Aligned %6u ciruclar tigs, failed %6u\n", passCircular, failCircular);
659 fprintf(stderr, "-- Aligned %6u linear tigs, failed %6u\n", passNormal, failNormal);
660 }
661
662
663
664 //
665 // Find an alignment between the unitig (the feature) and the contig (the 'chromosome').
666 // Output updated coordiates.
667 //
668 void
processBED(char * tigName,uint32 tigVers,char * seqName,uint32 seqVers,char * inBED,char * otBED,uint32 verbosity)669 processBED(char *tigName,
670 uint32 tigVers,
671 char *seqName,
672 uint32 seqVers,
673 char *inBED,
674 char *otBED,
675 uint32 verbosity) {
676
677 // Load the BED file.
678
679 fprintf(stderr, "-- Reading BED '%s'.\n", inBED);
680
681 bedFile *bed = new bedFile(inBED);
682
683 fprintf(stderr, "-- Loading sequences from tigStore '%s' version %u.\n", tigName, tigVers);
684
685 sequences *utgsp = new sequences(tigName, tigVers);
686 sequences &utgs = *utgsp;
687
688 fprintf(stderr, "-- Loading sequences from tigStore '%s' version %u.\n", seqName, seqVers-1);
689
690 sequences *ctgs_origp = new sequences(seqName, seqVers-1);
691 sequences &ctgs_orig = *ctgs_origp;
692
693 fprintf(stderr, "-- Loading sequences from tigStore '%s' version %u.\n", seqName, seqVers);
694
695 sequences *ctgsp = new sequences(seqName, seqVers);
696 sequences &ctgs = *ctgsp;
697
698 // Align!
699
700 uint32 pass = 0;
701 uint32 fail = 0;
702
703 uint32 iiLimit = bed->_records.size();
704 uint32 iiNumThreads = getNumThreads();
705 uint32 iiBlockSize = (iiLimit < 1000 * iiNumThreads) ? iiNumThreads : iiLimit / 999;
706
707 fprintf(stderr, "-- Aligning " F_U32 " records using " F_U32 " threads.\n", iiLimit, iiNumThreads);
708
709 #pragma omp parallel for schedule(dynamic, iiBlockSize)
710 for (uint32 ii=0; ii<iiLimit; ii++) {
711 bedRecord *record = bed->_records[ii];
712
713 if (checkRecord(record, ctgs, ctgs_orig, utgs, (verbosity > 0), false)) {
714 pass++;
715 } else {
716 delete bed->_records[ii];
717 bed->_records[ii] = NULL;
718 fail++;
719 }
720 }
721
722 fprintf(stderr, "-- Writing BED '%s'.\n", otBED);
723
724 bed->saveFile(otBED);
725
726 fprintf(stderr, "-- Cleaning up.\n");
727
728 delete utgsp;
729 delete ctgsp;
730 delete bed;
731
732 fprintf(stderr, "-- Aligned %6u unitigs to contigs, failed %6u\n", pass, fail);
733 }
734
735
736
737 //
738 // Infer a graph from the positions of unitigs (features) in contigs (chromosomes). Generate a GFA
739 // input and toss that up to processGFA.
740 //
741 void
processBEDtoGFA(char * tigName,uint32 tigVers,char * inBED,char * otGFA,double erate,uint32 verbosity)742 processBEDtoGFA(char *tigName,
743 uint32 tigVers,
744 char *inBED,
745 char *otGFA,
746 double erate,
747 uint32 verbosity) {
748
749 int32 minOlap = 100;
750
751 // We only really need the sequence lengths here, but eventually, we'll want to generate
752 // alignments for all the overlaps, and so we'll need the sequences too.
753 fprintf(stderr, "-- Loading sequences from tigStore '%s' version %u.\n", tigName, tigVers-1);
754
755 sequences *seqs_origp = new sequences(tigName, tigVers-1);
756 sequences &seqs_orig = *seqs_origp;
757
758 fprintf(stderr, "-- Loading sequences from tigStore '%s' version %u.\n", tigName, tigVers);
759
760 sequences *seqsp = new sequences(tigName, tigVers);
761 sequences &seqs = *seqsp;
762
763 // Load the BED file and allocate an output GFA.
764
765 fprintf(stderr, "-- Reading BED '%s'.\n", inBED);
766
767 bedFile *bed = new bedFile(inBED);
768 gfaFile *gfa = new gfaFile("H\tVN:Z:1.0");
769
770 // Iterate over sequences, looking for overlaps in contigs. Stupid, O(n^2) but seems fast enough.
771
772 uint32 iiLimit = bed->_records.size();
773 uint32 iiNumThreads = getNumThreads();
774 uint32 iiBlockSize = (iiLimit < 1000 * iiNumThreads) ? iiNumThreads : iiLimit / 999;
775
776 fprintf(stderr, "-- Aligning " F_U32 " records using " F_U32 " threads.\n", iiLimit, iiNumThreads);
777
778 #pragma omp parallel for schedule(dynamic, iiBlockSize)
779 for (uint64 ii=0; ii<bed->_records.size(); ii++) {
780 for (uint64 jj=ii+1; jj<bed->_records.size(); jj++) {
781
782 if (bed->_records[ii]->_Aid != bed->_records[jj]->_Aid) // Different contigs?
783 continue; // No overlap.
784
785 if ((bed->_records[ii]->_end < bed->_records[jj]->_bgn + minOlap) || // No (thick) intersection?
786 (bed->_records[jj]->_end < bed->_records[ii]->_bgn + minOlap)) //
787 continue; // No overlap.
788
789 // Overlap!
790
791 //fprintf(stderr, "OVERLAP %s %d-%d - %s %d-%d\n",
792 // bed->_records[ii]->_Bname, bed->_records[ii]->_bgn, bed->_records[ii]->_end,
793 // bed->_records[jj]->_Bname, bed->_records[jj]->_bgn, bed->_records[jj]->_end);
794
795 int32 olapLen = 0;
796
797 if (bed->_records[ii]->_bgn < bed->_records[jj]->_end)
798 olapLen = bed->_records[ii]->_end - bed->_records[jj]->_bgn;
799
800 if (bed->_records[jj]->_bgn < bed->_records[ii]->_end)
801 olapLen = bed->_records[jj]->_end - bed->_records[ii]->_bgn;
802
803 assert(olapLen > 0);
804
805 char cigar[81];
806
807 sprintf(cigar, "%dM", olapLen);
808
809 gfaLink *link = new gfaLink(bed->_records[ii]->_Bname, bed->_records[ii]->_Bid, true,
810 bed->_records[jj]->_Bname, bed->_records[jj]->_Bid, true,
811 cigar);
812
813 bool pN = checkLink(link, seqs, seqs_orig, erate, (verbosity > 0), false);
814
815 #pragma omp critical
816 {
817 if (pN)
818 gfa->_links.push_back(link);
819 else
820 gfa->_links.push_back(link);
821
822 // Remember sequences we've hit.
823
824 seqs.used[bed->_records[ii]->_Bid]++;
825 seqs.used[bed->_records[jj]->_Bid]++;
826 }
827 }
828 }
829
830 // Add sequences. We could have done this as we're running through making edges, but we then
831 // need to figure out if we've seen a sequence already.
832
833 char seqName[80];
834
835 for (uint32 ii=0; ii<seqs.e; ii++)
836 if (seqs.used[ii] > 0) {
837 sprintf(seqName, "utg%08u", ii);
838 gfa->_sequences.push_back(new gfaSequence(seqName, ii, seqs[ii].len));
839 }
840
841 // Write the file, cleanup, done!
842
843 gfa->saveFile(otGFA);
844
845 delete gfa;
846 delete bed;
847
848 delete seqsp;
849 }
850
851
852
853 int
main(int argc,char ** argv)854 main (int argc, char **argv) {
855 char *tigName = NULL; // For GFA and BED, the source of the tigs
856 uint32 tigVers = UINT32_MAX;
857
858 char *seqName = NULL; // For BED, the source of the 'chromosomes'
859 uint32 seqVers = UINT32_MAX; // The -C option (either chromosome or container)
860
861 char *inGraph = NULL;
862 char *otGraph = NULL;
863
864 uint32 graphType = IS_GFA;
865
866 uint32 verbosity = 0;
867
868 // not used for bed alignments, that is fixed at a lower value
869 double erate = 0.10;
870
871 argc = AS_configure(argc, argv);
872
873 int arg=1;
874 int err=0;
875 while (arg < argc) {
876 if (strcmp(argv[arg], "-T") == 0) {
877 tigName = argv[++arg];
878 tigVers = atoi(argv[++arg]);
879
880 } else if (strcmp(argv[arg], "-C") == 0) {
881 seqName = argv[++arg];
882 seqVers = atoi(argv[++arg]);
883
884 } else if (strcmp(argv[arg], "-gfa") == 0) {
885 graphType = IS_GFA;
886 } else if (strcmp(argv[arg], "-bed") == 0) {
887 graphType = IS_BED;
888
889 } else if (strcmp(argv[arg], "-i") == 0) {
890 inGraph = argv[++arg];
891 } else if (strcmp(argv[arg], "-o") == 0) {
892 otGraph = argv[++arg];
893
894 } else if (strcmp(argv[arg], "-V") == 0) {
895 verbosity++;
896
897 } else if (strcmp(argv[arg], "-t") == 0) {
898 setNumThreads(argv[++arg]);
899
900 } else if (strcmp(argv[arg], "-e") == 0) {
901 erate = atof(argv[++arg]);
902
903 } else {
904 fprintf(stderr, "%s: Unknown option '%s'\n", argv[0], argv[arg]);
905 err++;
906 }
907
908 arg++;
909 }
910
911 if (tigName == NULL)
912 err++;
913 if (inGraph == NULL)
914 err++;
915 if (otGraph == NULL)
916 err++;
917
918 if ((tigName) && (tigVers == 0))
919 err++;
920 if ((seqName) && (seqVers == 0))
921 err++;
922
923 if (err) {
924 fprintf(stderr, "usage: %s [opts]\n", argv[0]);
925 fprintf(stderr, " Validates a GFA by generating alignments.\n");
926 fprintf(stderr, " Optionally writes new GFA with updated CIGAR string (NOT IMPLEMENTED).\n");
927 fprintf(stderr, "\n");
928 fprintf(stderr, " -T t v Load tigs from tgStore 't', version 'v'.\n");
929 fprintf(stderr, " -C t v For BED format, the source of the 'chromosomes'. Similar to -T.\n");
930 fprintf(stderr, " Consensus sequence must exist for -T and -C (usually in v=2)\n");
931 fprintf(stderr, "\n");
932 fprintf(stderr, " -i input Input graph.\n");
933 fprintf(stderr, " -o output Output graph.\n");
934 fprintf(stderr, " Graph are either GFA (v1) or BED format.\n");
935 fprintf(stderr, "\n");
936 fprintf(stderr, " -gfa The input and output graphs are in GFA (v1) format.\n");
937 fprintf(stderr, " -bed The input graph is in BED format. If -C is supplied, the\n");
938 fprintf(stderr, " output will also be BED, and will have updated positions.\n");
939 fprintf(stderr, " If -C is not supplied, the output will be GFA (v1) of the\n");
940 fprintf(stderr, " overlaps inferred from the BED positions.\n");
941 fprintf(stderr, "\n");
942 fprintf(stderr, " -V Increase chatter.\n");
943 fprintf(stderr, "\n");
944 fprintf(stderr, " -t threads Use 'threads' computational threads.\n");
945 fprintf(stderr, "\n");
946
947 if (tigName == NULL)
948 fprintf(stderr, "ERROR: no tigStore (-T) supplied.\n");
949 if (inGraph == NULL)
950 fprintf(stderr, "ERROR: no input GFA (-i) supplied.\n");
951 if (otGraph == NULL)
952 fprintf(stderr, "ERROR: no output GFA (-o) supplied.\n");
953
954 if ((tigName) && (tigVers == 0))
955 fprintf(stderr, "ERROR: invalid tigStore version (-T) supplied.\n");
956 if ((seqName) && (seqVers == 0))
957 fprintf(stderr, "ERROR: invalid tigStore version (-C) supplied.\n");
958
959 exit(1);
960 }
961
962 if (graphType == IS_GFA)
963 processGFA(tigName, tigVers, inGraph, otGraph, erate, verbosity);
964
965 if ((graphType == IS_BED) && (seqName != NULL))
966 processBED(tigName, tigVers, seqName, seqVers, inGraph, otGraph, verbosity);
967
968 if ((graphType == IS_BED) && (seqName == NULL))
969 processBEDtoGFA(tigName, tigVers, inGraph, otGraph, erate, verbosity);
970
971 fprintf(stderr, "Bye.\n");
972
973 exit(0);
974 }
975