1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "runtime.H"
19 
20 #include "files.H"
21 #include "kmers.H"
22 
23 #include "sqStore.H"
24 #include "sqCache.H"
25 
26 #include "ovStore.H"
27 
28 #include "edlib.H"
29 
30 #include <vector>
31 #include <algorithm>
32 
33 
34 
35 bool
overlapBefore(ovOverlap & A,ovOverlap & B)36 overlapBefore(ovOverlap &A, ovOverlap &B) {
37   if ((A.a_iid     < B.a_iid))
38     return(true);
39 
40   if ((A.a_iid    == B.a_iid) &&
41       (A.b_iid     < B.b_iid))
42     return(true);
43 
44   if ((A.a_iid    == B.a_iid) &&
45       (A.b_iid    == B.b_iid) &&
46       (A.flipped() < B.flipped()))
47     return(true);
48 
49   return(false);
50 }
51 
52 
53 
54 void
findReadKmers(char const * seq,uint32 seqLen,uint32 olapBgn,uint32 olapEnd,std::set<kmer> & inOverlap,std::set<kmer> & allKmer)55 findReadKmers(char const      *seq,
56               uint32           seqLen,
57               uint32           olapBgn,
58               uint32           olapEnd,
59               std::set<kmer>  &inOverlap,
60               std::set<kmer>  &allKmer) {
61 
62   kmerIterator  it(seq, seqLen);
63 
64   inOverlap.clear();
65   allKmer.clear();
66 
67   if (olapEnd < olapBgn)
68     std::swap(olapBgn, olapEnd);
69 
70   while (it.nextMer() == true) {
71     kmer f = it.fmer();
72     kmer r = it.rmer();
73     kmer c = (f < r) ? f : r;
74 
75     allKmer.insert(c);
76 
77     if ((olapBgn < it.bgnPosition()) &&
78         (it.endPosition() <= olapEnd))
79       inOverlap.insert(c);
80   }
81 }
82 
83 
84 
85 void
evaluateKmers(ovOverlap ovl,const char * Aalign,uint32 AalignLen,const char * Balign,uint32 BalignLen,uint32 & sharedRepeat,uint32 & sharedUnique,merylExactLookup * lookup)86 evaluateKmers(ovOverlap ovl,
87               const char *Aalign, uint32 AalignLen,
88               const char *Balign, uint32 BalignLen,
89               uint32 &sharedRepeat,
90               uint32 &sharedUnique,
91               merylExactLookup *lookup) {
92 
93   kmerIterator  Ait(Aalign, AalignLen);
94   kmerIterator  Bit(Balign, BalignLen);
95 
96   while ((Ait.nextBase() == true) &&
97          (Bit.nextBase() == true)) {
98     kmer af = Ait.fmer(), ar = Ait.rmer();
99     kmer bf = Bit.fmer(), br = Bit.rmer();
100 
101     kmer ac = (af < ar) ? af : ar;
102     kmer bc = (bf < br) ? bf : br;
103 
104     kmvalu av = lookup->value(ac);
105     kmvalu bv = lookup->value(bc);
106 
107 
108     if ((Ait.isValid() == false) && (Bit.isValid() == false)) {
109       continue;
110     }
111 
112     if ((Ait.isValid() == true) && (Bit.isValid() == false)) {
113       continue;
114     }
115     if ((Ait.isValid() == false) && (Bit.isValid() == true)) {
116       continue;
117     }
118 
119     //  Both valid.
120 
121     if (ac != bc) {
122       continue;
123     }
124 
125     //  Both the same.
126 
127     assert(av == bv);
128 
129     if (av <= 1333) {
130       sharedUnique++;
131     }
132 
133     else {
134       sharedRepeat++;
135     }
136   }
137 }
138 
139 
140 
141 
142 void
dumpKmers(ovOverlap ovl,char const * Aalign,uint32 AalignLen,uint32 abgn,uint32 aend,char const * Balign,uint32 BalignLen,uint32 bbgn,uint32 bend,merylExactLookup * lookup)143 dumpKmers(ovOverlap   ovl,
144           char const *Aalign,  uint32 AalignLen,  uint32 abgn, uint32 aend,
145           char const *Balign,  uint32 BalignLen,  uint32 bbgn, uint32 bend,
146           merylExactLookup *lookup) {
147 
148   char  NA[FILENAME_MAX];
149   char  NK[FILENAME_MAX];
150 
151   sprintf(NA, "align/%08u-%08u-%s.align", ovl.a_iid, ovl.b_iid, ovl.flipped() ? "flip" : "norm");
152   sprintf(NK, "align/%08u-%08u-%s.kmers", ovl.a_iid, ovl.b_iid, ovl.flipped() ? "flip" : "norm");
153 
154   FILE *FA = AS_UTL_openOutputFile(NA);
155   FILE *FK = AS_UTL_openOutputFile(NK);
156 
157   //  Output the alignments.
158 
159   fprintf(FA, "<pre>\n");
160   fprintf(FA, "%-8u %6u-%-6u %s %s\n", ovl.a_iid, abgn, aend, ovl.flipped() ? "fwd" : "fwd", Aalign);
161   fprintf(FA, "%-8u %6u-%-6u %s %s\n", ovl.b_iid, bbgn, bend, ovl.flipped() ? "rev" : "fwd", Balign);
162   fprintf(FA, "</pre>\n");
163 
164   //  Output the kmers.
165 
166   kmerIterator  Ait(Aalign, AalignLen);
167   kmerIterator  Bit(Balign, BalignLen);
168 
169   while ((Ait.nextBase() == true) &&
170          (Bit.nextBase() == true)) {
171     kmer af = Ait.fmer(), ar = Ait.rmer();
172     kmer bf = Bit.fmer(), br = Bit.rmer();
173 
174     kmer ac = (af < ar) ? af : ar;
175     kmer bc = (bf < br) ? bf : br;
176 
177     if      (Ait.isValid() &&
178              Bit.isValid()) {
179       uint64  pos = Ait.bgnPosition();
180 
181       fprintf(FK, "%lu\t%u\t%u\t%c\t%c\n",
182               pos,
183               lookup->value(ac), lookup->value(bc),
184               Aalign[pos], Balign[pos]);
185     }
186 
187     else if (Ait.isValid()) {
188       //fprintf(FK, "%u\t%u\t%u\n", Ait.bgnPosition(), lookup->value(ac), 0);
189     }
190 
191     else if (Bit.isValid()) {
192       //fprintf(FK, "%u\t%u\t%u\n", Ait.bgnPosition(), 0, lookup->value(bc));
193     }
194 
195     else {
196     }
197   }
198 
199   //  Cleanup.
200 
201   AS_UTL_closeFile(FA);
202   AS_UTL_closeFile(FK);
203 }
204 
205 
206 
207 void
compareOverlapKmers(sqCache * cache,merylExactLookup * lookup,ovOverlap ovl)208 compareOverlapKmers(sqCache *cache,
209                     merylExactLookup *lookup,
210                     ovOverlap ovl) {
211 
212   std::set<kmer>  Ain, Aall;
213   std::set<kmer>  Bin, Ball;
214 
215   uint32  AbasesLen = 0,  AbasesMax = 0;    char *Abases = nullptr;
216   uint32  BbasesLen = 0,  BbasesMax = 0;    char *Bbases = nullptr;
217 
218   uint32  AalignLen = 0;                    char *Aalign = nullptr;
219   uint32  BalignLen = 0;                    char *Balign = nullptr;
220 
221   cache->sqCache_getSequence(ovl.a_iid, Abases, AbasesLen, AbasesMax);
222   cache->sqCache_getSequence(ovl.b_iid, Bbases, BbasesLen, BbasesMax);
223 
224   if (ovl.flipped() == true)
225     reverseComplementSequence(Bbases, BbasesLen);
226 
227   EdlibAlignResult  res;
228 
229   int32  abgn = ovl.a_bgn();
230   int32  aend = ovl.a_end();
231   int32  bbgn = ovl.flipped() ? BbasesLen - ovl.b_bgn() : ovl.b_bgn();
232   int32  bend = ovl.flipped() ? BbasesLen - ovl.b_end() : ovl.b_end();
233 
234   //fprintf(stderr, "Align %u %u-%u %u  to  %u %u-%u %u\n",
235   //        ovl.a_iid, abgn, aend, AbasesLen,
236   //        ovl.b_iid, bbgn, bend, BbasesLen);
237 
238   res = edlibAlign(Abases + abgn, aend - abgn,
239                    Bbases + bbgn, bend - bbgn,
240                    edlibNewAlignConfig((aend - abgn) * 0.25, EDLIB_MODE_NW, EDLIB_TASK_PATH));
241 
242   AalignLen = res.alignmentLength + 1;    Aalign = new char [AalignLen];
243   BalignLen = res.alignmentLength + 1;    Balign = new char [BalignLen];
244 
245   if (res.numLocations == 0) {
246     fprintf(stderr, "Not aligned.\n");
247     assert(0);
248   } else {
249     //fprintf(stderr, "Aligned with distance %d length %d\n", res.editDistance, res.alignmentLength);
250   }
251 
252   edlibAlignmentToStrings(res,
253                           Abases+abgn, aend-abgn,
254                           Bbases+bbgn, bend-bbgn,
255                           Aalign,
256                           Balign);
257 
258 
259 
260 
261   uint32 sharedRepeat = 0;
262   uint32 sharedUnique = 0;
263 
264   evaluateKmers(ovl,
265                 Aalign, AalignLen,
266                 Balign, BalignLen,
267                 sharedRepeat,
268                 sharedUnique,
269                 lookup);
270 
271 
272 #if 0
273   findReadKmers(Aalign, AalignLen, 0, AalignLen, Ain, Aall);
274   findReadKmers(Balign, BalignLen, 0, BalignLen, Bin, Ball);
275 
276   for (auto it=Ain.begin(); it != Ain.end(); ++it) {
277     kmer    k = *it;
278     kmvalu  v = lookup->value(k);
279 
280     if (Bin.count(k) == 0)
281       continue;
282 
283     if (v == 0) {
284       nocount++;
285       continue;
286     }
287 
288     if (v < 1333) {
289       sharedUnique++;
290     } else {
291       sharedRepeat++;
292     }
293   }
294 #endif
295 
296   if (sharedUnique > 0) {
297     fprintf(stdout, "KMERS unique  %u\n", sharedUnique);
298     fprintf(stdout, "      repeat  %u\n", sharedRepeat);
299     fprintf(stdout, "OVERLAP A %6u-%6u %6lu/%6lu kmers\n", ovl.a_bgn(), ovl.a_end(), Ain.size(), Aall.size());
300     fprintf(stdout, "        B %6u-%6u %6lu/%6lu kmers\n", ovl.b_bgn(), ovl.b_end(), Bin.size(), Ball.size());
301     fprintf(stdout, "perl plot-kmer.pl %u %u %u %u  %u %u %u %u\n",
302             ovl.a_iid, ovl.a_bgn(), ovl.a_end(), AbasesLen,
303             ovl.b_iid, ovl.b_bgn(), ovl.b_end(), BbasesLen);
304     fprintf(stdout, "plot [][1:6666] 'align/%08d-%08d-%s.kmers' using 1:2, 'align/%08d-%08d-%s.kmers' using 1:3, 1333\n",
305             ovl.a_iid, ovl.b_iid, ovl.flipped() ? "flip" : "norm",
306             ovl.a_iid, ovl.b_iid, ovl.flipped() ? "flip" : "norm");
307     fprintf(stdout, "\n");
308 
309     dumpKmers(ovl,
310               Aalign, AalignLen, abgn, aend,
311               Balign, BalignLen, bbgn, bend, lookup);
312   }
313 
314   delete [] Abases;   delete [] Aalign;
315   delete [] Bbases;   delete [] Balign;
316 }
317 
318 
319 
320 void
compareFiles(sqStore * seq,sqCache * cache,ovFile * ovfA,ovFile * ovfB,merylExactLookup * lookup)321 compareFiles(sqStore *seq,
322              sqCache *cache,
323              ovFile *ovfA, ovFile *ovfB,
324              merylExactLookup *lookup) {
325   uint64      ovlAmax = ovfA->getCounts()->numOverlaps();
326   uint64      ovlBmax = ovfB->getCounts()->numOverlaps();
327 
328   //  Allocate space for and load overlaps.
329 
330   fprintf(stderr, "Allocating 2x %lu A and 2x %lu B overlaps.\n", ovlAmax, ovlBmax);
331 
332   ovOverlap  *ovlA    = new ovOverlap [2 * ovlAmax];
333   ovOverlap  *ovlB    = new ovOverlap [2 * ovlBmax];
334 
335   fprintf(stderr, "Loading overlaps.\n");
336 
337   uint64      ovlAlen = ovfA->readOverlaps(ovlA, ovlAmax);
338   uint64      ovlBlen = ovfB->readOverlaps(ovlB, ovlBmax);
339 
340   //  Duplicate all overlaps into their other twin
341 
342   fprintf(stderr, "Duplicating overlaps.\n");
343 
344   for (uint32 ii=0; ii<ovlAmax; ii++, ovlAlen++)
345     ovlA[ovlAlen].swapIDs(ovlA[ii]);
346 
347   for (uint32 ii=0; ii<ovlBmax; ii++, ovlBlen++)
348     ovlB[ovlBlen].swapIDs(ovlB[ii]);
349 
350   //  Sort.
351 
352   fprintf(stderr, "Sorting overlaps.\n");
353 
354   std::sort(ovlA, ovlA + ovlAlen);
355   std::sort(ovlB, ovlB + ovlBlen);
356 
357   uint64  onlyA=0, onlyB=0, same=0;
358 
359   fprintf(stderr, "Comparing overlaps.\n");
360 
361   for (uint32 aa=0, bb=0; (aa < ovlAlen) || (bb < ovlBlen); ) {
362 
363     //  If one runs out of overlaps, examine the other one.
364 
365     if      (aa == ovlAlen) {
366       fprintf(stdout, "                                     -- B#%-5u %8u-%8u %c %6.3f%%\n",
367               bb, ovlB[bb].a_iid, ovlB[bb].b_iid, ovlB[bb].flipped() ? 'f' : 'n', ovlB[bb].erate() * 100.0);
368 
369       onlyB++;
370       bb++;
371       continue;
372     }
373 
374     if (bb == ovlBlen) {
375       fprintf(stdout, "A#%-5u %8u-%8u %c %6.3f%% --\n",
376               aa, ovlA[aa].a_iid, ovlA[aa].b_iid, ovlA[aa].flipped() ? 'f' : 'n', ovlA[aa].erate() * 100.0);
377 
378       onlyA++;
379       aa++;
380       continue;
381     }
382 
383     //  If they're the same reads and the same orientation, count
384     //  it as a match.
385     if ((ovlA[aa].a_iid     == ovlB[bb].a_iid) &&
386         (ovlA[aa].b_iid     == ovlB[bb].b_iid) &&
387         (ovlA[aa].flipped() == ovlB[bb].flipped())) {
388       fprintf(stdout, "A#%-5u %8u-%8u %c %6.3f%% == B#%-5u %8u-%8u %c %6.3f%%\n",
389               aa, ovlA[aa].a_iid, ovlA[aa].b_iid, ovlA[aa].flipped() ? 'f' : 'n', ovlA[aa].erate() * 100.0,
390               bb, ovlB[bb].a_iid, ovlB[bb].b_iid, ovlB[bb].flipped() ? 'f' : 'n', ovlB[bb].erate() * 100.0);
391 
392       same++;
393       aa++;
394       bb++;
395       continue;
396     }
397 
398     //
399 
400     if (overlapBefore(ovlA[aa], ovlB[bb]) == true) {
401       fprintf(stdout, "A#%-5u %8u-%8u %c %6.3f%% <+\n",
402               aa, ovlA[aa].a_iid, ovlA[aa].b_iid, ovlA[aa].flipped() ? 'f' : 'n', ovlA[aa].erate() * 100.0);
403 
404       onlyA++;
405       aa++;
406     } else {
407       fprintf(stdout, "                                    +> B#%-5u %8u-%8u %c %6.3f%%\n",
408               bb, ovlB[bb].a_iid, ovlB[bb].b_iid, ovlB[bb].flipped() ? 'f' : 'n', ovlB[bb].erate() * 100.0);
409 
410       compareOverlapKmers(cache, lookup, ovlB[bb]);
411 
412       onlyB++;
413       bb++;
414     }
415   }
416 
417   fprintf(stderr, "onlyA:  %lu\n", onlyA);
418   fprintf(stderr, "onlyB:  %lu\n", onlyB);
419   fprintf(stderr, "same:   %lu\n", same);
420 
421   delete [] ovlA;
422   delete [] ovlB;
423 }
424 
425 
426 
427 void
compareStores(sqStore * seq,ovStore * ovfA,ovStore * ovfB,merylExactLookup * lookup)428 compareStores(sqStore *seq,
429               ovStore *ovfA, ovStore *ovfB,
430               merylExactLookup *lookup) {
431 }
432 
433 
434 
435 int
main(int argc,char ** argv)436 main(int argc, char **argv) {
437   char const   *seqStoreName  = nullptr;
438   char const   *ovlFileNameA  = nullptr;
439   char const   *ovlStoreNameA = nullptr;
440   char const   *ovlFileNameB  = nullptr;
441   char const   *ovlStoreNameB = nullptr;
442   char const   *kmerDBname    = nullptr;
443   uint64        maxMemory     = 32llu * 1024 * 1024 * 1024;
444 
445   argc = AS_configure(argc, argv, 16);
446 
447   std::vector<char const *>  err;
448   for (int arg=1; arg < argc; arg++) {
449     if        (strcmp(argv[arg], "-S") == 0) {
450       seqStoreName = argv[++arg];
451 
452     } else if (strcmp(argv[arg], "-Af") == 0) {
453       ovlFileNameA = argv[++arg];
454     } else if (strcmp(argv[arg], "-A") == 0) {
455       ovlStoreNameA = argv[++arg];
456 
457     } else if (strcmp(argv[arg], "-Bf") == 0) {
458       ovlFileNameB = argv[++arg];
459     } else if (strcmp(argv[arg], "-B") == 0) {
460       ovlStoreNameB = argv[++arg];
461 
462     } else if (strcmp(argv[arg], "-K") == 0) {
463       kmerDBname = argv[++arg];
464 
465     } else {
466       char *s = new char [1024];
467       snprintf(s, 1024, "Unknown option '%s'.\n", argv[arg]);
468       err.push_back(s);
469     }
470   }
471 
472   if (seqStoreName == nullptr)   err.push_back("No sequence store (-S) supplied.\n");
473   if (ovlFileNameA == nullptr)   err.push_back("No overlap file A (-A) supplied.\n");
474   if (ovlFileNameB == nullptr)   err.push_back("No overlap file B (-B) supplied.\n");
475   if (kmerDBname   == nullptr)   err.push_back("No kmer database (-S) supplied.\n");
476 
477   if (err.size() > 0) {
478     fprintf(stderr, "usage: %s -S seqStore -A ovlA -B ovlB -K kmers\n", argv[0]);
479     fprintf(stderr, "\n");
480     return(1);
481   }
482 
483   //  Open inputs.
484 
485   sqStore  *seq   = new sqStore(seqStoreName);
486   sqCache  *cache = new sqCache(seq);
487 
488   fprintf(stderr, "Loading reads.\n");
489   cache->sqCache_loadReads(true);
490 
491   //  Make a kmer lookup table.
492 
493   merylFileReader   *reader = new merylFileReader(kmerDBname);
494   merylExactLookup  *lookup = new merylExactLookup();
495 
496   fprintf(stderr, "Loading kmers.\n");
497   lookup->load(reader, maxMemory, true, false, uint32min, uint32max);
498 
499   delete reader;
500 
501   fprintf(stderr, "Lookup table loaded.\n");
502 
503   if (ovlFileNameA && ovlFileNameB) {
504     ovFile   *ovfA = new ovFile(seq, ovlFileNameA, ovFileFull);
505     ovFile   *ovfB = new ovFile(seq, ovlFileNameB, ovFileFull);
506 
507     compareFiles(seq, cache, ovfA, ovfB, lookup);
508 
509     delete ovfA;
510     delete ovfB;
511   }
512 
513 
514   if (ovlStoreNameA && ovlStoreNameB) {
515     ovStore  *ovsA = new ovStore(ovlStoreNameA, nullptr);
516     ovStore  *ovsB = new ovStore(ovlStoreNameB, nullptr);
517 
518     compareStores(seq, ovsA, ovsB, lookup);
519 
520     delete ovsA;
521     delete ovsB;
522   }
523 
524 
525   //  Done.  Cleanup.
526 
527   delete lookup;
528   delete seq;
529 
530   return(0);
531 }
532