1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "runtime.H"
19
20 #include "files.H"
21 #include "kmers.H"
22
23 #include "sqStore.H"
24 #include "sqCache.H"
25
26 #include "ovStore.H"
27
28 #include "edlib.H"
29
30 #include <vector>
31 #include <algorithm>
32
33
34
35 bool
overlapBefore(ovOverlap & A,ovOverlap & B)36 overlapBefore(ovOverlap &A, ovOverlap &B) {
37 if ((A.a_iid < B.a_iid))
38 return(true);
39
40 if ((A.a_iid == B.a_iid) &&
41 (A.b_iid < B.b_iid))
42 return(true);
43
44 if ((A.a_iid == B.a_iid) &&
45 (A.b_iid == B.b_iid) &&
46 (A.flipped() < B.flipped()))
47 return(true);
48
49 return(false);
50 }
51
52
53
54 void
findReadKmers(char const * seq,uint32 seqLen,uint32 olapBgn,uint32 olapEnd,std::set<kmer> & inOverlap,std::set<kmer> & allKmer)55 findReadKmers(char const *seq,
56 uint32 seqLen,
57 uint32 olapBgn,
58 uint32 olapEnd,
59 std::set<kmer> &inOverlap,
60 std::set<kmer> &allKmer) {
61
62 kmerIterator it(seq, seqLen);
63
64 inOverlap.clear();
65 allKmer.clear();
66
67 if (olapEnd < olapBgn)
68 std::swap(olapBgn, olapEnd);
69
70 while (it.nextMer() == true) {
71 kmer f = it.fmer();
72 kmer r = it.rmer();
73 kmer c = (f < r) ? f : r;
74
75 allKmer.insert(c);
76
77 if ((olapBgn < it.bgnPosition()) &&
78 (it.endPosition() <= olapEnd))
79 inOverlap.insert(c);
80 }
81 }
82
83
84
85 void
evaluateKmers(ovOverlap ovl,const char * Aalign,uint32 AalignLen,const char * Balign,uint32 BalignLen,uint32 & sharedRepeat,uint32 & sharedUnique,merylExactLookup * lookup)86 evaluateKmers(ovOverlap ovl,
87 const char *Aalign, uint32 AalignLen,
88 const char *Balign, uint32 BalignLen,
89 uint32 &sharedRepeat,
90 uint32 &sharedUnique,
91 merylExactLookup *lookup) {
92
93 kmerIterator Ait(Aalign, AalignLen);
94 kmerIterator Bit(Balign, BalignLen);
95
96 while ((Ait.nextBase() == true) &&
97 (Bit.nextBase() == true)) {
98 kmer af = Ait.fmer(), ar = Ait.rmer();
99 kmer bf = Bit.fmer(), br = Bit.rmer();
100
101 kmer ac = (af < ar) ? af : ar;
102 kmer bc = (bf < br) ? bf : br;
103
104 kmvalu av = lookup->value(ac);
105 kmvalu bv = lookup->value(bc);
106
107
108 if ((Ait.isValid() == false) && (Bit.isValid() == false)) {
109 continue;
110 }
111
112 if ((Ait.isValid() == true) && (Bit.isValid() == false)) {
113 continue;
114 }
115 if ((Ait.isValid() == false) && (Bit.isValid() == true)) {
116 continue;
117 }
118
119 // Both valid.
120
121 if (ac != bc) {
122 continue;
123 }
124
125 // Both the same.
126
127 assert(av == bv);
128
129 if (av <= 1333) {
130 sharedUnique++;
131 }
132
133 else {
134 sharedRepeat++;
135 }
136 }
137 }
138
139
140
141
142 void
dumpKmers(ovOverlap ovl,char const * Aalign,uint32 AalignLen,uint32 abgn,uint32 aend,char const * Balign,uint32 BalignLen,uint32 bbgn,uint32 bend,merylExactLookup * lookup)143 dumpKmers(ovOverlap ovl,
144 char const *Aalign, uint32 AalignLen, uint32 abgn, uint32 aend,
145 char const *Balign, uint32 BalignLen, uint32 bbgn, uint32 bend,
146 merylExactLookup *lookup) {
147
148 char NA[FILENAME_MAX];
149 char NK[FILENAME_MAX];
150
151 sprintf(NA, "align/%08u-%08u-%s.align", ovl.a_iid, ovl.b_iid, ovl.flipped() ? "flip" : "norm");
152 sprintf(NK, "align/%08u-%08u-%s.kmers", ovl.a_iid, ovl.b_iid, ovl.flipped() ? "flip" : "norm");
153
154 FILE *FA = AS_UTL_openOutputFile(NA);
155 FILE *FK = AS_UTL_openOutputFile(NK);
156
157 // Output the alignments.
158
159 fprintf(FA, "<pre>\n");
160 fprintf(FA, "%-8u %6u-%-6u %s %s\n", ovl.a_iid, abgn, aend, ovl.flipped() ? "fwd" : "fwd", Aalign);
161 fprintf(FA, "%-8u %6u-%-6u %s %s\n", ovl.b_iid, bbgn, bend, ovl.flipped() ? "rev" : "fwd", Balign);
162 fprintf(FA, "</pre>\n");
163
164 // Output the kmers.
165
166 kmerIterator Ait(Aalign, AalignLen);
167 kmerIterator Bit(Balign, BalignLen);
168
169 while ((Ait.nextBase() == true) &&
170 (Bit.nextBase() == true)) {
171 kmer af = Ait.fmer(), ar = Ait.rmer();
172 kmer bf = Bit.fmer(), br = Bit.rmer();
173
174 kmer ac = (af < ar) ? af : ar;
175 kmer bc = (bf < br) ? bf : br;
176
177 if (Ait.isValid() &&
178 Bit.isValid()) {
179 uint64 pos = Ait.bgnPosition();
180
181 fprintf(FK, "%lu\t%u\t%u\t%c\t%c\n",
182 pos,
183 lookup->value(ac), lookup->value(bc),
184 Aalign[pos], Balign[pos]);
185 }
186
187 else if (Ait.isValid()) {
188 //fprintf(FK, "%u\t%u\t%u\n", Ait.bgnPosition(), lookup->value(ac), 0);
189 }
190
191 else if (Bit.isValid()) {
192 //fprintf(FK, "%u\t%u\t%u\n", Ait.bgnPosition(), 0, lookup->value(bc));
193 }
194
195 else {
196 }
197 }
198
199 // Cleanup.
200
201 AS_UTL_closeFile(FA);
202 AS_UTL_closeFile(FK);
203 }
204
205
206
207 void
compareOverlapKmers(sqCache * cache,merylExactLookup * lookup,ovOverlap ovl)208 compareOverlapKmers(sqCache *cache,
209 merylExactLookup *lookup,
210 ovOverlap ovl) {
211
212 std::set<kmer> Ain, Aall;
213 std::set<kmer> Bin, Ball;
214
215 uint32 AbasesLen = 0, AbasesMax = 0; char *Abases = nullptr;
216 uint32 BbasesLen = 0, BbasesMax = 0; char *Bbases = nullptr;
217
218 uint32 AalignLen = 0; char *Aalign = nullptr;
219 uint32 BalignLen = 0; char *Balign = nullptr;
220
221 cache->sqCache_getSequence(ovl.a_iid, Abases, AbasesLen, AbasesMax);
222 cache->sqCache_getSequence(ovl.b_iid, Bbases, BbasesLen, BbasesMax);
223
224 if (ovl.flipped() == true)
225 reverseComplementSequence(Bbases, BbasesLen);
226
227 EdlibAlignResult res;
228
229 int32 abgn = ovl.a_bgn();
230 int32 aend = ovl.a_end();
231 int32 bbgn = ovl.flipped() ? BbasesLen - ovl.b_bgn() : ovl.b_bgn();
232 int32 bend = ovl.flipped() ? BbasesLen - ovl.b_end() : ovl.b_end();
233
234 //fprintf(stderr, "Align %u %u-%u %u to %u %u-%u %u\n",
235 // ovl.a_iid, abgn, aend, AbasesLen,
236 // ovl.b_iid, bbgn, bend, BbasesLen);
237
238 res = edlibAlign(Abases + abgn, aend - abgn,
239 Bbases + bbgn, bend - bbgn,
240 edlibNewAlignConfig((aend - abgn) * 0.25, EDLIB_MODE_NW, EDLIB_TASK_PATH));
241
242 AalignLen = res.alignmentLength + 1; Aalign = new char [AalignLen];
243 BalignLen = res.alignmentLength + 1; Balign = new char [BalignLen];
244
245 if (res.numLocations == 0) {
246 fprintf(stderr, "Not aligned.\n");
247 assert(0);
248 } else {
249 //fprintf(stderr, "Aligned with distance %d length %d\n", res.editDistance, res.alignmentLength);
250 }
251
252 edlibAlignmentToStrings(res,
253 Abases+abgn, aend-abgn,
254 Bbases+bbgn, bend-bbgn,
255 Aalign,
256 Balign);
257
258
259
260
261 uint32 sharedRepeat = 0;
262 uint32 sharedUnique = 0;
263
264 evaluateKmers(ovl,
265 Aalign, AalignLen,
266 Balign, BalignLen,
267 sharedRepeat,
268 sharedUnique,
269 lookup);
270
271
272 #if 0
273 findReadKmers(Aalign, AalignLen, 0, AalignLen, Ain, Aall);
274 findReadKmers(Balign, BalignLen, 0, BalignLen, Bin, Ball);
275
276 for (auto it=Ain.begin(); it != Ain.end(); ++it) {
277 kmer k = *it;
278 kmvalu v = lookup->value(k);
279
280 if (Bin.count(k) == 0)
281 continue;
282
283 if (v == 0) {
284 nocount++;
285 continue;
286 }
287
288 if (v < 1333) {
289 sharedUnique++;
290 } else {
291 sharedRepeat++;
292 }
293 }
294 #endif
295
296 if (sharedUnique > 0) {
297 fprintf(stdout, "KMERS unique %u\n", sharedUnique);
298 fprintf(stdout, " repeat %u\n", sharedRepeat);
299 fprintf(stdout, "OVERLAP A %6u-%6u %6lu/%6lu kmers\n", ovl.a_bgn(), ovl.a_end(), Ain.size(), Aall.size());
300 fprintf(stdout, " B %6u-%6u %6lu/%6lu kmers\n", ovl.b_bgn(), ovl.b_end(), Bin.size(), Ball.size());
301 fprintf(stdout, "perl plot-kmer.pl %u %u %u %u %u %u %u %u\n",
302 ovl.a_iid, ovl.a_bgn(), ovl.a_end(), AbasesLen,
303 ovl.b_iid, ovl.b_bgn(), ovl.b_end(), BbasesLen);
304 fprintf(stdout, "plot [][1:6666] 'align/%08d-%08d-%s.kmers' using 1:2, 'align/%08d-%08d-%s.kmers' using 1:3, 1333\n",
305 ovl.a_iid, ovl.b_iid, ovl.flipped() ? "flip" : "norm",
306 ovl.a_iid, ovl.b_iid, ovl.flipped() ? "flip" : "norm");
307 fprintf(stdout, "\n");
308
309 dumpKmers(ovl,
310 Aalign, AalignLen, abgn, aend,
311 Balign, BalignLen, bbgn, bend, lookup);
312 }
313
314 delete [] Abases; delete [] Aalign;
315 delete [] Bbases; delete [] Balign;
316 }
317
318
319
320 void
compareFiles(sqStore * seq,sqCache * cache,ovFile * ovfA,ovFile * ovfB,merylExactLookup * lookup)321 compareFiles(sqStore *seq,
322 sqCache *cache,
323 ovFile *ovfA, ovFile *ovfB,
324 merylExactLookup *lookup) {
325 uint64 ovlAmax = ovfA->getCounts()->numOverlaps();
326 uint64 ovlBmax = ovfB->getCounts()->numOverlaps();
327
328 // Allocate space for and load overlaps.
329
330 fprintf(stderr, "Allocating 2x %lu A and 2x %lu B overlaps.\n", ovlAmax, ovlBmax);
331
332 ovOverlap *ovlA = new ovOverlap [2 * ovlAmax];
333 ovOverlap *ovlB = new ovOverlap [2 * ovlBmax];
334
335 fprintf(stderr, "Loading overlaps.\n");
336
337 uint64 ovlAlen = ovfA->readOverlaps(ovlA, ovlAmax);
338 uint64 ovlBlen = ovfB->readOverlaps(ovlB, ovlBmax);
339
340 // Duplicate all overlaps into their other twin
341
342 fprintf(stderr, "Duplicating overlaps.\n");
343
344 for (uint32 ii=0; ii<ovlAmax; ii++, ovlAlen++)
345 ovlA[ovlAlen].swapIDs(ovlA[ii]);
346
347 for (uint32 ii=0; ii<ovlBmax; ii++, ovlBlen++)
348 ovlB[ovlBlen].swapIDs(ovlB[ii]);
349
350 // Sort.
351
352 fprintf(stderr, "Sorting overlaps.\n");
353
354 std::sort(ovlA, ovlA + ovlAlen);
355 std::sort(ovlB, ovlB + ovlBlen);
356
357 uint64 onlyA=0, onlyB=0, same=0;
358
359 fprintf(stderr, "Comparing overlaps.\n");
360
361 for (uint32 aa=0, bb=0; (aa < ovlAlen) || (bb < ovlBlen); ) {
362
363 // If one runs out of overlaps, examine the other one.
364
365 if (aa == ovlAlen) {
366 fprintf(stdout, " -- B#%-5u %8u-%8u %c %6.3f%%\n",
367 bb, ovlB[bb].a_iid, ovlB[bb].b_iid, ovlB[bb].flipped() ? 'f' : 'n', ovlB[bb].erate() * 100.0);
368
369 onlyB++;
370 bb++;
371 continue;
372 }
373
374 if (bb == ovlBlen) {
375 fprintf(stdout, "A#%-5u %8u-%8u %c %6.3f%% --\n",
376 aa, ovlA[aa].a_iid, ovlA[aa].b_iid, ovlA[aa].flipped() ? 'f' : 'n', ovlA[aa].erate() * 100.0);
377
378 onlyA++;
379 aa++;
380 continue;
381 }
382
383 // If they're the same reads and the same orientation, count
384 // it as a match.
385 if ((ovlA[aa].a_iid == ovlB[bb].a_iid) &&
386 (ovlA[aa].b_iid == ovlB[bb].b_iid) &&
387 (ovlA[aa].flipped() == ovlB[bb].flipped())) {
388 fprintf(stdout, "A#%-5u %8u-%8u %c %6.3f%% == B#%-5u %8u-%8u %c %6.3f%%\n",
389 aa, ovlA[aa].a_iid, ovlA[aa].b_iid, ovlA[aa].flipped() ? 'f' : 'n', ovlA[aa].erate() * 100.0,
390 bb, ovlB[bb].a_iid, ovlB[bb].b_iid, ovlB[bb].flipped() ? 'f' : 'n', ovlB[bb].erate() * 100.0);
391
392 same++;
393 aa++;
394 bb++;
395 continue;
396 }
397
398 //
399
400 if (overlapBefore(ovlA[aa], ovlB[bb]) == true) {
401 fprintf(stdout, "A#%-5u %8u-%8u %c %6.3f%% <+\n",
402 aa, ovlA[aa].a_iid, ovlA[aa].b_iid, ovlA[aa].flipped() ? 'f' : 'n', ovlA[aa].erate() * 100.0);
403
404 onlyA++;
405 aa++;
406 } else {
407 fprintf(stdout, " +> B#%-5u %8u-%8u %c %6.3f%%\n",
408 bb, ovlB[bb].a_iid, ovlB[bb].b_iid, ovlB[bb].flipped() ? 'f' : 'n', ovlB[bb].erate() * 100.0);
409
410 compareOverlapKmers(cache, lookup, ovlB[bb]);
411
412 onlyB++;
413 bb++;
414 }
415 }
416
417 fprintf(stderr, "onlyA: %lu\n", onlyA);
418 fprintf(stderr, "onlyB: %lu\n", onlyB);
419 fprintf(stderr, "same: %lu\n", same);
420
421 delete [] ovlA;
422 delete [] ovlB;
423 }
424
425
426
427 void
compareStores(sqStore * seq,ovStore * ovfA,ovStore * ovfB,merylExactLookup * lookup)428 compareStores(sqStore *seq,
429 ovStore *ovfA, ovStore *ovfB,
430 merylExactLookup *lookup) {
431 }
432
433
434
435 int
main(int argc,char ** argv)436 main(int argc, char **argv) {
437 char const *seqStoreName = nullptr;
438 char const *ovlFileNameA = nullptr;
439 char const *ovlStoreNameA = nullptr;
440 char const *ovlFileNameB = nullptr;
441 char const *ovlStoreNameB = nullptr;
442 char const *kmerDBname = nullptr;
443 uint64 maxMemory = 32llu * 1024 * 1024 * 1024;
444
445 argc = AS_configure(argc, argv, 16);
446
447 std::vector<char const *> err;
448 for (int arg=1; arg < argc; arg++) {
449 if (strcmp(argv[arg], "-S") == 0) {
450 seqStoreName = argv[++arg];
451
452 } else if (strcmp(argv[arg], "-Af") == 0) {
453 ovlFileNameA = argv[++arg];
454 } else if (strcmp(argv[arg], "-A") == 0) {
455 ovlStoreNameA = argv[++arg];
456
457 } else if (strcmp(argv[arg], "-Bf") == 0) {
458 ovlFileNameB = argv[++arg];
459 } else if (strcmp(argv[arg], "-B") == 0) {
460 ovlStoreNameB = argv[++arg];
461
462 } else if (strcmp(argv[arg], "-K") == 0) {
463 kmerDBname = argv[++arg];
464
465 } else {
466 char *s = new char [1024];
467 snprintf(s, 1024, "Unknown option '%s'.\n", argv[arg]);
468 err.push_back(s);
469 }
470 }
471
472 if (seqStoreName == nullptr) err.push_back("No sequence store (-S) supplied.\n");
473 if (ovlFileNameA == nullptr) err.push_back("No overlap file A (-A) supplied.\n");
474 if (ovlFileNameB == nullptr) err.push_back("No overlap file B (-B) supplied.\n");
475 if (kmerDBname == nullptr) err.push_back("No kmer database (-S) supplied.\n");
476
477 if (err.size() > 0) {
478 fprintf(stderr, "usage: %s -S seqStore -A ovlA -B ovlB -K kmers\n", argv[0]);
479 fprintf(stderr, "\n");
480 return(1);
481 }
482
483 // Open inputs.
484
485 sqStore *seq = new sqStore(seqStoreName);
486 sqCache *cache = new sqCache(seq);
487
488 fprintf(stderr, "Loading reads.\n");
489 cache->sqCache_loadReads(true);
490
491 // Make a kmer lookup table.
492
493 merylFileReader *reader = new merylFileReader(kmerDBname);
494 merylExactLookup *lookup = new merylExactLookup();
495
496 fprintf(stderr, "Loading kmers.\n");
497 lookup->load(reader, maxMemory, true, false, uint32min, uint32max);
498
499 delete reader;
500
501 fprintf(stderr, "Lookup table loaded.\n");
502
503 if (ovlFileNameA && ovlFileNameB) {
504 ovFile *ovfA = new ovFile(seq, ovlFileNameA, ovFileFull);
505 ovFile *ovfB = new ovFile(seq, ovlFileNameB, ovFileFull);
506
507 compareFiles(seq, cache, ovfA, ovfB, lookup);
508
509 delete ovfA;
510 delete ovfB;
511 }
512
513
514 if (ovlStoreNameA && ovlStoreNameB) {
515 ovStore *ovsA = new ovStore(ovlStoreNameA, nullptr);
516 ovStore *ovsB = new ovStore(ovlStoreNameB, nullptr);
517
518 compareStores(seq, ovsA, ovsB, lookup);
519
520 delete ovsA;
521 delete ovsB;
522 }
523
524
525 // Done. Cleanup.
526
527 delete lookup;
528 delete seq;
529
530 return(0);
531 }
532