1 #include "KmerIndex.h"
2 #include <algorithm>
3 #include <random>
4 #include <ctype.h>
5 #include <zlib.h>
6 #include <unordered_set>
7 #include "kseq.h"
8
9 #ifndef KSEQ_INIT_READY
10 #define KSEQ_INIT_READY
KSEQ_INIT(gzFile,gzread)11 KSEQ_INIT(gzFile, gzread)
12 #endif
13
14 // helper functions
15 // pre: u is sorted
16 bool isUnique(const std::vector<int>& u) {
17 for (int j = 1; j < u.size(); j++) {
18 if (u[j-1] == u[j]) {
19 return false;
20 }
21 }
22 return true;
23 }
24
unique(const std::vector<int> & u)25 std::vector<int> unique(const std::vector<int>& u) {
26 std::vector<int> v;
27 v.reserve(u.size());
28 v.push_back(u[0]);
29 for (int j = 1; j < u.size(); j++) {
30 if (u[j-1] != u[j]) {
31 v.push_back(u[j]);
32 }
33 }
34 return v;
35 }
36
Dna(int i)37 const char Dna(int i) {
38 static const char *dna = "ACGT";
39 return dna[i & 0x03];
40 }
41
hamming(const char * a,const char * b)42 int hamming(const char *a, const char *b) {
43 int h = 0;
44 while (*a != 0 && *b != 0) {
45 if (*a != *b) {
46 h++;
47 }
48 a++;
49 b++;
50 }
51 return h;
52 }
53
revcomp(const std::string s)54 std::string revcomp(const std::string s) {
55 std::string r(s);
56 std::transform(s.rbegin(), s.rend(), r.begin(), [](char c) {
57 switch(c) {
58 case 'A': return 'T';
59 case 'C': return 'G';
60 case 'G': return 'C';
61 case 'T': return 'A';
62 default: return 'N';
63 }
64 return 'N';
65 });
66 return r;
67 }
68
BuildTranscripts(const ProgramOptions & opt)69 void KmerIndex::BuildTranscripts(const ProgramOptions& opt) {
70 // read input
71 std::unordered_set<std::string> unique_names;
72 int k = opt.k;
73 for (auto& fasta : opt.transfasta) {
74 std::cerr << "[build] loading fasta file " << fasta
75 << std::endl;
76 }
77 std::cerr << "[build] k-mer length: " << k << std::endl;
78
79
80 std::vector<std::string> seqs;
81
82 // read fasta file
83 gzFile fp = 0;
84 kseq_t *seq;
85 int l = 0;
86 std::mt19937 gen(42);
87 int countNonNucl = 0;
88 int countUNuc = 0;
89 int polyAcount = 0;
90
91 for (auto& fasta : opt.transfasta) {
92 fp = gzopen(fasta.c_str(), "r");
93 seq = kseq_init(fp);
94 while (true) {
95 l = kseq_read(seq);
96 if (l <= 0) {
97 break;
98 }
99 seqs.emplace_back(seq->seq.s);
100 std::string& str = *seqs.rbegin();
101 auto n = str.size();
102 for (auto i = 0; i < n; i++) {
103 char c = str[i];
104 c = ::toupper(c);
105 if (c=='U') {
106 str[i] = 'T';
107 countUNuc++;
108 } else if (c !='A' && c != 'C' && c != 'G' && c != 'T') {
109 str[i] = Dna(gen()); // replace with pseudorandom string
110 countNonNucl++;
111 }
112 }
113 std::transform(str.begin(), str.end(),str.begin(), ::toupper);
114
115 if (str.size() >= 10 && str.substr(str.size()-10,10) == "AAAAAAAAAA") {
116 // clip off polyA tail
117 //std::cerr << "[index] clipping off polyA tail" << std::endl;
118 polyAcount++;
119 int j;
120 for (j = str.size()-1; j >= 0 && str[j] == 'A'; j--) {}
121 str = str.substr(0,j+1);
122 }
123
124
125 target_lens_.push_back(seq->seq.l);
126 std::string name(seq->name.s);
127 size_t p = name.find(' ');
128 if (p != std::string::npos) {
129 name = name.substr(0,p);
130 }
131
132 if (unique_names.find(name) != unique_names.end()) {
133 if (!opt.make_unique) {
134 std::cerr << "Error: repeated name in FASTA file " << fasta << "\n" << name << "\n\n" << "Run with --make-unique to replace repeated names with unique names" << std::endl;
135 exit(1);
136 } else {
137 for (int i = 1; ; i++) { // potential bug if you have more than 2^32 repeated names
138 std::string new_name = name + "_" + std::to_string(i);
139 if (unique_names.find(new_name) == unique_names.end()) {
140 name = new_name;
141 break;
142 }
143 }
144 }
145 }
146 unique_names.insert(name);
147 target_names_.push_back(name);
148
149 }
150 gzclose(fp);
151 fp=0;
152 }
153
154 if (polyAcount > 0) {
155 std::cerr << "[build] warning: clipped off poly-A tail (longer than 10)" << std::endl << " from " << polyAcount << " target sequences" << std::endl;
156 }
157
158
159 if (countNonNucl > 0) {
160 std::cerr << "[build] warning: replaced " << countNonNucl << " non-ACGUT characters in the input sequence" << std::endl << " with pseudorandom nucleotides" << std::endl;
161 }
162 if (countUNuc > 0) {
163 std::cerr << "[build] warning: replaced " << countUNuc << " U characters with Ts" << std::endl;
164 }
165
166 num_trans = seqs.size();
167
168 // for each target, create it's own equivalence class
169 for (int i = 0; i < seqs.size(); i++ ) {
170 std::vector<int> single(1,i);
171 //ecmap.insert({i,single});
172 ecmap.push_back(single);
173 ecmapinv.insert({single,i});
174 }
175
176 BuildDeBruijnGraph(opt, seqs);
177 BuildEquivalenceClasses(opt, seqs);
178 //BuildEdges(opt);
179
180 }
181
BuildDeBruijnGraph(const ProgramOptions & opt,const std::vector<std::string> & seqs)182 void KmerIndex::BuildDeBruijnGraph(const ProgramOptions& opt, const std::vector<std::string>& seqs) {
183
184
185 std::cerr << "[build] counting k-mers ... "; std::cerr.flush();
186 // gather all k-mers
187 for (int i = 0; i < seqs.size(); i++) {
188 const char *s = seqs[i].c_str();
189 KmerIterator kit(s),kit_end;
190 for (; kit != kit_end; ++kit) {
191 kmap.insert({kit->first.rep(), KmerEntry()}); // don't care about repeats
192 // Kmer rep = kit->first.rep();
193 // std::cout << rep.toString() << "\t" << rep.hash() << std::endl;
194 }
195 }
196 std::cerr << "done." << std::endl;
197
198 std::cerr << "[build] building target de Bruijn graph ... "; std::cerr.flush();
199 // find out how much we can skip ahead for each k-mer.
200 for (auto& kv : kmap) {
201 if (kv.second.contig == -1) {
202 // ok we haven't processed the k-mer yet
203 std::vector<Kmer> flist, blist;
204
205 // iterate in forward direction
206 Kmer km = kv.first;
207 Kmer end = km;
208 Kmer last = end;
209 Kmer twin = km.twin();
210 bool selfLoop = false;
211 flist.push_back(km);
212
213 while (fwStep(end,end)) {
214 if (end == km) {
215 // selfloop
216 selfLoop = true;
217 break;
218 } else if (end == twin) {
219 selfLoop = (flist.size() > 1); // hairpins are not loops
220 // mobius loop
221 break;
222 } else if (end == last.twin()) {
223 // hairpin
224 break;
225 }
226 flist.push_back(end);
227 last = end;
228 }
229
230 Kmer front = twin;
231 Kmer first = front;
232
233 if (!selfLoop) {
234 while (fwStep(front,front)) {
235 if (front == twin) {
236 // selfloop
237 selfLoop = true;
238 break;
239 } else if (front == km) {
240 // mobius loop
241 selfLoop = true;
242 break;
243 } else if (front == first.twin()) {
244 // hairpin
245 break;
246 }
247 blist.push_back(front);
248 first = front;
249 }
250 }
251
252 std::vector<Kmer> klist;
253 for (auto it = blist.rbegin(); it != blist.rend(); ++it) {
254 klist.push_back(it->twin());
255 }
256 for (auto x : flist) {
257 klist.push_back(x);
258 }
259
260
261 Contig contig;
262 contig.id = dbGraph.contigs.size();
263 contig.length = klist.size();
264 contig.seq = klist[0].toString();
265 contig.seq.reserve(contig.length + k-1);
266
267
268 for (int i = 0; i < klist.size(); i++) {
269 Kmer x = klist[i];
270 Kmer xr = x.rep();
271 bool forward = (x==xr);
272 auto it = kmap.find(xr);
273 assert(it->second.contig==-1);
274 it->second = KmerEntry(contig.id, contig.length, i, forward);
275 if (i > 0) {
276 contig.seq.push_back(x.toString()[k-1]);
277 }
278 }
279
280 dbGraph.contigs.push_back(contig);
281 dbGraph.ecs.push_back(-1);
282 }
283 }
284 std::cerr << " done " << std::endl;
285
286 }
287
BuildEquivalenceClasses(const ProgramOptions & opt,const std::vector<std::string> & seqs)288 void KmerIndex::BuildEquivalenceClasses(const ProgramOptions& opt, const std::vector<std::string>& seqs) {
289 std::cerr << "[build] creating equivalence classes ... "; std::cerr.flush();
290
291 std::vector<std::vector<TRInfo>> trinfos(dbGraph.contigs.size());
292 //std::cout << "Mapping target " << std::endl;
293 for (int i = 0; i < seqs.size(); i++) {
294 int seqlen = seqs[i].size() - k + 1; // number of k-mers
295 const char *s = seqs[i].c_str();
296 //std::cout << "sequence number " << i << std::endl;
297 KmerIterator kit(s), kit_end;
298 for (; kit != kit_end; ++kit) {
299 Kmer x = kit->first;
300 Kmer xr = x.rep();
301 auto search = kmap.find(xr);
302 bool forward = (x==xr);
303 KmerEntry val = search->second;
304 std::vector<TRInfo>& trinfo = trinfos[val.contig];
305 Contig& contig = dbGraph.contigs[val.contig];
306
307
308
309 TRInfo tr;
310 tr.trid = i;
311 int jump = kit->second;
312 if (forward == val.isFw()) {
313 tr.sense = true;
314 tr.start = val.getPos();
315 if (contig.length - tr.start > seqlen - kit->second) {
316 // tartget stops
317 tr.stop = tr.start + seqlen - kit->second;
318 jump = seqlen;
319 } else {
320 tr.stop = contig.length;
321 jump = kit->second + (tr.stop - tr.start)-1;
322 }
323 } else {
324 tr.sense = false;
325 tr.stop = val.getPos()+1;
326 int stpos = tr.stop - (seqlen - kit->second);
327 if (stpos > 0) {
328 tr.start = stpos;
329 jump = seqlen;
330 } else {
331 tr.start = 0;
332 jump = kit->second + (tr.stop - tr.start) - 1;
333 }
334 }
335
336 // // debugging -->
337 //std::cout << "covering seq" << std::endl << seqs[i].substr(kit->second, jump-kit->second +k) << std::endl;
338 //std::cout << "id = " << tr.trid << ", (" << tr.start << ", " << tr.stop << ")" << std::endl;
339 //std::cout << "contig seq" << std::endl;
340 // if (forward == val.isFw()) {
341 // //std::cout << contig.seq << std::endl;
342 // assert(contig.seq.substr(tr.start, k-1 + tr.stop-tr.start) == seqs[i].substr(kit->second, jump-kit->second +k) );
343 // } else {
344 // //std::cout << revcomp(contig.seq) << std::endl;
345 // assert(revcomp(contig.seq.substr(tr.start, k-1 + tr.stop-tr.start)) == seqs[i].substr(kit->second, jump-kit->second +k));
346 // }
347 // if (jump == seqlen) {
348 // //std::cout << std::string(k-1+(tr.stop-tr.start)-1,'-') << "^" << std::endl;
349 // }
350
351 // // <-- debugging
352
353 trinfo.push_back(tr);
354 kit.jumpTo(jump);
355 }
356 }
357
358
359 FixSplitContigs(opt, trinfos);
360
361
362 int perftr = 0;
363 for (int i = 0; i < trinfos.size(); i++) {
364 bool all = true;
365
366 int contigLen = dbGraph.contigs[i].length;
367 //std::cout << "contig " << i << ", length = " << contigLen << ", seq = " << dbGraph.contigs[i].seq << std::endl << "tr = ";
368 for (auto x : trinfos[i]) {
369 if (x.start!=0 || x.stop !=contigLen) {
370 all = false;
371 }
372 //std::cout << "[" << x.trid << ",(" << x.start << ", " << x.stop << ")], " ;
373 }
374 //std::cout << std::endl;
375 if (all) {
376 perftr++;
377 }
378 }
379 //std::cerr << "For " << dbGraph.contigs.size() << ", " << (dbGraph.contigs.size() - perftr) << " of them need to be split" << std::endl;
380 // need to create the equivalence classes
381
382 assert(dbGraph.contigs.size() == trinfos.size());
383 // for each contig
384 for (int i = 0; i < trinfos.size(); i++) {
385 std::vector<int> u;
386 for (auto x : trinfos[i]) {
387 u.push_back(x.trid);
388 }
389 sort(u.begin(), u.end());
390 if (!isUnique(u)){
391 std::vector<int> v = unique(u);
392 swap(u,v);
393 }
394
395 assert(!u.empty());
396
397 auto search = ecmapinv.find(u);
398 int ec = -1;
399 if (search != ecmapinv.end()) {
400 // insert contig -> ec info
401 ec = search->second;
402 } else {
403 ec = ecmapinv.size();
404 ecmapinv.insert({u,ec});
405 ecmap.push_back(u);
406 }
407 dbGraph.ecs[i] = ec;
408 assert(ec != -1);
409
410 // record the transc
411 Contig& contig = dbGraph.contigs[i];
412 contig.ec = ec;
413 // correct ec of all k-mers in contig
414 /*
415 KmerIterator kit(contig.seq.c_str()), kit_end;
416 for (; kit != kit_end; ++kit) {
417 auto ksearch = kmap.find(kit->first.rep());
418 //ksearch->second.ec = ec;
419 }
420 */
421 }
422
423 // map transcripts to contigs
424 //std::cout << std::endl;
425 for (int i = 0; i < seqs.size(); i++) {
426 int seqlen = seqs[i].size() - k + 1; // number of k-mers
427 // debugging
428 std::string stmp;
429 const char *s = seqs[i].c_str();
430 ////std::cout << "sequence number " << i << std::endl;
431 //std::cout << ">" << target_names_[i] << std::endl;
432 //std::cout << seqs[i] << std::endl;
433 KmerIterator kit(s), kit_end;
434 for (; kit != kit_end; ++kit) {
435 Kmer x = kit->first;
436 //std::cout << "position = " << kit->second << ", mapping " << x.toString() << std::endl;
437 Kmer xr = x.rep();
438 auto search = kmap.find(xr);
439 bool forward = (x==xr);
440 KmerEntry val = search->second;
441 Contig& contig = dbGraph.contigs[val.contig];
442
443 ContigToTranscript info;
444 info.trid = i;
445 info.pos = kit->second;
446 info.sense = (forward == val.isFw());
447 int jump = kit->second + contig.length-1;
448 //std::cout << "mapped to contig " << val.contig << ", len = " << contig.length << ", pos = " << val.getPos() << ", sense = " << info.sense << std::endl;
449 contig.transcripts.push_back(info);
450 // debugging
451 if (info.sense) {
452
453 if (info.pos == 0) {
454 stmp.append(contig.seq);
455 //std::cout << contig.seq << std::endl;
456 } else {
457 stmp.append(contig.seq.substr(k-1));
458 //std::cout << contig.seq.substr(k-1) << std::endl;
459 }
460 } else {
461 std::string r = revcomp(contig.seq);
462 if (info.pos == 0) {
463 stmp.append(r);
464 //std::cout << r << std::endl;
465 } else {
466 stmp.append(r.substr(k-1));
467 //std::cout << r.substr(k-1) << std::endl;
468 }
469 }
470 //std::cout << stmp << std::endl;
471 //std::cout << "covering seq" << std::endl << seqs[i].substr(kit->second, jump-kit->second +k) << std::endl;
472 //std::cout << "id = " << tr.trid << ", (" << tr.start << ", " << tr.stop << ")" << std::endl;
473 //std::cout << "contig seq" << std::endl;
474 // if (forward == val.isFw()) {
475 // //std::cout << contig.seq << std::endl;
476 // assert(contig.seq.substr(tr.start, k-1 + tr.stop-tr.start) == seqs[i].substr(kit->second, jump-kit->second +k) );
477 // } else {
478 // //std::cout << revcomp(contig.seq) << std::endl;
479 // assert(revcomp(contig.seq.substr(tr.start, k-1 + tr.stop-tr.start)) == seqs[i].substr(kit->second, jump-kit->second +k));
480 // }
481 // if (jump == seqlen) {
482 // //std::cout << std::string(k-1+(tr.stop-tr.start)-1,'-') << "^" << std::endl;
483 // }
484
485 // // <-- debugging
486
487 kit.jumpTo(jump);
488 }
489 if (seqlen > 0 && seqs[i] != stmp) {
490 /*std::cout << ">" << target_names_[i] << std::endl
491 << seqs[i] << std::endl
492 << stmp << std::endl;*/
493 assert(false);
494
495 }
496 }
497
498 // double check the contigs
499 for (auto &c : dbGraph.contigs) {
500 for (auto info : c.transcripts) {
501 std::string r;
502 if (info.sense) {
503 r = c.seq;
504 } else {
505 r = revcomp(c.seq);
506 }
507 assert(r == seqs[info.trid].substr(info.pos,r.size()));
508 }
509 }
510
511
512 std::cerr << " done" << std::endl;
513 std::cerr << "[build] target de Bruijn graph has " << dbGraph.contigs.size() << " contigs and contains " << kmap.size() << " k-mers " << std::endl;
514
515
516
517
518
519 }
520
FixSplitContigs(const ProgramOptions & opt,std::vector<std::vector<TRInfo>> & trinfos)521 void KmerIndex::FixSplitContigs(const ProgramOptions& opt, std::vector<std::vector<TRInfo>>& trinfos) {
522
523 int perftr = 0;
524 int orig_size = trinfos.size();
525
526 for (int i = 0; i < orig_size; i++) {
527 bool all = true;
528
529 int contigLen = dbGraph.contigs[i].length;
530 //std::cout << "contig " << i << ", length = " << contigLen << ", seq = " << dbGraph.contigs[i].seq << std::endl << "tr = ";
531 for (auto x : trinfos[i]) {
532 if (x.start!=0 || x.stop !=contigLen) {
533 all = false;
534 }
535 //std::cout << "[" << x.trid << ",(" << x.start << ", " << x.stop << ")], " ;
536 assert(x.start < x.stop);
537 }
538 //std::cout << std::endl;
539
540 if (all) {
541 perftr++;
542 } else {
543 // break up equivalence classes
544 // sort by start/stop
545 std::vector<int> brpoints;
546 for (auto& x : trinfos[i]) {
547 brpoints.push_back(x.start);
548 brpoints.push_back(x.stop);
549 }
550 sort(brpoints.begin(), brpoints.end());
551 assert(brpoints[0] == 0);
552 assert(brpoints[brpoints.size()-1]==contigLen);
553
554 // find unique points
555 if (!isUnique(brpoints)) {
556 std::vector<int> u = unique(brpoints);
557 swap(u,brpoints);
558 }
559
560 assert(!brpoints.empty());
561
562 // copy sequence
563 std::string seq = dbGraph.contigs[i].seq;
564 // copy old trinfo
565 std::vector<TRInfo> oldtrinfo = trinfos[i];
566
567 for (int j = 1; j < brpoints.size(); j++) {
568 assert(brpoints[j-1] < brpoints[j]);
569 Contig newc;
570 newc.seq = seq.substr(brpoints[j-1], brpoints[j]-brpoints[j-1]+k-1);
571 newc.length = brpoints[j]-brpoints[j-1];
572
573 if (j>1) {
574 newc.id = dbGraph.contigs.size();
575 dbGraph.contigs.push_back(newc);
576 dbGraph.ecs.push_back(-1);
577 } else {
578 newc.id = i;
579 dbGraph.contigs[i] = newc;
580 }
581
582 // repair k-mer mapping
583 KmerIterator kit(newc.seq.c_str()), kit_end;
584 for (; kit != kit_end; ++kit) {
585 Kmer x = kit->first;
586 Kmer xr = x.rep();
587 auto search = kmap.find(xr);
588 assert(search != kmap.end());
589 bool forward = (x==xr);
590 search->second = KmerEntry(newc.id, newc.length, kit->second, forward);
591 }
592
593 // repair tr-info
594 std::vector<TRInfo> newtrinfo;
595 for (auto x : oldtrinfo) {
596 if (!(x.stop <= brpoints[j-1] || x.start >= brpoints[j])) {
597 TRInfo trinfo;
598 trinfo.sense = x.sense;
599 trinfo.trid = x.trid;
600 trinfo.start = 0;
601 trinfo.stop = newc.length;
602 newtrinfo.push_back(trinfo);
603 }
604 }
605 if (j>1) {
606 trinfos.push_back(newtrinfo);
607 } else {
608 trinfos[i] = newtrinfo;
609 }
610 }
611 }
612 }
613
614
615 //std::cerr << "For " << dbGraph.contigs.size() << ", " << (dbGraph.contigs.size() - perftr) << " of them need to be split" << std::endl;
616
617
618 }
619
620
621
622
write(const std::string & index_out,bool writeKmerTable)623 void KmerIndex::write(const std::string& index_out, bool writeKmerTable) {
624 std::ofstream out;
625 out.open(index_out, std::ios::out | std::ios::binary);
626
627 if (!out.is_open()) {
628 // TODO: better handling
629 std::cerr << "Error: index output file could not be opened!";
630 exit(1);
631 }
632
633 // 1. write version
634 out.write((char *)&INDEX_VERSION, sizeof(INDEX_VERSION));
635
636 // 2. write k
637 out.write((char *)&k, sizeof(k));
638
639 // 3. write number of targets
640 out.write((char *)&num_trans, sizeof(num_trans));
641
642 // 4. write out target lengths
643 for (int tlen : target_lens_) {
644 out.write((char *)&tlen, sizeof(tlen));
645 }
646
647 size_t kmap_size = kmap.size();
648
649 if (writeKmerTable) {
650 // 5. write number of k-mers in map
651 out.write((char *)&kmap_size, sizeof(kmap_size));
652
653 // 6. write kmer->ec values
654 for (auto& kv : kmap) {
655 out.write((char *)&kv.first, sizeof(kv.first));
656 out.write((char *)&kv.second, sizeof(kv.second));
657 }
658 } else {
659 // 5. write fake k-mer size
660 kmap_size = 0;
661 out.write((char *)&kmap_size, sizeof(kmap_size));
662
663 // 6. write none of the kmer->ec values
664 }
665 // 7. write number of equivalence classes
666 size_t tmp_size;
667 tmp_size = ecmap.size();
668 out.write((char *)&tmp_size, sizeof(tmp_size));
669
670 // 8. write out each equiv class
671 // for (auto& kv : ecmap) {
672 for (int ec = 0; ec < ecmap.size(); ec++) {
673 out.write((char *)&ec, sizeof(ec));
674 auto& v = ecmap[ec];
675 // 8.1 write out the size of equiv class
676 tmp_size = v.size();
677 out.write((char *)&tmp_size, sizeof(tmp_size));
678 // 8.2 write each member
679 for (auto& val: v) {
680 out.write((char *)&val, sizeof(val));
681 }
682 }
683
684 // 9. Write out target ids
685 // XXX: num_trans should equal to target_names_.size(), so don't need
686 // to write out again.
687 assert(num_trans == target_names_.size());
688 for (auto& tid : target_names_) {
689 // 9.1 write out how many bytes
690 // XXX: Note: this doesn't actually encore the max targ id size.
691 // might cause problems in the future
692 // tmp_size = tid.size();
693 tmp_size = strlen(tid.c_str());
694 out.write((char *)&tmp_size, sizeof(tmp_size));
695
696 // 9.2 write out the actual string
697 out.write(tid.c_str(), tmp_size);
698 }
699
700 // 10. write out contigs
701 if (writeKmerTable) {
702 assert(dbGraph.contigs.size() == dbGraph.ecs.size());
703 tmp_size = dbGraph.contigs.size();
704 out.write((char*)&tmp_size, sizeof(tmp_size));
705 for (auto& contig : dbGraph.contigs) {
706 out.write((char*)&contig.id, sizeof(contig.id));
707 out.write((char*)&contig.length, sizeof(contig.length));
708 tmp_size = strlen(contig.seq.c_str());
709 out.write((char*)&tmp_size, sizeof(tmp_size));
710 out.write(contig.seq.c_str(), tmp_size);
711
712 // 10.1 write out transcript info
713 tmp_size = contig.transcripts.size();
714 out.write((char*)&tmp_size, sizeof(tmp_size));
715 for (auto& info : contig.transcripts) {
716 out.write((char*)&info.trid, sizeof(info.trid));
717 out.write((char*)&info.pos, sizeof(info.pos));
718 out.write((char*)&info.sense, sizeof(info.sense));
719 }
720 }
721
722 // 11. write out ecs info
723 for (auto ec : dbGraph.ecs) {
724 out.write((char*)&ec, sizeof(ec));
725 }
726
727
728 } else {
729 // write empty dBG
730 tmp_size = 0;
731 out.write((char*)&tmp_size, sizeof(tmp_size));
732 }
733
734
735 out.flush();
736 out.close();
737 }
738
fwStep(Kmer km,Kmer & end) const739 bool KmerIndex::fwStep(Kmer km, Kmer& end) const {
740 int j = -1;
741 int fw_count = 0;
742 for (int i = 0; i < 4; i++) {
743 Kmer fw_rep = end.forwardBase(Dna(i)).rep();
744 auto search = kmap.find(fw_rep);
745 if (search != kmap.end()) {
746 j = i;
747 ++fw_count;
748 if (fw_count > 1) {
749 return false;
750 }
751 }
752 }
753
754 if (fw_count != 1) {
755 return false;
756 }
757
758 Kmer fw = end.forwardBase(Dna(j));
759
760 int bw_count = 0;
761 for (int i = 0; i < 4; i++) {
762 Kmer bw_rep = fw.backwardBase(Dna(i)).rep();
763 if (kmap.find(bw_rep) != kmap.end()) {
764 ++bw_count;
765 if (bw_count > 1) {
766 return false;
767 }
768 }
769 }
770
771 if (bw_count != 1) {
772 return false;
773 } else {
774 if (fw != km) {
775 end = fw;
776 return true;
777 } else {
778 return false;
779 }
780 }
781
782 }
783
load(ProgramOptions & opt,bool loadKmerTable)784 void KmerIndex::load(ProgramOptions& opt, bool loadKmerTable) {
785
786 std::string& index_in = opt.index;
787 std::ifstream in;
788
789
790 in.open(index_in, std::ios::in | std::ios::binary);
791
792 if (!in.is_open()) {
793 // TODO: better handling
794 std::cerr << "Error: index input file could not be opened!";
795 exit(1);
796 }
797
798 // 1. read version
799 size_t header_version = 0;
800 in.read((char *)&header_version, sizeof(header_version));
801
802 if (header_version != INDEX_VERSION) {
803 std::cerr << "Error: incompatible indices. Found version " << header_version << ", expected version " << INDEX_VERSION << std::endl
804 << "Rerun with index to regenerate";
805 exit(1);
806 }
807
808 // 2. read k
809 in.read((char *)&k, sizeof(k));
810 if (Kmer::k == 0) {
811 //std::cerr << "[index] no k has been set, setting k = " << k << std::endl;
812 Kmer::set_k(k);
813 opt.k = k;
814 } else if (Kmer::k == k) {
815 //std::cerr << "[index] Kmer::k has been set and matches" << k << std::endl;
816 opt.k = k;
817 } else {
818 std::cerr << "Error: Kmer::k was already set to = " << Kmer::k << std::endl
819 << " conflicts with value of k = " << k << std::endl;
820 exit(1);
821 }
822
823 // 3. read in number of targets
824 in.read((char *)&num_trans, sizeof(num_trans));
825
826 // 4. read in length of targets
827 target_lens_.clear();
828 target_lens_.reserve(num_trans);
829
830 for (int i = 0; i < num_trans; i++) {
831 int tlen;
832 in.read((char *)&tlen, sizeof(tlen));
833 target_lens_.push_back(tlen);
834 }
835
836 // 5. read number of k-mers
837 size_t kmap_size;
838 in.read((char *)&kmap_size, sizeof(kmap_size));
839
840 std::cerr << "[index] k-mer length: " << k << std::endl;
841 std::cerr << "[index] number of targets: " << pretty_num(num_trans)
842 << std::endl;
843 std::cerr << "[index] number of k-mers: " << pretty_num(kmap_size)
844 << std::endl;
845
846 kmap.clear();
847 if (loadKmerTable) {
848 kmap.reserve(kmap_size,true);
849 }
850
851 // 6. read kmer->ec values
852 Kmer tmp_kmer;
853 KmerEntry tmp_val;
854 for (size_t i = 0; i < kmap_size; ++i) {
855 in.read((char *)&tmp_kmer, sizeof(tmp_kmer));
856 in.read((char *)&tmp_val, sizeof(tmp_val));
857
858 if (loadKmerTable) {
859 kmap.insert({tmp_kmer, tmp_val});
860 }
861 }
862
863 // 7. read number of equivalence classes
864 size_t ecmap_size;
865 in.read((char *)&ecmap_size, sizeof(ecmap_size));
866
867 std::cerr << "[index] number of equivalence classes: "
868 << pretty_num(ecmap_size) << std::endl;
869 ecmap.resize(ecmap_size);
870 int tmp_id;
871 int tmp_ecval;
872 size_t vec_size;
873 // 8. read each equiv class
874 for (size_t ec = 0; ec < ecmap_size; ++ec) {
875 in.read((char *)&tmp_id, sizeof(tmp_id));
876
877 // 8.1 read size of equiv class
878 in.read((char *)&vec_size, sizeof(vec_size));
879
880 // 8.2 read each member
881 std::vector<int> tmp_vec;
882 tmp_vec.reserve(vec_size);
883 for (size_t j = 0; j < vec_size; ++j ) {
884 in.read((char *)&tmp_ecval, sizeof(tmp_ecval));
885 tmp_vec.push_back(tmp_ecval);
886 }
887 //ecmap.insert({tmp_id, tmp_vec});
888 ecmap[tmp_id] = tmp_vec;
889 ecmapinv.insert({tmp_vec, tmp_id});
890 }
891
892 // 9. read in target ids
893 target_names_.clear();
894 target_names_.reserve(num_trans);
895
896 size_t tmp_size;
897 size_t bufsz = 1024;
898 char *buffer = new char[bufsz];
899 for (auto i = 0; i < num_trans; ++i) {
900 // 9.1 read in the size
901 in.read((char *)&tmp_size, sizeof(tmp_size));
902
903 if (tmp_size +1 > bufsz) {
904 delete[] buffer;
905 bufsz = 2*(tmp_size+1);
906 buffer = new char[bufsz];
907 }
908
909 // clear the buffer
910 memset(buffer,0,bufsz);
911 // 9.2 read in the character string
912 in.read(buffer, tmp_size);
913
914 /* std::string tmp_targ_id( buffer ); */
915 target_names_.push_back(std::string( buffer ));
916 }
917
918
919 // 10. read contigs
920 size_t contig_size;
921 in.read((char *)&contig_size, sizeof(contig_size));
922 dbGraph.contigs.clear();
923 dbGraph.contigs.reserve(contig_size);
924 for (auto i = 0; i < contig_size; i++) {
925 Contig c;
926 in.read((char *)&c.id, sizeof(c.id));
927 in.read((char *)&c.length, sizeof(c.length));
928 in.read((char *)&tmp_size, sizeof(tmp_size));
929
930 if (tmp_size + 1 > bufsz) {
931 delete[] buffer;
932 bufsz = 2*(tmp_size+1);
933 buffer = new char[bufsz];
934 }
935
936 memset(buffer,0,bufsz);
937 in.read(buffer, tmp_size);
938 c.seq = std::string(buffer); // copy
939
940 // 10.1 read transcript info
941 in.read((char*)&tmp_size, sizeof(tmp_size));
942 c.transcripts.clear();
943 c.transcripts.reserve(tmp_size);
944
945 for (auto j = 0; j < tmp_size; j++) {
946 ContigToTranscript info;
947 in.read((char*)&info.trid, sizeof(info.trid));
948 in.read((char*)&info.pos, sizeof(info.pos));
949 in.read((char*)&info.sense, sizeof(info.sense));
950 c.transcripts.push_back(info);
951 }
952
953 dbGraph.contigs.push_back(c);
954 }
955
956 // 11. read ecs info
957 dbGraph.ecs.clear();
958 dbGraph.ecs.reserve(contig_size);
959 int tmp_ec;
960 for (auto i = 0; i < contig_size; i++) {
961 in.read((char *)&tmp_ec, sizeof(tmp_ec));
962 dbGraph.ecs.push_back(tmp_ec);
963 }
964
965 // delete the buffer
966 delete[] buffer;
967 buffer=nullptr;
968
969 in.close();
970 }
971
972
mapPair(const char * s1,int l1,const char * s2,int l2,int ec) const973 int KmerIndex::mapPair(const char *s1, int l1, const char *s2, int l2, int ec) const {
974 bool d1 = true;
975 bool d2 = true;
976 int p1 = -1;
977 int p2 = -1;
978 int c1 = -1;
979 int c2 = -1;
980
981
982 KmerIterator kit1(s1), kit_end;
983
984 bool found1 = false;
985 for (; kit1 != kit_end; ++kit1) {
986 Kmer x = kit1->first;
987 Kmer xr = x.rep();
988 auto search = kmap.find(xr);
989 bool forward = (x==xr);
990
991 if (search != kmap.end()) {
992 found1 = true;
993 KmerEntry val = search->second;
994 c1 = val.contig;
995 if (forward == val.isFw()) {
996 p1 = val.getPos() - kit1->second;
997 d1 = true;
998 } else {
999 p1 = val.getPos() + k + kit1->second;
1000 d1 = false;
1001 }
1002 break;
1003 }
1004 }
1005
1006 if (!found1) {
1007 return -1;
1008 }
1009
1010
1011
1012 KmerIterator kit2(s2);
1013 bool found2 = false;
1014
1015 for (; kit2 != kit_end; ++kit2) {
1016 Kmer x = kit2->first;
1017 Kmer xr = x.rep();
1018 auto search = kmap.find(xr);
1019 bool forward = (x==xr);
1020
1021 if (search != kmap.end()) {
1022 found2 = true;
1023 KmerEntry val = search->second;
1024 c2 = val.contig;
1025 if (forward== val.isFw()) {
1026 p2 = val.getPos() - kit2->second;
1027 d2 = true;
1028 } else {
1029 p2 = val.getPos() + k + kit2->second;
1030 d2 = false;
1031 }
1032 break;
1033 }
1034 }
1035
1036 if (!found2) {
1037 return -1;
1038 }
1039
1040 if (c1 != c2) {
1041 return -1;
1042 }
1043
1044 if ((d1 && d2) || (!d1 && !d2)) {
1045 //std::cerr << "Reads map to same strand " << s1 << "\t" << s2 << std::endl;
1046 return -1;
1047 }
1048
1049 if (p1>p2) {
1050 return p1-p2;
1051 } else {
1052 return p2-p1;
1053 }
1054
1055 }
1056
1057 // use: match(s,l,v)
1058 // pre: v is initialized
1059 // post: v contains all equiv classes for the k-mers in s
match(const char * s,int l,std::vector<std::pair<KmerEntry,int>> & v) const1060 void KmerIndex::match(const char *s, int l, std::vector<std::pair<KmerEntry, int>>& v) const {
1061 KmerIterator kit(s), kit_end;
1062 bool backOff = false;
1063 int nextPos = 0; // nextPosition to check
1064 for (int i = 0; kit != kit_end; ++i,++kit) {
1065 // need to check it
1066 auto search = kmap.find(kit->first.rep());
1067 int pos = kit->second;
1068
1069 if (search != kmap.end()) {
1070
1071 KmerEntry val = search->second;
1072
1073 v.push_back({val, kit->second});
1074
1075 // see if we can skip ahead
1076 // bring thisback later
1077 bool forward = (kit->first == search->first);
1078 int dist = val.getDist(forward);
1079
1080
1081 //const int lastbp = 10;
1082 if (dist >= 2) {
1083 // where should we jump to?
1084 int nextPos = pos+dist; // default jump
1085
1086 if (pos + dist >= l-k) {
1087 // if we can jump beyond the read, check the end
1088 nextPos = l-k;
1089 }
1090
1091 // check next position
1092 KmerIterator kit2(kit);
1093 kit2.jumpTo(nextPos);
1094 if (kit2 != kit_end) {
1095 Kmer rep2 = kit2->first.rep();
1096 auto search2 = kmap.find(rep2);
1097 bool found2 = false;
1098 int found2pos = pos+dist;
1099 if (search2 == kmap.end()) {
1100 found2=true;
1101 found2pos = pos;
1102 } else if (val.contig == search2->second.contig) {
1103 found2=true;
1104 found2pos = pos+dist;
1105 }
1106 if (found2) {
1107 // great, a match (or nothing) see if we can move the k-mer forward
1108 if (found2pos >= l-k) {
1109 v.push_back({val, l-k}); // push back a fake position
1110 break; //
1111 } else {
1112 v.push_back({val, found2pos});
1113 kit = kit2; // move iterator to this new position
1114 }
1115 } else {
1116 // this is weird, let's try the middle k-mer
1117 bool foundMiddle = false;
1118 if (dist > 4) {
1119 int middlePos = (pos + nextPos)/2;
1120 int middleContig = -1;
1121 int found3pos = pos+dist;
1122 KmerIterator kit3(kit);
1123 kit3.jumpTo(middlePos);
1124 KmerEntry val3;
1125 if (kit3 != kit_end) {
1126 Kmer rep3 = kit3->first.rep();
1127 auto search3 = kmap.find(rep3);
1128 if (search3 != kmap.end()) {
1129 middleContig = search3->second.contig;
1130 if (middleContig == val.contig) {
1131 foundMiddle = true;
1132 found3pos = middlePos;
1133 } else if (middleContig == search2->second.contig) {
1134 foundMiddle = true;
1135 found3pos = pos+dist;
1136 }
1137 }
1138
1139
1140 if (foundMiddle) {
1141 v.push_back({search3->second, found3pos});
1142 if (nextPos >= l-k) {
1143 break;
1144 } else {
1145 kit = kit2;
1146 }
1147 }
1148 }
1149 }
1150
1151
1152 if (!foundMiddle) {
1153 ++kit;
1154 backOff = true;
1155 goto donejumping; // sue me Dijkstra!
1156 }
1157 }
1158 } else {
1159 // the sequence is messed up at this point, let's just take the match
1160 //v.push_back({dbGraph.ecs[val.contig], l-k});
1161 break;
1162 }
1163 }
1164 }
1165
1166 donejumping:
1167
1168 if (backOff) {
1169 // backup plan, let's play it safe and search incrementally for the rest, until nextStop
1170 for (int j = 0; kit != kit_end; ++kit,++j) {
1171 if (j==skip) {
1172 j=0;
1173 }
1174 if (j==0) {
1175 // need to check it
1176 Kmer rep = kit->first.rep();
1177 auto search = kmap.find(rep);
1178 if (search != kmap.end()) {
1179 // if k-mer found
1180 v.push_back({search->second, kit->second}); // add equivalence class, and position
1181 }
1182 }
1183
1184 if (kit->second >= nextPos) {
1185 backOff = false;
1186 break; // break out of backoff for loop
1187 }
1188 }
1189 }
1190 }
1191 }
1192
findPosition(int tr,Kmer km,int p) const1193 std::pair<int,bool> KmerIndex::findPosition(int tr, Kmer km, int p) const {
1194 auto it = kmap.find(km.rep());
1195 if (it != kmap.end()) {
1196 KmerEntry val = it->second;
1197 return findPosition(tr, km, val, p);
1198 } else {
1199 return {-1,true};
1200 }
1201 }
1202
1203 //use: (pos,sense) = index.findPosition(tr,km,val,p)
1204 //pre: index.kmap[km] == val,
1205 // km is the p-th k-mer of a read
1206 // val.contig maps to tr
1207 //post: km is found in position pos (1-based) on the sense/!sense strand of tr
findPosition(int tr,Kmer km,KmerEntry val,int p) const1208 std::pair<int,bool> KmerIndex::findPosition(int tr, Kmer km, KmerEntry val, int p) const {
1209 bool fw = (km == km.rep());
1210 bool csense = (fw == val.isFw());
1211
1212 int trpos = -1;
1213 bool trsense = true;
1214 if (val.contig < 0) {
1215 return {-1, true};
1216 }
1217 const Contig &c = dbGraph.contigs[val.contig];
1218 for (auto x : c.transcripts) {
1219 if (x.trid == tr) {
1220 trpos = x.pos;
1221 trsense = x.sense;
1222 break;
1223 }
1224 }
1225
1226 if (trpos == -1) {
1227 return {-1,true};
1228 }
1229
1230
1231 if (trsense) {
1232 if (csense) {
1233 return {trpos + val.getPos() - p + 1, csense}; // 1-based, case I
1234 } else {
1235 return {trpos + val.getPos() + k + p, csense}; // 1-based, case III
1236 }
1237 } else {
1238 if (csense) {
1239 return {trpos + (c.length - val.getPos() -1) + k + p, !csense}; // 1-based, case IV
1240 } else {
1241 return {trpos + (c.length - val.getPos()) - p, !csense}; // 1-based, case II
1242 }
1243 }
1244 }
1245
1246 /*
1247 // use: r = matchEnd(s,l,v,p)
1248 // pre: v is initialized, p>=0
1249 // post: v contains all equiv classes for the k-mer in s, as
1250 // well as the best match for s[p:]
1251 // if r is false then no match was found
1252 bool KmerIndex::matchEnd(const char *s, int l, std::vector<std::pair<int,int>> &v, int maxPos) const {
1253 // kmer-iterator checks for N's and out of bounds
1254 KmerIterator kit(s+maxPos), kit_end;
1255 if (kit != kit_end && kit->second == 0) {
1256 Kmer last = kit->first;
1257 auto search = kmap.find(last.rep());
1258
1259 if (search == kmap.end()) {
1260 return false; // shouldn't happen
1261 }
1262
1263 KmerEntry val = search->second;
1264 bool forward = (kit->first == search->first);
1265 int dist = val.getDist(forward);
1266 int pos = maxPos + dist + 1; // move 1 past the end of the contig
1267
1268 const char* sLeft = s + pos + (k-1);
1269 int sizeleft = l - (pos + k-1); // characters left in sleft
1270 if (sizeleft <= 0) {
1271 return false; // nothing left to match
1272 }
1273
1274 // figure out end k-mer
1275 const Contig& root = dbGraph.contigs[val.contig];
1276 Kmer end; // last k-mer
1277 bool readFw = (forward == val.isFw());
1278 if (readFw) {
1279 end = Kmer(root.seq.c_str() + root.length-1);
1280 } else {
1281 end = Kmer(root.seq.c_str()).twin();
1282 }
1283
1284
1285 int bestContig = -1;
1286 int bestDist = sizeleft;
1287 int numBest = 0;
1288 for (int i = 0; i < 4; i++) {
1289 Kmer x = end.forwardBase(Dna(i));
1290 Kmer xr = x.rep();
1291 auto searchx = kmap.find(xr);
1292 if (searchx != kmap.end()) {
1293 KmerEntry valx = searchx->second;
1294 const Contig& branch = dbGraph.contigs[valx.contig];
1295 if (branch.length < sizeleft) {
1296 return false; // haven't implemented graph walks yet
1297 }
1298 std::string cs;
1299 bool contigFw = (x==xr) && valx.isFw();
1300 // todo: get rid of this string copy
1301 if (valx.getPos() == 0 && contigFw) {
1302 cs = branch.seq.substr(k-1);
1303 } else if (valx.getPos() == branch.length-1 && !contigFw) {
1304 cs = revcomp(branch.seq).substr(k-1);
1305 }
1306 int hdist = hamming(sLeft,cs.c_str());
1307 if (bestDist >= hdist) {
1308 numBest++;
1309 if (bestDist > hdist) {
1310 bestDist = hdist;
1311 numBest = 1;
1312 }
1313 bestContig = valx.contig;
1314 }
1315 }
1316 }
1317
1318 if (numBest == 1 && bestDist < 10) {
1319 v.push_back({dbGraph.ecs[bestContig], l-k});
1320 return true;
1321 } else {
1322 return false;
1323 }
1324 } else {
1325 return false;
1326 }
1327 }
1328 */
1329
1330
1331 // use: res = intersect(ec,v)
1332 // pre: ec is in ecmap, v is a vector of valid targets
1333 // v is sorted in increasing order
1334 // post: res contains the intersection of ecmap[ec] and v sorted increasing
1335 // res is empty if ec is not in ecma
intersect(int ec,const std::vector<int> & v) const1336 std::vector<int> KmerIndex::intersect(int ec, const std::vector<int>& v) const {
1337 std::vector<int> res;
1338 //auto search = ecmap.find(ec);
1339 if (ec < ecmap.size()) {
1340 //if (search != ecmap.end()) {
1341 //auto& u = search->second;
1342 auto& u = ecmap[ec];
1343 res.reserve(v.size());
1344
1345 auto a = u.begin();
1346 auto b = v.begin();
1347
1348 while (a != u.end() && b != v.end()) {
1349 if (*a < *b) {
1350 ++a;
1351 } else if (*b < *a) {
1352 ++b;
1353 } else {
1354 // match
1355 res.push_back(*a);
1356 ++a;
1357 ++b;
1358 }
1359 }
1360 }
1361 return res;
1362 }
1363
1364
loadTranscriptSequences() const1365 void KmerIndex::loadTranscriptSequences() const {
1366 if (target_seqs_loaded) {
1367 return;
1368 }
1369
1370
1371
1372 std::vector<std::vector<std::pair<int, ContigToTranscript>>> trans_contigs(num_trans);
1373 for (auto &c : dbGraph.contigs) {
1374 for (auto &ct : c.transcripts) {
1375 trans_contigs[ct.trid].push_back({c.id, ct});
1376 }
1377 }
1378
1379 auto &target_seqs = const_cast<std::vector<std::string>&>(target_seqs_);
1380
1381 for (int i = 0; i < trans_contigs.size(); i++) {
1382 auto &v = trans_contigs[i];
1383 std::sort(v.begin(), v.end(), [](std::pair<int,ContigToTranscript> a, std::pair<int,ContigToTranscript> b) {
1384 return a.second.pos < b.second.pos;
1385 });
1386
1387 std::string seq;
1388 seq.reserve(target_lens_[i]);
1389
1390 for (auto &pct : v) {
1391 auto ct = pct.second;
1392 int start = (ct.pos==0) ? 0 : k-1;
1393 const auto& contig = dbGraph.contigs[pct.first];
1394 if (ct.sense) {
1395 seq.append(contig.seq.substr(start));
1396 } else {
1397 seq.append(revcomp(contig.seq).substr(start));
1398 }
1399 }
1400 target_seqs.push_back(seq);
1401 }
1402
1403 bool &t = const_cast<bool&>(target_seqs_loaded);
1404 t = true;//target_seqs_loaded = true;
1405 return;
1406 }
1407
clear()1408 void KmerIndex::clear() {
1409 kmap.clear_table();
1410 ecmap.resize(0);
1411 dbGraph.ecs.resize(0);
1412 dbGraph.contigs.resize(0);
1413 {
1414 std::unordered_map<std::vector<int>, int, SortedVectorHasher> empty;
1415 std::swap(ecmapinv, empty);
1416 }
1417
1418 target_lens_.resize(0);
1419 target_names_.resize(0);
1420 target_seqs_.resize(0);
1421 }
1422
writePseudoBamHeader(std::ostream & o) const1423 void KmerIndex::writePseudoBamHeader(std::ostream &o) const {
1424 // write out header
1425 o << "@HD\tVN:1.0\n";
1426 for (int i = 0; i < num_trans; i++) {
1427 o << "@SQ\tSN:" << target_names_[i] << "\tLN:" << target_lens_[i] << "\n";
1428 }
1429 o << "@PG\tID:kallisto\tPN:kallisto\tVN:"<< KALLISTO_VERSION << "\n";
1430 o.flush();
1431 }
1432