1 #include "KmerIndex.h"
2 #include <algorithm>
3 #include <random>
4 #include <ctype.h>
5 #include <zlib.h>
6 #include <unordered_set>
7 #include "kseq.h"
8 
9 #ifndef KSEQ_INIT_READY
10 #define KSEQ_INIT_READY
KSEQ_INIT(gzFile,gzread)11 KSEQ_INIT(gzFile, gzread)
12 #endif
13 
14 // helper functions
15 // pre: u is sorted
16 bool isUnique(const std::vector<int>& u) {
17   for (int j = 1; j < u.size(); j++) {
18     if (u[j-1] == u[j]) {
19       return false;
20     }
21   }
22   return true;
23 }
24 
unique(const std::vector<int> & u)25 std::vector<int> unique(const std::vector<int>& u) {
26   std::vector<int> v;
27   v.reserve(u.size());
28   v.push_back(u[0]);
29   for (int j = 1; j < u.size(); j++) {
30     if (u[j-1] != u[j]) {
31       v.push_back(u[j]);
32     }
33   }
34   return v;
35 }
36 
Dna(int i)37 const char Dna(int i) {
38   static const char *dna = "ACGT";
39   return dna[i & 0x03];
40 }
41 
hamming(const char * a,const char * b)42 int hamming(const char *a, const char *b) {
43   int h = 0;
44   while (*a != 0 && *b != 0) {
45     if (*a != *b) {
46       h++;
47     }
48     a++;
49     b++;
50   }
51   return h;
52 }
53 
revcomp(const std::string s)54 std::string revcomp(const std::string s) {
55   std::string r(s);
56   std::transform(s.rbegin(), s.rend(), r.begin(), [](char c) {
57       switch(c) {
58       case 'A': return 'T';
59       case 'C': return 'G';
60       case 'G': return 'C';
61       case 'T': return 'A';
62       default: return 'N';
63       }
64       return 'N';
65     });
66   return r;
67 }
68 
BuildTranscripts(const ProgramOptions & opt)69 void KmerIndex::BuildTranscripts(const ProgramOptions& opt) {
70   // read input
71   std::unordered_set<std::string> unique_names;
72   int k = opt.k;
73   for (auto& fasta : opt.transfasta) {
74     std::cerr << "[build] loading fasta file " << fasta
75               << std::endl;
76   }
77   std::cerr << "[build] k-mer length: " << k << std::endl;
78 
79 
80   std::vector<std::string> seqs;
81 
82   // read fasta file
83   gzFile fp = 0;
84   kseq_t *seq;
85   int l = 0;
86   std::mt19937 gen(42);
87   int countNonNucl = 0;
88   int countUNuc = 0;
89   int polyAcount = 0;
90 
91   for (auto& fasta : opt.transfasta) {
92     fp = gzopen(fasta.c_str(), "r");
93     seq = kseq_init(fp);
94     while (true) {
95       l = kseq_read(seq);
96       if (l <= 0) {
97         break;
98       }
99       seqs.emplace_back(seq->seq.s);
100       std::string& str = *seqs.rbegin();
101       auto n = str.size();
102       for (auto i = 0; i < n; i++) {
103         char c = str[i];
104         c = ::toupper(c);
105         if (c=='U') {
106           str[i] = 'T';
107           countUNuc++;
108         } else if (c !='A' && c != 'C' && c != 'G' && c != 'T') {
109           str[i] = Dna(gen()); // replace with pseudorandom string
110           countNonNucl++;
111         }
112       }
113       std::transform(str.begin(), str.end(),str.begin(), ::toupper);
114 
115       if (str.size() >= 10 && str.substr(str.size()-10,10) == "AAAAAAAAAA") {
116         // clip off polyA tail
117         //std::cerr << "[index] clipping off polyA tail" << std::endl;
118         polyAcount++;
119         int j;
120         for (j = str.size()-1; j >= 0 && str[j] == 'A'; j--) {}
121         str = str.substr(0,j+1);
122       }
123 
124 
125       target_lens_.push_back(seq->seq.l);
126       std::string name(seq->name.s);
127       size_t p = name.find(' ');
128       if (p != std::string::npos) {
129         name = name.substr(0,p);
130       }
131 
132       if (unique_names.find(name) != unique_names.end()) {
133         if (!opt.make_unique) {
134           std::cerr << "Error: repeated name in FASTA file " << fasta << "\n" << name << "\n\n" << "Run with --make-unique to replace repeated names with unique names" << std::endl;
135           exit(1);
136         } else {
137           for (int i = 1; ; i++) { // potential bug if you have more than 2^32 repeated names
138             std::string new_name = name + "_" + std::to_string(i);
139             if (unique_names.find(new_name) == unique_names.end()) {
140               name = new_name;
141               break;
142             }
143           }
144         }
145       }
146       unique_names.insert(name);
147       target_names_.push_back(name);
148 
149     }
150     gzclose(fp);
151     fp=0;
152   }
153 
154   if (polyAcount > 0) {
155     std::cerr << "[build] warning: clipped off poly-A tail (longer than 10)" << std::endl << "        from " << polyAcount << " target sequences" << std::endl;
156   }
157 
158 
159   if (countNonNucl > 0) {
160     std::cerr << "[build] warning: replaced " << countNonNucl << " non-ACGUT characters in the input sequence" << std::endl << "        with pseudorandom nucleotides" << std::endl;
161   }
162   if (countUNuc > 0) {
163     std::cerr << "[build] warning: replaced " << countUNuc << " U characters with Ts" << std::endl;
164   }
165 
166   num_trans = seqs.size();
167 
168   // for each target, create it's own equivalence class
169   for (int i = 0; i < seqs.size(); i++ ) {
170     std::vector<int> single(1,i);
171     //ecmap.insert({i,single});
172     ecmap.push_back(single);
173     ecmapinv.insert({single,i});
174   }
175 
176   BuildDeBruijnGraph(opt, seqs);
177   BuildEquivalenceClasses(opt, seqs);
178   //BuildEdges(opt);
179 
180 }
181 
BuildDeBruijnGraph(const ProgramOptions & opt,const std::vector<std::string> & seqs)182 void KmerIndex::BuildDeBruijnGraph(const ProgramOptions& opt, const std::vector<std::string>& seqs) {
183 
184 
185   std::cerr << "[build] counting k-mers ... "; std::cerr.flush();
186   // gather all k-mers
187   for (int i = 0; i < seqs.size(); i++) {
188     const char *s = seqs[i].c_str();
189     KmerIterator kit(s),kit_end;
190     for (; kit != kit_end; ++kit) {
191       kmap.insert({kit->first.rep(), KmerEntry()}); // don't care about repeats
192       //    Kmer rep = kit->first.rep();
193       // std::cout << rep.toString() << "\t" << rep.hash() << std::endl;
194     }
195   }
196   std::cerr << "done." << std::endl;
197 
198   std::cerr << "[build] building target de Bruijn graph ... "; std::cerr.flush();
199   // find out how much we can skip ahead for each k-mer.
200   for (auto& kv : kmap) {
201     if (kv.second.contig == -1) {
202       // ok we haven't processed the k-mer yet
203       std::vector<Kmer> flist, blist;
204 
205       // iterate in forward direction
206       Kmer km = kv.first;
207       Kmer end = km;
208       Kmer last = end;
209       Kmer twin = km.twin();
210       bool selfLoop = false;
211       flist.push_back(km);
212 
213       while (fwStep(end,end)) {
214         if (end == km) {
215           // selfloop
216           selfLoop = true;
217           break;
218         } else if (end == twin) {
219           selfLoop = (flist.size() > 1); // hairpins are not loops
220           // mobius loop
221           break;
222         } else if (end == last.twin()) {
223           // hairpin
224           break;
225         }
226         flist.push_back(end);
227         last = end;
228       }
229 
230       Kmer front = twin;
231       Kmer first = front;
232 
233       if (!selfLoop) {
234         while (fwStep(front,front)) {
235           if (front == twin) {
236             // selfloop
237             selfLoop = true;
238             break;
239           } else if (front == km) {
240             // mobius loop
241             selfLoop = true;
242             break;
243           } else if (front == first.twin()) {
244             // hairpin
245             break;
246           }
247           blist.push_back(front);
248           first = front;
249         }
250       }
251 
252       std::vector<Kmer> klist;
253       for (auto it = blist.rbegin(); it != blist.rend(); ++it) {
254         klist.push_back(it->twin());
255       }
256       for (auto x : flist) {
257         klist.push_back(x);
258       }
259 
260 
261       Contig contig;
262       contig.id = dbGraph.contigs.size();
263       contig.length = klist.size();
264       contig.seq = klist[0].toString();
265       contig.seq.reserve(contig.length + k-1);
266 
267 
268       for (int i = 0; i < klist.size(); i++) {
269         Kmer x = klist[i];
270         Kmer xr = x.rep();
271         bool forward = (x==xr);
272         auto it = kmap.find(xr);
273         assert(it->second.contig==-1);
274         it->second = KmerEntry(contig.id, contig.length, i, forward);
275         if (i > 0) {
276           contig.seq.push_back(x.toString()[k-1]);
277         }
278       }
279 
280       dbGraph.contigs.push_back(contig);
281       dbGraph.ecs.push_back(-1);
282     }
283   }
284   std::cerr << " done " << std::endl;
285 
286 }
287 
BuildEquivalenceClasses(const ProgramOptions & opt,const std::vector<std::string> & seqs)288 void KmerIndex::BuildEquivalenceClasses(const ProgramOptions& opt, const std::vector<std::string>& seqs) {
289   std::cerr << "[build] creating equivalence classes ... "; std::cerr.flush();
290 
291   std::vector<std::vector<TRInfo>> trinfos(dbGraph.contigs.size());
292   //std::cout << "Mapping target " << std::endl;
293   for (int i = 0; i < seqs.size(); i++) {
294     int seqlen = seqs[i].size() - k + 1; // number of k-mers
295     const char *s = seqs[i].c_str();
296     //std::cout << "sequence number " << i << std::endl;
297     KmerIterator kit(s), kit_end;
298     for (; kit != kit_end; ++kit) {
299       Kmer x = kit->first;
300       Kmer xr = x.rep();
301       auto search = kmap.find(xr);
302       bool forward = (x==xr);
303       KmerEntry val = search->second;
304       std::vector<TRInfo>& trinfo = trinfos[val.contig];
305       Contig& contig = dbGraph.contigs[val.contig];
306 
307 
308 
309       TRInfo tr;
310       tr.trid = i;
311       int jump = kit->second;
312       if (forward == val.isFw()) {
313         tr.sense = true;
314         tr.start = val.getPos();
315         if (contig.length - tr.start > seqlen - kit->second) {
316           // tartget stops
317           tr.stop = tr.start + seqlen - kit->second;
318           jump = seqlen;
319         } else {
320           tr.stop = contig.length;
321           jump = kit->second + (tr.stop - tr.start)-1;
322         }
323       } else {
324         tr.sense = false;
325         tr.stop = val.getPos()+1;
326         int stpos = tr.stop - (seqlen - kit->second);
327         if (stpos > 0) {
328           tr.start = stpos;
329           jump = seqlen;
330         } else {
331           tr.start = 0;
332           jump = kit->second + (tr.stop - tr.start) - 1;
333         }
334       }
335 
336       // // debugging -->
337       //std::cout << "covering seq" << std::endl << seqs[i].substr(kit->second, jump-kit->second +k) << std::endl;
338       //std::cout << "id = " << tr.trid << ", (" << tr.start << ", " << tr.stop << ")" << std::endl;
339       //std::cout << "contig seq" << std::endl;
340       // if (forward == val.isFw()) {
341       //   //std::cout << contig.seq << std::endl;
342       //   assert(contig.seq.substr(tr.start, k-1 + tr.stop-tr.start) == seqs[i].substr(kit->second, jump-kit->second +k) );
343       // } else {
344       //   //std::cout << revcomp(contig.seq) << std::endl;
345       //   assert(revcomp(contig.seq.substr(tr.start, k-1 + tr.stop-tr.start)) == seqs[i].substr(kit->second, jump-kit->second +k));
346       // }
347       // if (jump == seqlen) {
348       //   //std::cout << std::string(k-1+(tr.stop-tr.start)-1,'-') << "^" << std::endl;
349       // }
350 
351       // // <-- debugging
352 
353       trinfo.push_back(tr);
354       kit.jumpTo(jump);
355     }
356   }
357 
358 
359   FixSplitContigs(opt, trinfos);
360 
361 
362   int perftr = 0;
363   for (int i = 0; i < trinfos.size(); i++) {
364     bool all = true;
365 
366     int contigLen = dbGraph.contigs[i].length;
367     //std::cout << "contig " << i << ", length = " << contigLen << ", seq = " << dbGraph.contigs[i].seq << std::endl << "tr = ";
368     for (auto x : trinfos[i]) {
369       if (x.start!=0 || x.stop !=contigLen) {
370         all = false;
371       }
372       //std::cout << "[" << x.trid << ",(" << x.start << ", " << x.stop << ")], " ;
373     }
374     //std::cout << std::endl;
375     if (all) {
376       perftr++;
377     }
378   }
379   //std::cerr << "For " << dbGraph.contigs.size() << ", " << (dbGraph.contigs.size() - perftr) << " of them need to be split" << std::endl;
380   // need to create the equivalence classes
381 
382   assert(dbGraph.contigs.size() == trinfos.size());
383   // for each contig
384   for (int i = 0; i < trinfos.size(); i++) {
385     std::vector<int> u;
386     for (auto x : trinfos[i]) {
387       u.push_back(x.trid);
388     }
389     sort(u.begin(), u.end());
390     if (!isUnique(u)){
391       std::vector<int> v = unique(u);
392       swap(u,v);
393     }
394 
395     assert(!u.empty());
396 
397     auto search = ecmapinv.find(u);
398     int ec = -1;
399     if (search != ecmapinv.end()) {
400       // insert contig -> ec info
401       ec = search->second;
402     } else {
403       ec = ecmapinv.size();
404       ecmapinv.insert({u,ec});
405       ecmap.push_back(u);
406     }
407     dbGraph.ecs[i] = ec;
408     assert(ec != -1);
409 
410     // record the transc
411     Contig& contig = dbGraph.contigs[i];
412     contig.ec = ec;
413     // correct ec of all k-mers in contig
414     /*
415     KmerIterator kit(contig.seq.c_str()), kit_end;
416     for (; kit != kit_end; ++kit) {
417       auto ksearch = kmap.find(kit->first.rep());
418       //ksearch->second.ec = ec;
419     }
420     */
421   }
422 
423   // map transcripts to contigs
424   //std::cout << std::endl;
425   for (int i = 0; i < seqs.size(); i++) {
426     int seqlen = seqs[i].size() - k + 1; // number of k-mers
427     // debugging
428     std::string stmp;
429     const char *s = seqs[i].c_str();
430     ////std::cout << "sequence number " << i << std::endl;
431     //std::cout << ">" << target_names_[i] << std::endl;
432     //std::cout << seqs[i] << std::endl;
433     KmerIterator kit(s), kit_end;
434     for (; kit != kit_end; ++kit) {
435       Kmer x = kit->first;
436       //std::cout << "position = " << kit->second << ", mapping " << x.toString() << std::endl;
437       Kmer xr = x.rep();
438       auto search = kmap.find(xr);
439       bool forward = (x==xr);
440       KmerEntry val = search->second;
441       Contig& contig = dbGraph.contigs[val.contig];
442 
443       ContigToTranscript info;
444       info.trid = i;
445       info.pos = kit->second;
446       info.sense = (forward == val.isFw());
447       int jump = kit->second + contig.length-1;
448       //std::cout << "mapped to contig " << val.contig << ", len = " << contig.length <<  ", pos = " << val.getPos() << ", sense = " << info.sense << std::endl;
449       contig.transcripts.push_back(info);
450       // debugging
451       if (info.sense) {
452 
453         if (info.pos == 0) {
454           stmp.append(contig.seq);
455           //std::cout << contig.seq << std::endl;
456         } else {
457           stmp.append(contig.seq.substr(k-1));
458           //std::cout << contig.seq.substr(k-1) << std::endl;
459         }
460       } else {
461         std::string r = revcomp(contig.seq);
462         if (info.pos == 0) {
463           stmp.append(r);
464           //std::cout << r << std::endl;
465         } else {
466           stmp.append(r.substr(k-1));
467           //std::cout << r.substr(k-1) << std::endl;
468         }
469       }
470       //std::cout << stmp << std::endl;
471       //std::cout << "covering seq" << std::endl << seqs[i].substr(kit->second, jump-kit->second +k) << std::endl;
472       //std::cout << "id = " << tr.trid << ", (" << tr.start << ", " << tr.stop << ")" << std::endl;
473       //std::cout << "contig seq" << std::endl;
474       // if (forward == val.isFw()) {
475       //   //std::cout << contig.seq << std::endl;
476       //   assert(contig.seq.substr(tr.start, k-1 + tr.stop-tr.start) == seqs[i].substr(kit->second, jump-kit->second +k) );
477       // } else {
478       //   //std::cout << revcomp(contig.seq) << std::endl;
479       //   assert(revcomp(contig.seq.substr(tr.start, k-1 + tr.stop-tr.start)) == seqs[i].substr(kit->second, jump-kit->second +k));
480       // }
481       // if (jump == seqlen) {
482       //   //std::cout << std::string(k-1+(tr.stop-tr.start)-1,'-') << "^" << std::endl;
483       // }
484 
485       // // <-- debugging
486 
487       kit.jumpTo(jump);
488     }
489     if (seqlen > 0 && seqs[i] != stmp) {
490       /*std::cout << ">" << target_names_[i] << std::endl
491                 << seqs[i] << std::endl
492                 << stmp << std::endl;*/
493       assert(false);
494 
495     }
496   }
497 
498   // double check the contigs
499   for (auto &c : dbGraph.contigs) {
500     for (auto info : c.transcripts) {
501       std::string r;
502       if (info.sense) {
503         r = c.seq;
504       } else {
505         r = revcomp(c.seq);
506       }
507       assert(r == seqs[info.trid].substr(info.pos,r.size()));
508     }
509   }
510 
511 
512   std::cerr << " done" << std::endl;
513   std::cerr << "[build] target de Bruijn graph has " << dbGraph.contigs.size() << " contigs and contains "  << kmap.size() << " k-mers " << std::endl;
514 
515 
516 
517 
518 
519 }
520 
FixSplitContigs(const ProgramOptions & opt,std::vector<std::vector<TRInfo>> & trinfos)521 void KmerIndex::FixSplitContigs(const ProgramOptions& opt, std::vector<std::vector<TRInfo>>& trinfos) {
522 
523   int perftr = 0;
524   int orig_size = trinfos.size();
525 
526   for (int i = 0; i < orig_size; i++) {
527     bool all = true;
528 
529     int contigLen = dbGraph.contigs[i].length;
530     //std::cout << "contig " << i << ", length = " << contigLen << ", seq = " << dbGraph.contigs[i].seq << std::endl << "tr = ";
531     for (auto x : trinfos[i]) {
532       if (x.start!=0 || x.stop !=contigLen) {
533         all = false;
534       }
535       //std::cout << "[" << x.trid << ",(" << x.start << ", " << x.stop << ")], " ;
536       assert(x.start < x.stop);
537     }
538     //std::cout << std::endl;
539 
540     if (all) {
541       perftr++;
542     } else {
543       // break up equivalence classes
544       // sort by start/stop
545       std::vector<int> brpoints;
546       for (auto& x : trinfos[i]) {
547         brpoints.push_back(x.start);
548         brpoints.push_back(x.stop);
549       }
550       sort(brpoints.begin(), brpoints.end());
551       assert(brpoints[0] == 0);
552       assert(brpoints[brpoints.size()-1]==contigLen);
553 
554       // find unique points
555       if (!isUnique(brpoints)) {
556         std::vector<int> u = unique(brpoints);
557         swap(u,brpoints);
558       }
559 
560       assert(!brpoints.empty());
561 
562       // copy sequence
563       std::string seq = dbGraph.contigs[i].seq;
564       // copy old trinfo
565       std::vector<TRInfo> oldtrinfo = trinfos[i];
566 
567       for (int j = 1; j < brpoints.size(); j++) {
568         assert(brpoints[j-1] < brpoints[j]);
569         Contig newc;
570         newc.seq = seq.substr(brpoints[j-1], brpoints[j]-brpoints[j-1]+k-1);
571         newc.length = brpoints[j]-brpoints[j-1];
572 
573         if (j>1) {
574           newc.id = dbGraph.contigs.size();
575           dbGraph.contigs.push_back(newc);
576           dbGraph.ecs.push_back(-1);
577         } else {
578           newc.id = i;
579           dbGraph.contigs[i] = newc;
580         }
581 
582         // repair k-mer mapping
583         KmerIterator kit(newc.seq.c_str()), kit_end;
584         for (; kit != kit_end; ++kit) {
585           Kmer x = kit->first;
586           Kmer xr = x.rep();
587           auto search = kmap.find(xr);
588           assert(search != kmap.end());
589           bool forward = (x==xr);
590           search->second = KmerEntry(newc.id, newc.length,  kit->second, forward);
591         }
592 
593         // repair tr-info
594         std::vector<TRInfo> newtrinfo;
595         for (auto x : oldtrinfo) {
596           if (!(x.stop <= brpoints[j-1] || x.start >= brpoints[j])) {
597             TRInfo trinfo;
598             trinfo.sense = x.sense;
599             trinfo.trid = x.trid;
600             trinfo.start = 0;
601             trinfo.stop = newc.length;
602             newtrinfo.push_back(trinfo);
603           }
604         }
605         if (j>1) {
606           trinfos.push_back(newtrinfo);
607         } else {
608           trinfos[i] = newtrinfo;
609         }
610       }
611     }
612   }
613 
614 
615   //std::cerr << "For " << dbGraph.contigs.size() << ", " << (dbGraph.contigs.size() - perftr) << " of them need to be split" << std::endl;
616 
617 
618 }
619 
620 
621 
622 
write(const std::string & index_out,bool writeKmerTable)623 void KmerIndex::write(const std::string& index_out, bool writeKmerTable) {
624   std::ofstream out;
625   out.open(index_out, std::ios::out | std::ios::binary);
626 
627   if (!out.is_open()) {
628     // TODO: better handling
629     std::cerr << "Error: index output file could not be opened!";
630     exit(1);
631   }
632 
633   // 1. write version
634   out.write((char *)&INDEX_VERSION, sizeof(INDEX_VERSION));
635 
636   // 2. write k
637   out.write((char *)&k, sizeof(k));
638 
639   // 3. write number of targets
640   out.write((char *)&num_trans, sizeof(num_trans));
641 
642   // 4. write out target lengths
643   for (int tlen : target_lens_) {
644     out.write((char *)&tlen, sizeof(tlen));
645   }
646 
647   size_t kmap_size = kmap.size();
648 
649   if (writeKmerTable) {
650     // 5. write number of k-mers in map
651     out.write((char *)&kmap_size, sizeof(kmap_size));
652 
653     // 6. write kmer->ec values
654     for (auto& kv : kmap) {
655       out.write((char *)&kv.first, sizeof(kv.first));
656       out.write((char *)&kv.second, sizeof(kv.second));
657     }
658   } else {
659     // 5. write fake k-mer size
660     kmap_size = 0;
661     out.write((char *)&kmap_size, sizeof(kmap_size));
662 
663     // 6. write none of the kmer->ec values
664   }
665   // 7. write number of equivalence classes
666   size_t tmp_size;
667   tmp_size = ecmap.size();
668   out.write((char *)&tmp_size, sizeof(tmp_size));
669 
670   // 8. write out each equiv class
671   //  for (auto& kv : ecmap) {
672   for (int ec = 0; ec < ecmap.size(); ec++) {
673     out.write((char *)&ec, sizeof(ec));
674     auto& v = ecmap[ec];
675     // 8.1 write out the size of equiv class
676     tmp_size = v.size();
677     out.write((char *)&tmp_size, sizeof(tmp_size));
678     // 8.2 write each member
679     for (auto& val: v) {
680       out.write((char *)&val, sizeof(val));
681     }
682   }
683 
684   // 9. Write out target ids
685   // XXX: num_trans should equal to target_names_.size(), so don't need
686   // to write out again.
687   assert(num_trans == target_names_.size());
688   for (auto& tid : target_names_) {
689     // 9.1 write out how many bytes
690     // XXX: Note: this doesn't actually encore the max targ id size.
691     // might cause problems in the future
692     // tmp_size = tid.size();
693     tmp_size = strlen(tid.c_str());
694     out.write((char *)&tmp_size, sizeof(tmp_size));
695 
696     // 9.2 write out the actual string
697     out.write(tid.c_str(), tmp_size);
698   }
699 
700   // 10. write out contigs
701   if (writeKmerTable) {
702     assert(dbGraph.contigs.size() == dbGraph.ecs.size());
703     tmp_size = dbGraph.contigs.size();
704     out.write((char*)&tmp_size, sizeof(tmp_size));
705     for (auto& contig : dbGraph.contigs) {
706       out.write((char*)&contig.id, sizeof(contig.id));
707       out.write((char*)&contig.length, sizeof(contig.length));
708       tmp_size = strlen(contig.seq.c_str());
709       out.write((char*)&tmp_size, sizeof(tmp_size));
710       out.write(contig.seq.c_str(), tmp_size);
711 
712       // 10.1 write out transcript info
713       tmp_size = contig.transcripts.size();
714       out.write((char*)&tmp_size, sizeof(tmp_size));
715       for (auto& info : contig.transcripts) {
716         out.write((char*)&info.trid, sizeof(info.trid));
717         out.write((char*)&info.pos, sizeof(info.pos));
718         out.write((char*)&info.sense, sizeof(info.sense));
719       }
720     }
721 
722     // 11. write out ecs info
723     for (auto ec : dbGraph.ecs) {
724       out.write((char*)&ec, sizeof(ec));
725     }
726 
727 
728   } else {
729     // write empty dBG
730     tmp_size = 0;
731     out.write((char*)&tmp_size, sizeof(tmp_size));
732   }
733 
734 
735   out.flush();
736   out.close();
737 }
738 
fwStep(Kmer km,Kmer & end) const739 bool KmerIndex::fwStep(Kmer km, Kmer& end) const {
740   int j = -1;
741   int fw_count = 0;
742   for (int i = 0; i < 4; i++) {
743     Kmer fw_rep = end.forwardBase(Dna(i)).rep();
744     auto search = kmap.find(fw_rep);
745     if (search != kmap.end()) {
746       j = i;
747       ++fw_count;
748       if (fw_count > 1) {
749         return false;
750       }
751     }
752   }
753 
754   if (fw_count != 1) {
755     return false;
756   }
757 
758   Kmer fw = end.forwardBase(Dna(j));
759 
760   int bw_count = 0;
761   for (int i = 0; i < 4; i++) {
762     Kmer bw_rep = fw.backwardBase(Dna(i)).rep();
763     if (kmap.find(bw_rep) != kmap.end()) {
764       ++bw_count;
765       if (bw_count > 1) {
766         return false;
767       }
768     }
769   }
770 
771   if (bw_count != 1) {
772     return false;
773   } else {
774     if (fw != km) {
775       end = fw;
776       return true;
777     } else {
778       return false;
779     }
780   }
781 
782 }
783 
load(ProgramOptions & opt,bool loadKmerTable)784 void KmerIndex::load(ProgramOptions& opt, bool loadKmerTable) {
785 
786   std::string& index_in = opt.index;
787   std::ifstream in;
788 
789 
790   in.open(index_in, std::ios::in | std::ios::binary);
791 
792   if (!in.is_open()) {
793     // TODO: better handling
794     std::cerr << "Error: index input file could not be opened!";
795     exit(1);
796   }
797 
798   // 1. read version
799   size_t header_version = 0;
800   in.read((char *)&header_version, sizeof(header_version));
801 
802   if (header_version != INDEX_VERSION) {
803     std::cerr << "Error: incompatible indices. Found version " << header_version << ", expected version " << INDEX_VERSION << std::endl
804               << "Rerun with index to regenerate";
805     exit(1);
806   }
807 
808   // 2. read k
809   in.read((char *)&k, sizeof(k));
810   if (Kmer::k == 0) {
811     //std::cerr << "[index] no k has been set, setting k = " << k << std::endl;
812     Kmer::set_k(k);
813     opt.k = k;
814   } else if (Kmer::k == k) {
815     //std::cerr << "[index] Kmer::k has been set and matches" << k << std::endl;
816     opt.k = k;
817   } else {
818     std::cerr << "Error: Kmer::k was already set to = " << Kmer::k << std::endl
819               << "       conflicts with value of k  = " << k << std::endl;
820     exit(1);
821   }
822 
823   // 3. read in number of targets
824   in.read((char *)&num_trans, sizeof(num_trans));
825 
826   // 4. read in length of targets
827   target_lens_.clear();
828   target_lens_.reserve(num_trans);
829 
830   for (int i = 0; i < num_trans; i++) {
831     int tlen;
832     in.read((char *)&tlen, sizeof(tlen));
833     target_lens_.push_back(tlen);
834   }
835 
836   // 5. read number of k-mers
837   size_t kmap_size;
838   in.read((char *)&kmap_size, sizeof(kmap_size));
839 
840   std::cerr << "[index] k-mer length: " << k << std::endl;
841   std::cerr << "[index] number of targets: " << pretty_num(num_trans)
842     << std::endl;
843   std::cerr << "[index] number of k-mers: " << pretty_num(kmap_size)
844     << std::endl;
845 
846   kmap.clear();
847   if (loadKmerTable) {
848     kmap.reserve(kmap_size,true);
849   }
850 
851   // 6. read kmer->ec values
852   Kmer tmp_kmer;
853   KmerEntry tmp_val;
854   for (size_t i = 0; i < kmap_size; ++i) {
855     in.read((char *)&tmp_kmer, sizeof(tmp_kmer));
856     in.read((char *)&tmp_val, sizeof(tmp_val));
857 
858     if (loadKmerTable) {
859       kmap.insert({tmp_kmer, tmp_val});
860     }
861   }
862 
863   // 7. read number of equivalence classes
864   size_t ecmap_size;
865   in.read((char *)&ecmap_size, sizeof(ecmap_size));
866 
867   std::cerr << "[index] number of equivalence classes: "
868     << pretty_num(ecmap_size) << std::endl;
869   ecmap.resize(ecmap_size);
870   int tmp_id;
871   int tmp_ecval;
872   size_t vec_size;
873   // 8. read each equiv class
874   for (size_t ec = 0; ec < ecmap_size; ++ec) {
875     in.read((char *)&tmp_id, sizeof(tmp_id));
876 
877     // 8.1 read size of equiv class
878     in.read((char *)&vec_size, sizeof(vec_size));
879 
880     // 8.2 read each member
881     std::vector<int> tmp_vec;
882     tmp_vec.reserve(vec_size);
883     for (size_t j = 0; j < vec_size; ++j ) {
884       in.read((char *)&tmp_ecval, sizeof(tmp_ecval));
885       tmp_vec.push_back(tmp_ecval);
886     }
887     //ecmap.insert({tmp_id, tmp_vec});
888     ecmap[tmp_id] = tmp_vec;
889     ecmapinv.insert({tmp_vec, tmp_id});
890   }
891 
892   // 9. read in target ids
893   target_names_.clear();
894   target_names_.reserve(num_trans);
895 
896   size_t tmp_size;
897   size_t bufsz = 1024;
898   char *buffer = new char[bufsz];
899   for (auto i = 0; i < num_trans; ++i) {
900     // 9.1 read in the size
901     in.read((char *)&tmp_size, sizeof(tmp_size));
902 
903     if (tmp_size +1 > bufsz) {
904       delete[] buffer;
905       bufsz = 2*(tmp_size+1);
906       buffer = new char[bufsz];
907     }
908 
909     // clear the buffer
910     memset(buffer,0,bufsz);
911     // 9.2 read in the character string
912     in.read(buffer, tmp_size);
913 
914     /* std::string tmp_targ_id( buffer ); */
915     target_names_.push_back(std::string( buffer ));
916   }
917 
918 
919   // 10. read contigs
920   size_t contig_size;
921   in.read((char *)&contig_size, sizeof(contig_size));
922   dbGraph.contigs.clear();
923   dbGraph.contigs.reserve(contig_size);
924   for (auto i = 0; i < contig_size; i++) {
925     Contig c;
926     in.read((char *)&c.id, sizeof(c.id));
927     in.read((char *)&c.length, sizeof(c.length));
928     in.read((char *)&tmp_size, sizeof(tmp_size));
929 
930     if (tmp_size + 1 > bufsz) {
931       delete[] buffer;
932       bufsz = 2*(tmp_size+1);
933       buffer = new char[bufsz];
934     }
935 
936     memset(buffer,0,bufsz);
937     in.read(buffer, tmp_size);
938     c.seq = std::string(buffer); // copy
939 
940     // 10.1 read transcript info
941     in.read((char*)&tmp_size, sizeof(tmp_size));
942     c.transcripts.clear();
943     c.transcripts.reserve(tmp_size);
944 
945     for (auto j = 0; j < tmp_size; j++) {
946       ContigToTranscript info;
947       in.read((char*)&info.trid, sizeof(info.trid));
948       in.read((char*)&info.pos, sizeof(info.pos));
949       in.read((char*)&info.sense, sizeof(info.sense));
950       c.transcripts.push_back(info);
951     }
952 
953     dbGraph.contigs.push_back(c);
954   }
955 
956   // 11. read ecs info
957   dbGraph.ecs.clear();
958   dbGraph.ecs.reserve(contig_size);
959   int tmp_ec;
960   for (auto i = 0; i < contig_size; i++) {
961     in.read((char *)&tmp_ec, sizeof(tmp_ec));
962     dbGraph.ecs.push_back(tmp_ec);
963   }
964 
965   // delete the buffer
966   delete[] buffer;
967   buffer=nullptr;
968 
969   in.close();
970 }
971 
972 
mapPair(const char * s1,int l1,const char * s2,int l2,int ec) const973 int KmerIndex::mapPair(const char *s1, int l1, const char *s2, int l2, int ec) const {
974   bool d1 = true;
975   bool d2 = true;
976   int p1 = -1;
977   int p2 = -1;
978   int c1 = -1;
979   int c2 = -1;
980 
981 
982   KmerIterator kit1(s1), kit_end;
983 
984   bool found1 = false;
985   for (; kit1 != kit_end; ++kit1) {
986     Kmer x = kit1->first;
987     Kmer xr = x.rep();
988     auto search = kmap.find(xr);
989     bool forward = (x==xr);
990 
991     if (search != kmap.end()) {
992       found1 = true;
993       KmerEntry val = search->second;
994       c1 = val.contig;
995       if (forward == val.isFw()) {
996         p1 = val.getPos() - kit1->second;
997         d1 = true;
998       } else {
999         p1 = val.getPos() + k + kit1->second;
1000         d1 = false;
1001       }
1002       break;
1003     }
1004   }
1005 
1006   if (!found1) {
1007     return -1;
1008   }
1009 
1010 
1011 
1012   KmerIterator kit2(s2);
1013   bool found2 = false;
1014 
1015   for (; kit2 != kit_end; ++kit2) {
1016     Kmer x = kit2->first;
1017     Kmer xr = x.rep();
1018     auto search = kmap.find(xr);
1019     bool forward = (x==xr);
1020 
1021     if (search != kmap.end()) {
1022       found2 = true;
1023       KmerEntry val = search->second;
1024       c2 = val.contig;
1025       if (forward== val.isFw()) {
1026         p2 = val.getPos() - kit2->second;
1027         d2 = true;
1028       } else {
1029         p2 = val.getPos() + k + kit2->second;
1030         d2 = false;
1031       }
1032       break;
1033     }
1034   }
1035 
1036   if (!found2) {
1037     return -1;
1038   }
1039 
1040   if (c1 != c2) {
1041     return -1;
1042   }
1043 
1044   if ((d1 && d2) || (!d1 && !d2)) {
1045     //std::cerr << "Reads map to same strand " << s1 << "\t" << s2 << std::endl;
1046     return -1;
1047   }
1048 
1049   if (p1>p2) {
1050     return p1-p2;
1051   } else {
1052     return p2-p1;
1053   }
1054 
1055 }
1056 
1057 // use:  match(s,l,v)
1058 // pre:  v is initialized
1059 // post: v contains all equiv classes for the k-mers in s
match(const char * s,int l,std::vector<std::pair<KmerEntry,int>> & v) const1060 void KmerIndex::match(const char *s, int l, std::vector<std::pair<KmerEntry, int>>& v) const {
1061   KmerIterator kit(s), kit_end;
1062   bool backOff = false;
1063   int nextPos = 0; // nextPosition to check
1064   for (int i = 0;  kit != kit_end; ++i,++kit) {
1065     // need to check it
1066     auto search = kmap.find(kit->first.rep());
1067     int pos = kit->second;
1068 
1069     if (search != kmap.end()) {
1070 
1071       KmerEntry val = search->second;
1072 
1073       v.push_back({val, kit->second});
1074 
1075       // see if we can skip ahead
1076       // bring thisback later
1077       bool forward = (kit->first == search->first);
1078       int dist = val.getDist(forward);
1079 
1080 
1081       //const int lastbp = 10;
1082       if (dist >= 2) {
1083         // where should we jump to?
1084         int nextPos = pos+dist; // default jump
1085 
1086         if (pos + dist >= l-k) {
1087           // if we can jump beyond the read, check the end
1088           nextPos = l-k;
1089         }
1090 
1091         // check next position
1092         KmerIterator kit2(kit);
1093         kit2.jumpTo(nextPos);
1094         if (kit2 != kit_end) {
1095           Kmer rep2 = kit2->first.rep();
1096           auto search2 = kmap.find(rep2);
1097           bool found2 = false;
1098           int  found2pos = pos+dist;
1099           if (search2 == kmap.end()) {
1100             found2=true;
1101             found2pos = pos;
1102           } else if (val.contig == search2->second.contig) {
1103             found2=true;
1104             found2pos = pos+dist;
1105           }
1106           if (found2) {
1107             // great, a match (or nothing) see if we can move the k-mer forward
1108             if (found2pos >= l-k) {
1109               v.push_back({val, l-k}); // push back a fake position
1110               break; //
1111             } else {
1112               v.push_back({val, found2pos});
1113               kit = kit2; // move iterator to this new position
1114             }
1115           } else {
1116             // this is weird, let's try the middle k-mer
1117             bool foundMiddle = false;
1118             if (dist > 4) {
1119               int middlePos = (pos + nextPos)/2;
1120               int middleContig = -1;
1121               int found3pos = pos+dist;
1122               KmerIterator kit3(kit);
1123               kit3.jumpTo(middlePos);
1124               KmerEntry val3;
1125               if (kit3 != kit_end) {
1126                 Kmer rep3 = kit3->first.rep();
1127                 auto search3 = kmap.find(rep3);
1128                 if (search3 != kmap.end()) {
1129                   middleContig = search3->second.contig;
1130                   if (middleContig == val.contig) {
1131                     foundMiddle = true;
1132                     found3pos = middlePos;
1133                   } else if (middleContig == search2->second.contig) {
1134                     foundMiddle = true;
1135                     found3pos = pos+dist;
1136                   }
1137                 }
1138 
1139 
1140                 if (foundMiddle) {
1141                   v.push_back({search3->second, found3pos});
1142                   if (nextPos >= l-k) {
1143                     break;
1144                   } else {
1145                     kit = kit2;
1146                   }
1147                 }
1148               }
1149             }
1150 
1151 
1152             if (!foundMiddle) {
1153               ++kit;
1154               backOff = true;
1155               goto donejumping; // sue me Dijkstra!
1156             }
1157           }
1158         } else {
1159           // the sequence is messed up at this point, let's just take the match
1160           //v.push_back({dbGraph.ecs[val.contig], l-k});
1161           break;
1162         }
1163       }
1164     }
1165 
1166 donejumping:
1167 
1168     if (backOff) {
1169       // backup plan, let's play it safe and search incrementally for the rest, until nextStop
1170       for (int j = 0; kit != kit_end; ++kit,++j) {
1171         if (j==skip) {
1172           j=0;
1173         }
1174         if (j==0) {
1175           // need to check it
1176           Kmer rep = kit->first.rep();
1177           auto search = kmap.find(rep);
1178           if (search != kmap.end()) {
1179             // if k-mer found
1180             v.push_back({search->second, kit->second}); // add equivalence class, and position
1181           }
1182         }
1183 
1184         if (kit->second >= nextPos) {
1185           backOff = false;
1186           break; // break out of backoff for loop
1187         }
1188       }
1189     }
1190   }
1191 }
1192 
findPosition(int tr,Kmer km,int p) const1193 std::pair<int,bool> KmerIndex::findPosition(int tr, Kmer km, int p) const {
1194   auto it = kmap.find(km.rep());
1195   if (it != kmap.end()) {
1196     KmerEntry val = it->second;
1197     return findPosition(tr, km, val, p);
1198   } else {
1199     return {-1,true};
1200   }
1201 }
1202 
1203 //use:  (pos,sense) = index.findPosition(tr,km,val,p)
1204 //pre:  index.kmap[km] == val,
1205 //      km is the p-th k-mer of a read
1206 //      val.contig maps to tr
1207 //post: km is found in position pos (1-based) on the sense/!sense strand of tr
findPosition(int tr,Kmer km,KmerEntry val,int p) const1208 std::pair<int,bool> KmerIndex::findPosition(int tr, Kmer km, KmerEntry val, int p) const {
1209   bool fw = (km == km.rep());
1210   bool csense = (fw == val.isFw());
1211 
1212   int trpos = -1;
1213   bool trsense = true;
1214   if (val.contig < 0) {
1215     return {-1, true};
1216   }
1217   const Contig &c = dbGraph.contigs[val.contig];
1218   for (auto x : c.transcripts) {
1219     if (x.trid == tr) {
1220       trpos = x.pos;
1221       trsense = x.sense;
1222       break;
1223     }
1224   }
1225 
1226   if (trpos == -1) {
1227     return {-1,true};
1228   }
1229 
1230 
1231   if (trsense) {
1232     if (csense) {
1233       return {trpos + val.getPos() - p + 1, csense}; // 1-based, case I
1234     } else {
1235       return {trpos + val.getPos() + k + p, csense}; // 1-based, case III
1236     }
1237   } else {
1238     if (csense) {
1239       return {trpos + (c.length - val.getPos() -1) + k + p, !csense};  // 1-based, case IV
1240     } else {
1241       return {trpos + (c.length - val.getPos())  - p, !csense}; // 1-based, case II
1242     }
1243   }
1244 }
1245 
1246 /*
1247 // use:  r = matchEnd(s,l,v,p)
1248 // pre:  v is initialized, p>=0
1249 // post: v contains all equiv classes for the k-mer in s, as
1250 //       well as the best match for s[p:]
1251 //       if r is false then no match was found
1252 bool KmerIndex::matchEnd(const char *s, int l, std::vector<std::pair<int,int>> &v, int maxPos) const {
1253   // kmer-iterator checks for N's and out of bounds
1254   KmerIterator kit(s+maxPos), kit_end;
1255   if (kit != kit_end && kit->second == 0) {
1256     Kmer last = kit->first;
1257     auto search = kmap.find(last.rep());
1258 
1259     if (search == kmap.end()) {
1260       return false; // shouldn't happen
1261     }
1262 
1263     KmerEntry val = search->second;
1264     bool forward = (kit->first == search->first);
1265     int dist = val.getDist(forward);
1266     int pos = maxPos + dist + 1; // move 1 past the end of the contig
1267 
1268     const char* sLeft = s + pos + (k-1);
1269     int sizeleft = l -  (pos + k-1); // characters left in sleft
1270     if (sizeleft <= 0) {
1271       return false; // nothing left to match
1272     }
1273 
1274     // figure out end k-mer
1275     const Contig& root = dbGraph.contigs[val.contig];
1276     Kmer end; // last k-mer
1277     bool readFw = (forward == val.isFw());
1278     if (readFw) {
1279       end = Kmer(root.seq.c_str() + root.length-1);
1280     } else {
1281       end = Kmer(root.seq.c_str()).twin();
1282     }
1283 
1284 
1285     int bestContig = -1;
1286     int bestDist = sizeleft;
1287     int numBest = 0;
1288     for (int i = 0; i < 4; i++) {
1289       Kmer x = end.forwardBase(Dna(i));
1290       Kmer xr = x.rep();
1291       auto searchx = kmap.find(xr);
1292       if (searchx != kmap.end()) {
1293         KmerEntry valx = searchx->second;
1294         const Contig& branch = dbGraph.contigs[valx.contig];
1295         if (branch.length < sizeleft) {
1296           return false; // haven't implemented graph walks yet
1297         }
1298         std::string cs;
1299         bool contigFw = (x==xr) && valx.isFw();
1300         // todo: get rid of this string copy
1301         if (valx.getPos() == 0 && contigFw) {
1302           cs = branch.seq.substr(k-1);
1303         } else if (valx.getPos() == branch.length-1 && !contigFw) {
1304           cs = revcomp(branch.seq).substr(k-1);
1305         }
1306         int hdist = hamming(sLeft,cs.c_str());
1307         if (bestDist >= hdist) {
1308           numBest++;
1309           if (bestDist > hdist) {
1310             bestDist = hdist;
1311             numBest = 1;
1312           }
1313           bestContig = valx.contig;
1314         }
1315       }
1316     }
1317 
1318     if (numBest == 1 && bestDist < 10) {
1319       v.push_back({dbGraph.ecs[bestContig], l-k});
1320       return true;
1321     } else {
1322       return false;
1323     }
1324   } else {
1325     return false;
1326   }
1327 }
1328 */
1329 
1330 
1331 // use:  res = intersect(ec,v)
1332 // pre:  ec is in ecmap, v is a vector of valid targets
1333 //       v is sorted in increasing order
1334 // post: res contains the intersection  of ecmap[ec] and v sorted increasing
1335 //       res is empty if ec is not in ecma
intersect(int ec,const std::vector<int> & v) const1336 std::vector<int> KmerIndex::intersect(int ec, const std::vector<int>& v) const {
1337   std::vector<int> res;
1338   //auto search = ecmap.find(ec);
1339   if (ec < ecmap.size()) {
1340     //if (search != ecmap.end()) {
1341     //auto& u = search->second;
1342     auto& u = ecmap[ec];
1343     res.reserve(v.size());
1344 
1345     auto a = u.begin();
1346     auto b = v.begin();
1347 
1348     while (a != u.end() && b != v.end()) {
1349       if (*a < *b) {
1350         ++a;
1351       } else if (*b < *a) {
1352         ++b;
1353       } else {
1354         // match
1355         res.push_back(*a);
1356         ++a;
1357         ++b;
1358       }
1359     }
1360   }
1361   return res;
1362 }
1363 
1364 
loadTranscriptSequences() const1365 void KmerIndex::loadTranscriptSequences() const {
1366   if (target_seqs_loaded) {
1367     return;
1368   }
1369 
1370 
1371 
1372   std::vector<std::vector<std::pair<int, ContigToTranscript>>> trans_contigs(num_trans);
1373   for (auto &c : dbGraph.contigs) {
1374     for (auto &ct : c.transcripts) {
1375       trans_contigs[ct.trid].push_back({c.id, ct});
1376     }
1377   }
1378 
1379   auto &target_seqs = const_cast<std::vector<std::string>&>(target_seqs_);
1380 
1381   for (int i = 0; i < trans_contigs.size(); i++) {
1382     auto &v = trans_contigs[i];
1383     std::sort(v.begin(), v.end(), [](std::pair<int,ContigToTranscript> a, std::pair<int,ContigToTranscript> b) {
1384         return a.second.pos < b.second.pos;
1385       });
1386 
1387     std::string seq;
1388     seq.reserve(target_lens_[i]);
1389 
1390     for (auto &pct : v) {
1391       auto ct = pct.second;
1392       int start = (ct.pos==0) ? 0 : k-1;
1393       const auto& contig = dbGraph.contigs[pct.first];
1394       if (ct.sense) {
1395         seq.append(contig.seq.substr(start));
1396       } else {
1397         seq.append(revcomp(contig.seq).substr(start));
1398       }
1399     }
1400     target_seqs.push_back(seq);
1401   }
1402 
1403   bool &t = const_cast<bool&>(target_seqs_loaded);
1404   t = true;//target_seqs_loaded = true;
1405   return;
1406 }
1407 
clear()1408 void KmerIndex::clear() {
1409   kmap.clear_table();
1410   ecmap.resize(0);
1411   dbGraph.ecs.resize(0);
1412   dbGraph.contigs.resize(0);
1413   {
1414     std::unordered_map<std::vector<int>, int, SortedVectorHasher> empty;
1415     std::swap(ecmapinv, empty);
1416   }
1417 
1418   target_lens_.resize(0);
1419   target_names_.resize(0);
1420   target_seqs_.resize(0);
1421 }
1422 
writePseudoBamHeader(std::ostream & o) const1423 void KmerIndex::writePseudoBamHeader(std::ostream &o) const {
1424   // write out header
1425   o << "@HD\tVN:1.0\n";
1426   for (int i = 0; i < num_trans; i++) {
1427     o << "@SQ\tSN:" << target_names_[i] << "\tLN:" << target_lens_[i] << "\n";
1428   }
1429   o << "@PG\tID:kallisto\tPN:kallisto\tVN:"<< KALLISTO_VERSION << "\n";
1430   o.flush();
1431 }
1432