1 #include "GeneModel.h"
2 #include <assert.h>
3 #include <iostream>
4 #include <htslib/sam.h> // needed for CIGAR ops
5 #include <zlib.h>
6 
strandToChar(bool s)7 char strandToChar(bool s) {
8   return (s) ? '+' : '-';
9 }
10 
charToStrand(char c)11 bool charToStrand(char c) {
12   switch (c) {
13     case '+':
14       return true; break;
15     case '-':
16       return false; break;
17     default:
18       return true; break;
19   }
20 }
21 
chrLookup(const Transcriptome & model,const std::string chr)22 int chrLookup(const Transcriptome& model, const std::string chr) {
23   auto cit = model.chrNameToId.find(chr);
24   if (cit != model.chrNameToId.end()) {
25     int c = cit->second;
26     assert(c >= 0);
27     //assert(c < chr.size());
28     return c;
29   } else {
30     return -1;
31   }
32 };
33 
translateTrPosition(const int tr,const int pos,const int rlen,bool strand,TranscriptAlignment & aln) const34 bool Transcriptome::translateTrPosition(const int tr, const int pos, const int rlen, bool strand, TranscriptAlignment &aln) const {
35   const TranscriptModel& model = transcripts[tr];
36   if (model.chr == -1) {
37     return false;
38   }
39   aln.chr = model.chr;
40   aln.cigar.clear();
41   aln.chrpos = -1;
42 
43 
44   aln.strand = (strand == model.strand);
45   int trpos;
46   int rpos = 0; // how many bp of read have been matched
47 
48   int n_exons = model.exons.size();
49 
50   if (model.strand) {
51     trpos = pos;
52     if (trpos < 0) {
53       // read starts a bit before the transcript, softclip to fit
54       int softclip = -trpos;
55       aln.cigar.push_back((softclip << BAM_CIGAR_SHIFT) | BAM_CSOFT_CLIP);
56       rpos += softclip;
57       aln.chrpos = model.start;
58     }
59     for (int i = 0; i < n_exons; i++) {
60       auto& exon = model.exons[i];
61       int len = exon.stop - exon.start;
62       if (trpos < len) {
63         if (rpos == 0) {
64           aln.chrpos = exon.start + trpos; // left-most position
65         }
66         if (trpos + rlen <= len) {
67           aln.cigar.push_back(((rlen-rpos)<< BAM_CIGAR_SHIFT) | BAM_CMATCH); // rlen-trpos of M
68           rpos = rlen;
69           break; // end of the read
70         } else {
71           int mlen = 0;
72           if (trpos < 0) { // match begins before this exon, extends over it
73             mlen = len;
74           } else {
75             mlen = len - trpos;
76           }
77           aln.cigar.push_back((mlen << BAM_CIGAR_SHIFT) | BAM_CMATCH);
78           if (i +1 < n_exons) {
79             aln.cigar.push_back(((model.exons[i+1].start-exon.stop) << BAM_CIGAR_SHIFT) | BAM_CREF_SKIP); // insert a bunch of N's
80           }
81           rpos += mlen;
82         }
83       }
84       trpos -= len;
85     }
86     if (rpos < rlen) {
87       aln.cigar.push_back(((rlen-rpos) << BAM_CIGAR_SHIFT) | BAM_CSOFT_CLIP);
88     }
89   } else {
90     trpos = model.length - pos - rlen; // counting from back from right most transcript position
91     if (trpos < 0) {
92       int softclip = -trpos;
93       aln.cigar.push_back((softclip << BAM_CIGAR_SHIFT) | BAM_CSOFT_CLIP); // soft clip
94       rpos += softclip;
95       aln.chrpos = model.start;
96     }
97     for (int i = n_exons-1; i >= 0; i--) {
98       auto& exon = model.exons[i];
99       int len = exon.stop - exon.start;
100       if (trpos < len) {
101         if (rpos == 0) {
102           aln.chrpos = exon.start + trpos;
103         }
104         if (trpos + rlen <= len) {
105           aln.cigar.push_back(((rlen-rpos) << BAM_CIGAR_SHIFT) | BAM_CMATCH);
106           rpos = rlen;
107           break;
108         } else {
109           int mlen = 0;
110           if (trpos < 0) { // match begins before this exon, extends over it
111             mlen = len;
112           } else {
113             mlen = len - trpos;
114           }
115           aln.cigar.push_back((mlen << BAM_CIGAR_SHIFT) | BAM_CMATCH);
116           if (i > 0) {
117             aln.cigar.push_back(((model.exons[i-1].start - exon.stop) << BAM_CIGAR_SHIFT) | BAM_CREF_SKIP);
118           }
119           rpos += mlen;
120         }
121       }
122       trpos -= len;
123     }
124     if (rpos < rlen) {
125       aln.cigar.push_back(((rlen-rpos) << BAM_CIGAR_SHIFT) | BAM_CSOFT_CLIP);
126     }
127   }
128 
129 
130   return true;
131 }
132 
loadChromosomes(const std::string & chrom_fn)133 void Transcriptome::loadChromosomes(const std::string &chrom_fn) {
134   std::ifstream in(chrom_fn);
135   std::string type;
136   while (in.good()) {
137     int id = chr.size();
138     Chromosome c;
139     c.len = -1;
140     in >> c.name >> c.len;
141     if (!c.name.empty() && c.len >= 0) {
142       chr.push_back(c);
143       chrNameToId.insert({c.name,id});
144     }
145     std::getline(in,type); // clear end of line
146   }
147 }
148 
149 /*
150 void Transcriptome::loadTranscriptome(const KmerIndex& index, std::istream &in, const ProgramOptions& options) {
151   std::string line;
152   std::string segment;
153   std::string type;
154   std::string gtype, ttype;
155   std::string gene_id, tr_id, chr_id;
156 
157 
158   for (int i = 0; i < index.num_trans; i++) {
159     TranscriptModel tr;
160     tr.id = i;
161     tr.chr = -1;
162     tr.gene_id = -1;
163     tr.length = -1;
164     tr.strand = true;
165     transcripts.push_back(std::move(tr));
166     trNameToId.insert({index.target_names_[i], i});
167   }
168 
169 
170 
171 
172   int tr_extras = 0;
173   //std::unordered_map<std::string,int>
174   while (in.good()) {
175     in >> type;
176     if (type == "GENE") {
177       GeneModel model;
178       char strand = '?';
179       in >> model.name >> model.commonName >> chr_id >> gtype >> strand >> model.start >> model.stop;
180       model.chr = chrLookup(*this, chr_id);
181       if (model.chr == -1) {
182         std::cerr << "Error: chromosome " << chr_id << " not defined before reference" << std::endl;
183         assert(false);
184       }
185       model.id =  genes.size();
186       model.strand = charToStrand(strand);
187 
188       geneNameToId.insert({model.name, model.id});
189       in >> line; // rest of transcripts, ignore for now
190       genes.push_back(model);
191 
192     } else if (type == "TRANSCRIPT") {
193       TranscriptModel model;
194       char strand = '?';
195       in >> tr_id >> gene_id >> chr_id >> ttype >> strand >> model.start >> model.stop;
196       auto it = trNameToId.find(tr_id);
197       if (it != trNameToId.end()) {
198         model.id = it->second;
199       } else {
200         tr_extras++;
201         //std::cerr << "Warning: transcript " << tr_id << " is defined in GTF but not in FASTA" << std::endl;
202         std::getline(in,line);
203         continue;
204       }
205 
206       model.chr = chrLookup(*this, chr_id);
207       if (model.chr == -1) {
208         std::cerr << "Error: chromosome " << chr_id << " not definede before reference" << std::endl;
209         assert(false);
210       }
211 
212       model.strand = charToStrand(strand);
213       model.name = tr_id;
214 
215       in >> line;
216       std::stringstream strline(line);
217       while (std::getline(strline, segment, ';')) {
218         ExonModel ex;
219         ex.chr = model.chr;
220         ex.strand = model.strand;
221         int p = segment.find(',');
222         ex.start = std::stoi(segment.substr(0,p));
223         ex.stop = std::stoi(segment.substr(p+1));
224         model.exons.push_back(std::move(ex));
225       }
226       model.length = 0;
227       for (auto & exon : model.exons) {
228         model.length += exon.stop - exon.start;
229       }
230       std::getline(in, line);
231       int id = model.id;
232       if (transcripts[id].chr == -1) {
233         transcripts[id] = std::move(model);
234       }
235 
236     } else if (type == "CHROMOSOME") {
237       int id = chr.size();
238       Chromosome c;
239       in >> c.name >> c.len;
240       chr.push_back(c);
241       chrNameToId.insert({c.name,id});
242       std::getline(in,line);
243     } else {
244       std::getline(in, line); // read until end of line
245     }
246   }
247 
248   if (tr_extras > 0) {
249     std::cerr << "Warning: " << tr_extras << " transcript defined in GTF but not found in FASTA" << std::endl;
250   }
251 
252   int tr_bad = 0;
253   for (int i = 0; i < index.num_trans; i++) {
254     const auto& tr = transcripts[i];
255     if (tr.chr == -1) {
256       tr_bad++;
257     }
258   }
259   if (tr_bad > 0) {
260     std::cerr << "Warning: there were " << tr_bad << " transcripts out of "
261               << index.num_trans << " that are missing in the GTF file " << std::endl;
262   }
263 
264 }
265 */
266 
267 
addGTFLine(const std::string & line,const KmerIndex & index,bool guessChromosomes)268 int Transcriptome::addGTFLine(const std::string &line, const KmerIndex& index, bool guessChromosomes) {
269 
270   if(line.empty() || line[0] == '#') {
271     return 0;
272   }
273   int p = 0, t=0;
274   // read chr
275   std::string schr;
276   t = line.find('\t',p);
277   schr = line.substr(p,t-p);
278   t = line.find('\t',t+1); // skip annotation source
279   p = t+1;
280   t = line.find('\t',p);
281   std::string typestr = line.substr(p,t-p);
282   enum Type {GENE, TRANSCRIPT, EXON, OTHER};
283   Type type;
284   if (typestr == "gene") {
285     type = Type::GENE;
286     // need id, name, chr, coding, strand, start, stop, transcripts
287   } else if (typestr == "transcript") {
288     type = Type::TRANSCRIPT;
289     //id, gene, chr, coding, strand, start, stop, exons: start,stop; (increasing for +, decreasing for -)
290   } else if (typestr == "exon") {
291     type = Type::EXON;
292   } else {
293     type = Type::OTHER;
294     return 0;
295   }
296   p = t+1;
297   t = line.find('\t',p);
298   int start = std::stoi(line.substr(p,t-p))-1;
299   p = t+1;
300   t = line.find('\t',p);
301   int stop = std::stoi(line.substr(p,t-p));
302   t = line.find('\t',t+1);
303   p = t+1; // skip score
304   t = line.find('\t',p);
305   char strand = line[p];
306   p = t+1;
307   t = line.find('\t',p); // phase, ignore
308   p = t+1;
309 
310   GeneModel gmodel;
311   TranscriptModel tmodel;
312   ExonModel emodel;
313 
314   int ichr = chrLookup(*this, schr);
315   if (ichr == -1) {
316     if (guessChromosomes) {
317       // just add a new chr on the fly
318       int i = chr.size();
319       Chromosome c;
320       c.name = schr;
321       c.len = 536870911; // maximum that bai can index :(
322       chr.push_back(c);
323       chrNameToId.insert({schr, i});
324       ichr = i;
325     } else {
326       return 1; // couldn't find chrom
327     }
328   }
329 
330   if (type == Type::GENE) {
331     gmodel.id = genes.size(); // ?? ever used?
332     gmodel.chr = ichr;
333     gmodel.start = start;
334     gmodel.stop = stop;
335     gmodel.strand = (strand == '+') ? true : false;
336   } else if (type == Type::TRANSCRIPT) {
337     tmodel.chr = ichr;
338     tmodel.start = start;
339     tmodel.stop = stop;
340     tmodel.strand = (strand == '+') ? true : false;
341     tmodel.gene_id = -1;
342   } else if (type == Type::EXON) {
343     emodel.chr = ichr;
344     emodel.start = start;
345     emodel.stop = stop;
346     emodel.strand = (strand == '+') ? true : false;
347   }
348 
349 
350   // line[p:] contains the additional fields as 'key "value";'
351   // line[p:t] is key, line[t+2:]
352   int s = 0;
353   std::string key,value;
354   int keycount = 0;
355   std::string gversion, tversion, gene_name, transcript_name;
356   while (p != std::string::npos) {
357     if ((t = line.find('"',p))== std::string::npos) {
358       break;
359     }
360     if ((s = line.find('"',t+1)) == std::string::npos) {
361       break;
362     }
363     assert(line[t-1] == ' ');
364     key.assign(line.substr(p,t-p-1));
365     assert(s > t);
366     value.assign(line.substr(t+1,s-t-1));
367 
368     // common values
369     if (key == "gene_id") {
370       keycount++;
371       gene_name = std::move(value);
372     } else if (key == "gene_version") {
373       keycount++;
374       gversion = std::move(value);
375     }
376 
377 
378     if (type == Type::GENE) {
379       if (key == "gene_name") {
380         keycount++;
381         gmodel.commonName = std::move(value);
382       } else if (key == "gene_id")
383 
384       if (keycount == 3) {
385         break;
386       }
387     } else {
388       if (key == "transcript_id") {
389         keycount++;
390         transcript_name = std::move(value);
391       } else if (key == "transcript_version") {
392         keycount++;
393         tversion = std::move(value);
394       }
395 
396       if (type == Type::TRANSCRIPT) {
397         if (keycount == 4) {
398           break;
399         }
400       } else if (type == Type::EXON) {
401         if (keycount == 4) {
402           break;
403         }
404       }
405     }
406 
407     assert(s +1 < line.size());
408     assert(line[s+1] == ';');
409     if ((p = line.find(' ',s)) != std::string::npos) {
410       p++;
411       if (p >= line.size()) {
412         break;
413       }
414     }
415   }
416 
417   if (type == Type::GENE) {
418     gmodel.name = gene_name;
419     assert(!gmodel.name.empty());
420     if (!gversion.empty() && gmodel.name.find('.') == std::string::npos) {
421       gmodel.name += "." + gversion;
422     }
423     // add to the transcriptome model
424     geneNameToId.insert({gmodel.name, gmodel.id});
425     genes.push_back(std::move(gmodel));
426   } else if (type == Type::TRANSCRIPT) {
427     tmodel.name = transcript_name;
428     assert(!tmodel.name.empty());
429 
430     // canonical name includes version number
431     if (!tversion.empty() && tmodel.name.find('.') == std::string::npos) {
432       tmodel.name += "." + tversion;
433     }
434     auto it2 = trNameToId.find(tmodel.name);
435     if (it2 == trNameToId.end()) {
436       tmodel.name = transcript_name; // try without version number
437       it2 = trNameToId.find(tmodel.name);
438     }
439 
440 
441     if (it2 != trNameToId.end()) {
442       tmodel.id = it2->second;
443       tmodel.length = index.target_lens_[tmodel.id];
444     } else {
445       // transcript not found, do nothing
446       return 2;
447     }
448 
449     std::string tmp_genename = gene_name; // copy
450     if (!gversion.empty() && gmodel.name.find('.') == std::string::npos) {
451       gene_name += "." + gversion;
452     }
453     auto it = geneNameToId.find(gene_name);
454     if (it == geneNameToId.end()) {
455       gene_name = tmp_genename;
456       it = geneNameToId.find(gene_name);
457     }
458 
459     if (it != geneNameToId.end()) {
460       tmodel.gene_id = it->second;
461     }
462 
463     int id = tmodel.id;
464     if (transcripts[id].chr == -1) {
465       transcripts[id] = std::move(tmodel);
466     }
467   } else if (type == Type::EXON) {
468     std::string tmp_transcript_name = transcript_name;
469     if (!tversion.empty() && transcript_name.find('.') == std::string::npos) {
470       transcript_name += "." + tversion;
471     }
472 
473     auto it = trNameToId.find(transcript_name);
474     if (it == trNameToId.end()) {
475       transcript_name = tmp_transcript_name;
476       it = trNameToId.find(transcript_name);
477     }
478 
479     if (it != trNameToId.end()) {
480       auto& tm = transcripts[it->second];
481       if (tm.chr != -1) {
482         tm.exons.push_back(std::move(emodel));
483       }
484     }
485   }
486 
487 
488 
489   return 0;
490 }
491 
parseGTF(const std::string & gtf_fn,const KmerIndex & index,const ProgramOptions & options,bool guessChromosomes)492 void Transcriptome::parseGTF(const std::string &gtf_fn, const KmerIndex& index, const ProgramOptions& options, bool guessChromosomes) {
493 
494   for (int i = 0; i < index.num_trans; i++) {
495     TranscriptModel tr;
496     tr.id = i;
497     tr.chr = -1;
498     tr.gene_id = -1;
499     tr.strand = true;
500     transcripts.push_back(std::move(tr));
501     trNameToId.insert({index.target_names_[i], i});
502   }
503 
504 
505   int num_chrom_missing = 0, num_trans_missing = 0;
506   int buf_size = 1<<22; // line is < 4M
507   char* buf = new char[buf_size+1];
508   buf[buf_size] = 0;
509   int buf_end = 0;
510   int bufread = 0;
511   int pos = 0;
512   char *pos_end = buf;
513   gzFile file = gzopen(gtf_fn.c_str(), "r");
514   std::string line;
515   if (!file) {
516     std::cerr << "Error: could not open file " << gtf_fn << std::endl;
517     return;
518   }
519 
520   bool done_reading = false;
521   while (!done_reading) {
522     // buf[0:pos] contains a previously unprocessed line
523     int to_read = (buf+buf_size) - pos_end;
524     bufread = gzread(file, pos_end, to_read);
525     buf_end = (pos_end - buf) + bufread;
526     if (bufread < to_read) {
527       if (gzeof(file)) {
528         done_reading = true;
529       }
530     }
531     while (true) {
532       pos_end = std::strchr(buf+pos, '\n');
533       if (pos_end == nullptr) {
534         pos_end = buf + buf_end+1;
535         if (!done_reading) {
536           break;
537         }
538       }
539 
540       // line goes from buf[pos:pos_end]
541       line.assign(buf+pos, pos_end - (buf+pos));
542       int r = addGTFLine(line, index, guessChromosomes);
543       if (r == 1) {
544         num_chrom_missing++;
545       } else if (r==2) {
546         num_trans_missing++;
547       }
548 
549       pos = pos_end - buf + 1;
550       if (pos >= buf_end) {
551         break;
552       }
553       assert(0 <= pos);
554       assert(pos < buf_size);
555     }
556 
557     // buf[pos:bufend] contains now new line,
558     if (done_reading) {
559       break;
560     }
561     int leftover = buf_end - pos;
562     if (leftover > 0) {
563       std::memmove(buf, buf+pos,leftover);
564     }
565     size_t toset = (buf_size - leftover);
566     std::memset(buf+leftover, 0, toset);
567     pos_end = buf + leftover;
568     pos = 0;
569 
570   }
571 
572 
573 
574   delete[] buf;
575   gzclose(file);
576 
577   if (num_chrom_missing > 0) {
578     std::cerr << "Warning: could not find chromosomes for " << num_chrom_missing << " transcripts" << std::endl;
579   }
580   if (num_trans_missing > 0) {
581     std::cerr << "Warning: " << num_trans_missing << " transcripts were defined in GTF file, but not in the index" << std::endl;
582   }
583 }
584