1 #include "GeneModel.h"
2 #include <assert.h>
3 #include <iostream>
4 #include <htslib/sam.h> // needed for CIGAR ops
5 #include <zlib.h>
6
strandToChar(bool s)7 char strandToChar(bool s) {
8 return (s) ? '+' : '-';
9 }
10
charToStrand(char c)11 bool charToStrand(char c) {
12 switch (c) {
13 case '+':
14 return true; break;
15 case '-':
16 return false; break;
17 default:
18 return true; break;
19 }
20 }
21
chrLookup(const Transcriptome & model,const std::string chr)22 int chrLookup(const Transcriptome& model, const std::string chr) {
23 auto cit = model.chrNameToId.find(chr);
24 if (cit != model.chrNameToId.end()) {
25 int c = cit->second;
26 assert(c >= 0);
27 //assert(c < chr.size());
28 return c;
29 } else {
30 return -1;
31 }
32 };
33
translateTrPosition(const int tr,const int pos,const int rlen,bool strand,TranscriptAlignment & aln) const34 bool Transcriptome::translateTrPosition(const int tr, const int pos, const int rlen, bool strand, TranscriptAlignment &aln) const {
35 const TranscriptModel& model = transcripts[tr];
36 if (model.chr == -1) {
37 return false;
38 }
39 aln.chr = model.chr;
40 aln.cigar.clear();
41 aln.chrpos = -1;
42
43
44 aln.strand = (strand == model.strand);
45 int trpos;
46 int rpos = 0; // how many bp of read have been matched
47
48 int n_exons = model.exons.size();
49
50 if (model.strand) {
51 trpos = pos;
52 if (trpos < 0) {
53 // read starts a bit before the transcript, softclip to fit
54 int softclip = -trpos;
55 aln.cigar.push_back((softclip << BAM_CIGAR_SHIFT) | BAM_CSOFT_CLIP);
56 rpos += softclip;
57 aln.chrpos = model.start;
58 }
59 for (int i = 0; i < n_exons; i++) {
60 auto& exon = model.exons[i];
61 int len = exon.stop - exon.start;
62 if (trpos < len) {
63 if (rpos == 0) {
64 aln.chrpos = exon.start + trpos; // left-most position
65 }
66 if (trpos + rlen <= len) {
67 aln.cigar.push_back(((rlen-rpos)<< BAM_CIGAR_SHIFT) | BAM_CMATCH); // rlen-trpos of M
68 rpos = rlen;
69 break; // end of the read
70 } else {
71 int mlen = 0;
72 if (trpos < 0) { // match begins before this exon, extends over it
73 mlen = len;
74 } else {
75 mlen = len - trpos;
76 }
77 aln.cigar.push_back((mlen << BAM_CIGAR_SHIFT) | BAM_CMATCH);
78 if (i +1 < n_exons) {
79 aln.cigar.push_back(((model.exons[i+1].start-exon.stop) << BAM_CIGAR_SHIFT) | BAM_CREF_SKIP); // insert a bunch of N's
80 }
81 rpos += mlen;
82 }
83 }
84 trpos -= len;
85 }
86 if (rpos < rlen) {
87 aln.cigar.push_back(((rlen-rpos) << BAM_CIGAR_SHIFT) | BAM_CSOFT_CLIP);
88 }
89 } else {
90 trpos = model.length - pos - rlen; // counting from back from right most transcript position
91 if (trpos < 0) {
92 int softclip = -trpos;
93 aln.cigar.push_back((softclip << BAM_CIGAR_SHIFT) | BAM_CSOFT_CLIP); // soft clip
94 rpos += softclip;
95 aln.chrpos = model.start;
96 }
97 for (int i = n_exons-1; i >= 0; i--) {
98 auto& exon = model.exons[i];
99 int len = exon.stop - exon.start;
100 if (trpos < len) {
101 if (rpos == 0) {
102 aln.chrpos = exon.start + trpos;
103 }
104 if (trpos + rlen <= len) {
105 aln.cigar.push_back(((rlen-rpos) << BAM_CIGAR_SHIFT) | BAM_CMATCH);
106 rpos = rlen;
107 break;
108 } else {
109 int mlen = 0;
110 if (trpos < 0) { // match begins before this exon, extends over it
111 mlen = len;
112 } else {
113 mlen = len - trpos;
114 }
115 aln.cigar.push_back((mlen << BAM_CIGAR_SHIFT) | BAM_CMATCH);
116 if (i > 0) {
117 aln.cigar.push_back(((model.exons[i-1].start - exon.stop) << BAM_CIGAR_SHIFT) | BAM_CREF_SKIP);
118 }
119 rpos += mlen;
120 }
121 }
122 trpos -= len;
123 }
124 if (rpos < rlen) {
125 aln.cigar.push_back(((rlen-rpos) << BAM_CIGAR_SHIFT) | BAM_CSOFT_CLIP);
126 }
127 }
128
129
130 return true;
131 }
132
loadChromosomes(const std::string & chrom_fn)133 void Transcriptome::loadChromosomes(const std::string &chrom_fn) {
134 std::ifstream in(chrom_fn);
135 std::string type;
136 while (in.good()) {
137 int id = chr.size();
138 Chromosome c;
139 c.len = -1;
140 in >> c.name >> c.len;
141 if (!c.name.empty() && c.len >= 0) {
142 chr.push_back(c);
143 chrNameToId.insert({c.name,id});
144 }
145 std::getline(in,type); // clear end of line
146 }
147 }
148
149 /*
150 void Transcriptome::loadTranscriptome(const KmerIndex& index, std::istream &in, const ProgramOptions& options) {
151 std::string line;
152 std::string segment;
153 std::string type;
154 std::string gtype, ttype;
155 std::string gene_id, tr_id, chr_id;
156
157
158 for (int i = 0; i < index.num_trans; i++) {
159 TranscriptModel tr;
160 tr.id = i;
161 tr.chr = -1;
162 tr.gene_id = -1;
163 tr.length = -1;
164 tr.strand = true;
165 transcripts.push_back(std::move(tr));
166 trNameToId.insert({index.target_names_[i], i});
167 }
168
169
170
171
172 int tr_extras = 0;
173 //std::unordered_map<std::string,int>
174 while (in.good()) {
175 in >> type;
176 if (type == "GENE") {
177 GeneModel model;
178 char strand = '?';
179 in >> model.name >> model.commonName >> chr_id >> gtype >> strand >> model.start >> model.stop;
180 model.chr = chrLookup(*this, chr_id);
181 if (model.chr == -1) {
182 std::cerr << "Error: chromosome " << chr_id << " not defined before reference" << std::endl;
183 assert(false);
184 }
185 model.id = genes.size();
186 model.strand = charToStrand(strand);
187
188 geneNameToId.insert({model.name, model.id});
189 in >> line; // rest of transcripts, ignore for now
190 genes.push_back(model);
191
192 } else if (type == "TRANSCRIPT") {
193 TranscriptModel model;
194 char strand = '?';
195 in >> tr_id >> gene_id >> chr_id >> ttype >> strand >> model.start >> model.stop;
196 auto it = trNameToId.find(tr_id);
197 if (it != trNameToId.end()) {
198 model.id = it->second;
199 } else {
200 tr_extras++;
201 //std::cerr << "Warning: transcript " << tr_id << " is defined in GTF but not in FASTA" << std::endl;
202 std::getline(in,line);
203 continue;
204 }
205
206 model.chr = chrLookup(*this, chr_id);
207 if (model.chr == -1) {
208 std::cerr << "Error: chromosome " << chr_id << " not definede before reference" << std::endl;
209 assert(false);
210 }
211
212 model.strand = charToStrand(strand);
213 model.name = tr_id;
214
215 in >> line;
216 std::stringstream strline(line);
217 while (std::getline(strline, segment, ';')) {
218 ExonModel ex;
219 ex.chr = model.chr;
220 ex.strand = model.strand;
221 int p = segment.find(',');
222 ex.start = std::stoi(segment.substr(0,p));
223 ex.stop = std::stoi(segment.substr(p+1));
224 model.exons.push_back(std::move(ex));
225 }
226 model.length = 0;
227 for (auto & exon : model.exons) {
228 model.length += exon.stop - exon.start;
229 }
230 std::getline(in, line);
231 int id = model.id;
232 if (transcripts[id].chr == -1) {
233 transcripts[id] = std::move(model);
234 }
235
236 } else if (type == "CHROMOSOME") {
237 int id = chr.size();
238 Chromosome c;
239 in >> c.name >> c.len;
240 chr.push_back(c);
241 chrNameToId.insert({c.name,id});
242 std::getline(in,line);
243 } else {
244 std::getline(in, line); // read until end of line
245 }
246 }
247
248 if (tr_extras > 0) {
249 std::cerr << "Warning: " << tr_extras << " transcript defined in GTF but not found in FASTA" << std::endl;
250 }
251
252 int tr_bad = 0;
253 for (int i = 0; i < index.num_trans; i++) {
254 const auto& tr = transcripts[i];
255 if (tr.chr == -1) {
256 tr_bad++;
257 }
258 }
259 if (tr_bad > 0) {
260 std::cerr << "Warning: there were " << tr_bad << " transcripts out of "
261 << index.num_trans << " that are missing in the GTF file " << std::endl;
262 }
263
264 }
265 */
266
267
addGTFLine(const std::string & line,const KmerIndex & index,bool guessChromosomes)268 int Transcriptome::addGTFLine(const std::string &line, const KmerIndex& index, bool guessChromosomes) {
269
270 if(line.empty() || line[0] == '#') {
271 return 0;
272 }
273 int p = 0, t=0;
274 // read chr
275 std::string schr;
276 t = line.find('\t',p);
277 schr = line.substr(p,t-p);
278 t = line.find('\t',t+1); // skip annotation source
279 p = t+1;
280 t = line.find('\t',p);
281 std::string typestr = line.substr(p,t-p);
282 enum Type {GENE, TRANSCRIPT, EXON, OTHER};
283 Type type;
284 if (typestr == "gene") {
285 type = Type::GENE;
286 // need id, name, chr, coding, strand, start, stop, transcripts
287 } else if (typestr == "transcript") {
288 type = Type::TRANSCRIPT;
289 //id, gene, chr, coding, strand, start, stop, exons: start,stop; (increasing for +, decreasing for -)
290 } else if (typestr == "exon") {
291 type = Type::EXON;
292 } else {
293 type = Type::OTHER;
294 return 0;
295 }
296 p = t+1;
297 t = line.find('\t',p);
298 int start = std::stoi(line.substr(p,t-p))-1;
299 p = t+1;
300 t = line.find('\t',p);
301 int stop = std::stoi(line.substr(p,t-p));
302 t = line.find('\t',t+1);
303 p = t+1; // skip score
304 t = line.find('\t',p);
305 char strand = line[p];
306 p = t+1;
307 t = line.find('\t',p); // phase, ignore
308 p = t+1;
309
310 GeneModel gmodel;
311 TranscriptModel tmodel;
312 ExonModel emodel;
313
314 int ichr = chrLookup(*this, schr);
315 if (ichr == -1) {
316 if (guessChromosomes) {
317 // just add a new chr on the fly
318 int i = chr.size();
319 Chromosome c;
320 c.name = schr;
321 c.len = 536870911; // maximum that bai can index :(
322 chr.push_back(c);
323 chrNameToId.insert({schr, i});
324 ichr = i;
325 } else {
326 return 1; // couldn't find chrom
327 }
328 }
329
330 if (type == Type::GENE) {
331 gmodel.id = genes.size(); // ?? ever used?
332 gmodel.chr = ichr;
333 gmodel.start = start;
334 gmodel.stop = stop;
335 gmodel.strand = (strand == '+') ? true : false;
336 } else if (type == Type::TRANSCRIPT) {
337 tmodel.chr = ichr;
338 tmodel.start = start;
339 tmodel.stop = stop;
340 tmodel.strand = (strand == '+') ? true : false;
341 tmodel.gene_id = -1;
342 } else if (type == Type::EXON) {
343 emodel.chr = ichr;
344 emodel.start = start;
345 emodel.stop = stop;
346 emodel.strand = (strand == '+') ? true : false;
347 }
348
349
350 // line[p:] contains the additional fields as 'key "value";'
351 // line[p:t] is key, line[t+2:]
352 int s = 0;
353 std::string key,value;
354 int keycount = 0;
355 std::string gversion, tversion, gene_name, transcript_name;
356 while (p != std::string::npos) {
357 if ((t = line.find('"',p))== std::string::npos) {
358 break;
359 }
360 if ((s = line.find('"',t+1)) == std::string::npos) {
361 break;
362 }
363 assert(line[t-1] == ' ');
364 key.assign(line.substr(p,t-p-1));
365 assert(s > t);
366 value.assign(line.substr(t+1,s-t-1));
367
368 // common values
369 if (key == "gene_id") {
370 keycount++;
371 gene_name = std::move(value);
372 } else if (key == "gene_version") {
373 keycount++;
374 gversion = std::move(value);
375 }
376
377
378 if (type == Type::GENE) {
379 if (key == "gene_name") {
380 keycount++;
381 gmodel.commonName = std::move(value);
382 } else if (key == "gene_id")
383
384 if (keycount == 3) {
385 break;
386 }
387 } else {
388 if (key == "transcript_id") {
389 keycount++;
390 transcript_name = std::move(value);
391 } else if (key == "transcript_version") {
392 keycount++;
393 tversion = std::move(value);
394 }
395
396 if (type == Type::TRANSCRIPT) {
397 if (keycount == 4) {
398 break;
399 }
400 } else if (type == Type::EXON) {
401 if (keycount == 4) {
402 break;
403 }
404 }
405 }
406
407 assert(s +1 < line.size());
408 assert(line[s+1] == ';');
409 if ((p = line.find(' ',s)) != std::string::npos) {
410 p++;
411 if (p >= line.size()) {
412 break;
413 }
414 }
415 }
416
417 if (type == Type::GENE) {
418 gmodel.name = gene_name;
419 assert(!gmodel.name.empty());
420 if (!gversion.empty() && gmodel.name.find('.') == std::string::npos) {
421 gmodel.name += "." + gversion;
422 }
423 // add to the transcriptome model
424 geneNameToId.insert({gmodel.name, gmodel.id});
425 genes.push_back(std::move(gmodel));
426 } else if (type == Type::TRANSCRIPT) {
427 tmodel.name = transcript_name;
428 assert(!tmodel.name.empty());
429
430 // canonical name includes version number
431 if (!tversion.empty() && tmodel.name.find('.') == std::string::npos) {
432 tmodel.name += "." + tversion;
433 }
434 auto it2 = trNameToId.find(tmodel.name);
435 if (it2 == trNameToId.end()) {
436 tmodel.name = transcript_name; // try without version number
437 it2 = trNameToId.find(tmodel.name);
438 }
439
440
441 if (it2 != trNameToId.end()) {
442 tmodel.id = it2->second;
443 tmodel.length = index.target_lens_[tmodel.id];
444 } else {
445 // transcript not found, do nothing
446 return 2;
447 }
448
449 std::string tmp_genename = gene_name; // copy
450 if (!gversion.empty() && gmodel.name.find('.') == std::string::npos) {
451 gene_name += "." + gversion;
452 }
453 auto it = geneNameToId.find(gene_name);
454 if (it == geneNameToId.end()) {
455 gene_name = tmp_genename;
456 it = geneNameToId.find(gene_name);
457 }
458
459 if (it != geneNameToId.end()) {
460 tmodel.gene_id = it->second;
461 }
462
463 int id = tmodel.id;
464 if (transcripts[id].chr == -1) {
465 transcripts[id] = std::move(tmodel);
466 }
467 } else if (type == Type::EXON) {
468 std::string tmp_transcript_name = transcript_name;
469 if (!tversion.empty() && transcript_name.find('.') == std::string::npos) {
470 transcript_name += "." + tversion;
471 }
472
473 auto it = trNameToId.find(transcript_name);
474 if (it == trNameToId.end()) {
475 transcript_name = tmp_transcript_name;
476 it = trNameToId.find(transcript_name);
477 }
478
479 if (it != trNameToId.end()) {
480 auto& tm = transcripts[it->second];
481 if (tm.chr != -1) {
482 tm.exons.push_back(std::move(emodel));
483 }
484 }
485 }
486
487
488
489 return 0;
490 }
491
parseGTF(const std::string & gtf_fn,const KmerIndex & index,const ProgramOptions & options,bool guessChromosomes)492 void Transcriptome::parseGTF(const std::string >f_fn, const KmerIndex& index, const ProgramOptions& options, bool guessChromosomes) {
493
494 for (int i = 0; i < index.num_trans; i++) {
495 TranscriptModel tr;
496 tr.id = i;
497 tr.chr = -1;
498 tr.gene_id = -1;
499 tr.strand = true;
500 transcripts.push_back(std::move(tr));
501 trNameToId.insert({index.target_names_[i], i});
502 }
503
504
505 int num_chrom_missing = 0, num_trans_missing = 0;
506 int buf_size = 1<<22; // line is < 4M
507 char* buf = new char[buf_size+1];
508 buf[buf_size] = 0;
509 int buf_end = 0;
510 int bufread = 0;
511 int pos = 0;
512 char *pos_end = buf;
513 gzFile file = gzopen(gtf_fn.c_str(), "r");
514 std::string line;
515 if (!file) {
516 std::cerr << "Error: could not open file " << gtf_fn << std::endl;
517 return;
518 }
519
520 bool done_reading = false;
521 while (!done_reading) {
522 // buf[0:pos] contains a previously unprocessed line
523 int to_read = (buf+buf_size) - pos_end;
524 bufread = gzread(file, pos_end, to_read);
525 buf_end = (pos_end - buf) + bufread;
526 if (bufread < to_read) {
527 if (gzeof(file)) {
528 done_reading = true;
529 }
530 }
531 while (true) {
532 pos_end = std::strchr(buf+pos, '\n');
533 if (pos_end == nullptr) {
534 pos_end = buf + buf_end+1;
535 if (!done_reading) {
536 break;
537 }
538 }
539
540 // line goes from buf[pos:pos_end]
541 line.assign(buf+pos, pos_end - (buf+pos));
542 int r = addGTFLine(line, index, guessChromosomes);
543 if (r == 1) {
544 num_chrom_missing++;
545 } else if (r==2) {
546 num_trans_missing++;
547 }
548
549 pos = pos_end - buf + 1;
550 if (pos >= buf_end) {
551 break;
552 }
553 assert(0 <= pos);
554 assert(pos < buf_size);
555 }
556
557 // buf[pos:bufend] contains now new line,
558 if (done_reading) {
559 break;
560 }
561 int leftover = buf_end - pos;
562 if (leftover > 0) {
563 std::memmove(buf, buf+pos,leftover);
564 }
565 size_t toset = (buf_size - leftover);
566 std::memset(buf+leftover, 0, toset);
567 pos_end = buf + leftover;
568 pos = 0;
569
570 }
571
572
573
574 delete[] buf;
575 gzclose(file);
576
577 if (num_chrom_missing > 0) {
578 std::cerr << "Warning: could not find chromosomes for " << num_chrom_missing << " transcripts" << std::endl;
579 }
580 if (num_trans_missing > 0) {
581 std::cerr << "Warning: " << num_trans_missing << " transcripts were defined in GTF file, but not in the index" << std::endl;
582 }
583 }
584