1 /*
2 * hits.cpp
3 * Cufflinks
4 *
5 * Created by Cole Trapnell on 3/23/09.
6 * Copyright 2009 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <cassert>
15 #include <cstdlib>
16 #include <cstring>
17 #include <iostream>
18 #include <set>
19 #include <vector>
20
21 #include "common.h"
22 #include "hits.h"
23 #include "tokenize.h"
24
25 #include "abundances.h"
26
27 using namespace std;
28
29 #if ENABLE_THREADS
30 boost::mutex RefSequenceTable::table_lock;
31 #endif
32
33 int num_deleted = 0;
34
trim(int trimmed_length)35 void ReadHit::trim(int trimmed_length)
36 {
37 bool antisense_aln = _sam_flag & 0x10;
38
39 vector<CigarOp> new_cigar;
40 int new_left = 0;
41
42 if (!antisense_aln)
43 {
44 int pos = _left;
45 new_left = _left;
46 int length = 0;
47 for (vector<CigarOp>::iterator i = _cigar.begin(); i < _cigar.end(); ++i)
48 {
49 const CigarOp& op = *i;
50
51 if (length < trimmed_length)
52 {
53 switch(op.opcode)
54 {
55 case REF_SKIP:
56 //gaps_out.push_back(make_pair(pos, pos + op.length - 1));
57 pos += op.length;
58 new_cigar.push_back(op);
59 break;
60 case SOFT_CLIP:
61 assert(false); // not sure if this case is right
62 pos += op.length;
63 length += op.length;
64 new_cigar.push_back(op);
65 break;
66 case HARD_CLIP:
67 new_cigar.push_back(op);
68 break;
69 case MATCH:
70 if (length + op.length < trimmed_length)
71 {
72 pos += op.length;
73 length += op.length;
74 new_cigar.push_back(op);
75 }
76 else
77 {
78 new_cigar.push_back(CigarOp(MATCH, trimmed_length - length));
79 pos += trimmed_length - length;
80 length += trimmed_length - length;
81 }
82 break;
83 case INS:
84 assert(false); // not sure if this case is right
85 pos -= op.length;
86 length -= op.length;
87 new_cigar.push_back(op);
88 break;
89 case DEL:
90 assert(false); // not sure if this case is right
91 pos += op.length;
92 length += op.length;
93 new_cigar.push_back(op);
94 break;
95 default:
96 break;
97 }
98 }
99 }
100 }
101 else
102 {
103 int pos = _right;
104 int length = 0;
105 for (vector<CigarOp>::reverse_iterator i = _cigar.rbegin(); i < _cigar.rend(); ++i)
106 {
107 const CigarOp& op = *i;
108
109 if (length < trimmed_length)
110 {
111 switch(op.opcode)
112 {
113 case REF_SKIP:
114 //gaps_out.push_back(make_pair(pos, pos + op.length - 1));
115 pos -= op.length;
116 new_cigar.push_back(op);
117 break;
118 case SOFT_CLIP:
119 assert(false); // not sure if this case is right
120 pos -= op.length;
121 length += op.length;
122 new_cigar.push_back(op);
123 break;
124 case HARD_CLIP:
125 new_cigar.push_back(op);
126 break;
127 case MATCH:
128 if (length + op.length < trimmed_length)
129 {
130 pos -= op.length;
131 length += op.length;
132 new_cigar.push_back(op);
133 }
134 else
135 {
136 new_cigar.push_back(CigarOp(MATCH, trimmed_length - length));
137 pos -= trimmed_length - length;
138 length += trimmed_length - length;
139 }
140 break;
141 case INS:
142 assert(false); // not sure if this case is right
143 pos += op.length;
144 length -= op.length;
145 new_cigar.push_back(op);
146 break;
147 case DEL:
148 assert(false); // not sure if this case is right
149 pos -= op.length;
150 length += op.length;
151 new_cigar.push_back(op);
152 break;
153 default:
154 break;
155 }
156 }
157 }
158 _left = pos;
159 }
160 _cigar = new_cigar;
161 _right = get_right();
162 assert (trimmed_length == read_len());
163 }
164
165 //static const int max_read_length = 1024;
166
hit_insert_id_lt(const ReadHit & h1,const ReadHit & h2)167 bool hit_insert_id_lt(const ReadHit& h1, const ReadHit& h2)
168 {
169 return h1.insert_id() < h2.insert_id();
170 }
171
hits_eq_mod_id(const ReadHit & lhs,const ReadHit & rhs)172 bool hits_eq_mod_id(const ReadHit& lhs, const ReadHit& rhs)
173 {
174 return (lhs.ref_id() == rhs.ref_id() &&
175 lhs.antisense_align() == rhs.antisense_align() &&
176 lhs.left() == rhs.left() &&
177 lhs.source_strand() == rhs.source_strand() &&
178 lhs.cigar() == rhs.cigar());
179 }
180
181 // Compares for structural equality, but won't declare multihits equal to one another
hits_eq_non_multi(const MateHit & lhs,const MateHit & rhs)182 bool hits_eq_non_multi(const MateHit& lhs, const MateHit& rhs)
183 {
184 if ((lhs.is_multi() || rhs.is_multi() ) && lhs.insert_id() != rhs.insert_id())
185 return false;
186 return hits_equals(lhs, rhs);
187 }
188
189 // Compares for structural equality, but won't declare multihits equal to one another
190 // and won't return true for hits from different read groups (e.g. replicate samples)
hits_eq_non_multi_non_replicate(const MateHit & lhs,const MateHit & rhs)191 bool hits_eq_non_multi_non_replicate(const MateHit& lhs, const MateHit& rhs)
192 {
193 if (((lhs.is_multi() || rhs.is_multi()) && lhs.insert_id() != rhs.insert_id()) || lhs.read_group_props() != rhs.read_group_props())
194 return false;
195 return hits_equals(lhs, rhs);
196 }
197
198 // Does NOT care about the read group this hit came from.
hits_equals(const MateHit & lhs,const MateHit & rhs)199 bool hits_equals(const MateHit& lhs, const MateHit& rhs)
200 {
201 if (lhs.ref_id() != rhs.ref_id())
202 return false;
203
204 if ((lhs.left_alignment() == NULL) != (rhs.left_alignment() == NULL))
205 return false;
206 if ((lhs.right_alignment() == NULL) != (rhs.right_alignment() == NULL))
207 return false;
208 if (lhs.left_alignment())
209 {
210 if (!(hits_eq_mod_id(*lhs.left_alignment(),*(rhs.left_alignment()))))
211 return false;
212 }
213 if (lhs.right_alignment())
214 {
215 if (!(hits_eq_mod_id(*lhs.right_alignment(),*(rhs.right_alignment()))))
216 return false;
217 }
218 return true;
219 }
220
has_no_collapse_mass(const MateHit & hit)221 bool has_no_collapse_mass(const MateHit& hit)
222 {
223 return hit.collapse_mass() == 0;
224 }
225
226 // Assumes hits are sorted by mate_hit_lt
227 // Does not collapse hits that are multi-reads
collapse_hits(const vector<MateHit> & hits,vector<MateHit> & non_redundant)228 void collapse_hits(const vector<MateHit>& hits,
229 vector<MateHit>& non_redundant)
230 {
231 copy(hits.begin(), hits.end(), back_inserter(non_redundant));
232 vector<MateHit>::iterator new_end = unique(non_redundant.begin(),
233 non_redundant.end(),
234 hits_eq_non_multi_non_replicate);
235 non_redundant.erase(new_end, non_redundant.end());
236 non_redundant.resize(non_redundant.size());
237
238 BOOST_FOREACH(MateHit& hit, non_redundant)
239 hit.collapse_mass(0);
240
241 size_t curr_aln = 0;
242 size_t curr_unique_aln = 0;
243 while (curr_aln < hits.size())
244 {
245 if (hits_eq_non_multi_non_replicate(non_redundant[curr_unique_aln], hits[curr_aln]) || hits_eq_non_multi_non_replicate(non_redundant[++curr_unique_aln], hits[curr_aln]))
246 {
247 double more_mass = hits[curr_aln].internal_scale_mass();
248 //assert(non_redundant[curr_unique_aln].collapse_mass() == 0 || !non_redundant[curr_unique_aln].is_multi());
249 non_redundant[curr_unique_aln].incr_collapse_mass(more_mass);
250 }
251 else
252 assert(false);
253
254 ++curr_aln;
255 }
256
257 //BOOST_FOREACH(MateHit& hit, non_redundant)
258 //assert(hit.collapse_mass() <= 1 || !hit.is_multi());
259
260 //non_redundant.erase(remove_if(non_redundant.begin(),non_redundant.end(),has_no_collapse_mass), non_redundant.end());
261
262 }
263
264 // Places multi-reads to the right of reads they match
mate_hit_lt(const MateHit & lhs,const MateHit & rhs)265 bool mate_hit_lt(const MateHit& lhs, const MateHit& rhs)
266 {
267 if (lhs.left() != rhs.left())
268 return lhs.left() < rhs.left();
269 if (lhs.right() != rhs.right())
270 return lhs.right() > rhs.right();
271
272 if ((lhs.left_alignment() == NULL) != (rhs.left_alignment() == NULL))
273 return (lhs.left_alignment() == NULL) < (rhs.left_alignment() == NULL);
274
275 if ((lhs.right_alignment() == NULL) != (rhs.right_alignment() == NULL))
276 return (lhs.right_alignment() == NULL) < (rhs.right_alignment() == NULL);
277
278 assert ((lhs.right_alignment() == NULL) == (rhs.right_alignment() == NULL));
279 assert ((lhs.left_alignment() == NULL) == (rhs.left_alignment() == NULL));
280
281 const ReadHit* lhs_l = lhs.left_alignment();
282 const ReadHit* lhs_r = lhs.right_alignment();
283
284 const ReadHit* rhs_l = rhs.left_alignment();
285 const ReadHit* rhs_r = rhs.right_alignment();
286
287 if (lhs_l && rhs_l)
288 {
289 if (lhs_l->cigar().size() != rhs_l->cigar().size())
290 return lhs_l->cigar().size() < rhs_l->cigar().size();
291 for (size_t i = 0; i < lhs_l->cigar().size(); ++i)
292 {
293 if (lhs_l->cigar()[i].opcode != rhs_l->cigar()[i].opcode)
294 return lhs_l->cigar()[i].opcode < rhs_l->cigar()[i].opcode;
295 if (lhs_l->cigar()[i].length != rhs_l->cigar()[i].length)
296 return lhs_l->cigar()[i].length < rhs_l->cigar()[i].length;
297 }
298 }
299
300 if (lhs_r && rhs_r)
301 {
302 if (lhs_r->cigar().size() != rhs_r->cigar().size())
303 return lhs_r->cigar().size() < rhs_r->cigar().size();
304 for (size_t i = 0; i < lhs_r->cigar().size(); ++i)
305 {
306 if (lhs_r->cigar()[i].opcode != rhs_r->cigar()[i].opcode)
307 return lhs_r->cigar()[i].opcode < rhs_r->cigar()[i].opcode;
308 if (lhs_r->cigar()[i].length != rhs_r->cigar()[i].length)
309 return lhs_r->cigar()[i].length < rhs_r->cigar()[i].length;
310 }
311 }
312
313 if (lhs.is_multi() != rhs.is_multi())
314 {
315 return rhs.is_multi();
316 }
317
318 if (lhs.read_group_props() != rhs.read_group_props())
319 {
320 return lhs.read_group_props() < rhs.read_group_props();
321 }
322
323 return false;
324 }
325
326
create_hit(const string & insert_name,const string & ref_name,int left,const vector<CigarOp> & cigar,CuffStrand source_strand,const string & partner_ref,int partner_pos,unsigned int edit_dist,int num_hits,float base_mass,uint32_t sam_flag)327 ReadHit HitFactory::create_hit(const string& insert_name,
328 const string& ref_name,
329 int left,
330 const vector<CigarOp>& cigar,
331 CuffStrand source_strand,
332 const string& partner_ref,
333 int partner_pos,
334 unsigned int edit_dist,
335 int num_hits,
336 float base_mass,
337 uint32_t sam_flag)
338 {
339 InsertID insert_id = _insert_table.get_id(insert_name);
340 RefID reference_id = _ref_table.get_id(ref_name, NULL);
341 RefID partner_ref_id = _ref_table.get_id(partner_ref, NULL);
342
343 return ReadHit(reference_id,
344 insert_id,
345 left,
346 cigar,
347 source_strand,
348 partner_ref_id,
349 partner_pos,
350 edit_dist,
351 num_hits,
352 base_mass,
353 sam_flag);
354 }
355
create_hit(const string & insert_name,const string & ref_name,uint32_t left,uint32_t read_len,CuffStrand source_strand,const string & partner_ref,int partner_pos,unsigned int edit_dist,int num_hits,float base_mass,uint32_t sam_flag)356 ReadHit HitFactory::create_hit(const string& insert_name,
357 const string& ref_name,
358 uint32_t left,
359 uint32_t read_len,
360 CuffStrand source_strand,
361 const string& partner_ref,
362 int partner_pos,
363 unsigned int edit_dist,
364 int num_hits,
365 float base_mass,
366 uint32_t sam_flag)
367 {
368 InsertID insert_id = _insert_table.get_id(insert_name);
369 RefID reference_id = _ref_table.get_id(ref_name, NULL);
370 RefID partner_ref_id = _ref_table.get_id(partner_ref, NULL);
371
372 return ReadHit(reference_id,
373 insert_id,
374 left,
375 read_len,
376 source_strand,
377 partner_ref_id,
378 partner_pos,
379 edit_dist,
380 num_hits,
381 base_mass,
382 sam_flag);
383 }
384
use_stranded_protocol(uint32_t sam_flag,MateStrandMapping msm)385 CuffStrand use_stranded_protocol(uint32_t sam_flag, MateStrandMapping msm)
386 {
387 bool antisense_aln = sam_flag & 0x10;
388 if (((sam_flag & BAM_FPAIRED) && (sam_flag & BAM_FREAD1)) || !(sam_flag & BAM_FPAIRED)) // first-in-pair or single-end
389 {
390 switch(msm)
391 {
392 case FF:
393 case FR:
394 return (antisense_aln) ? CUFF_REV : CUFF_FWD;
395 break;
396 case RF:
397 case RR:
398 return (antisense_aln) ? CUFF_FWD : CUFF_REV;
399 break;
400 }
401 }
402 else // second-in-pair read
403 {
404 switch (msm)
405 {
406 case FF:
407 case RF:
408 return (antisense_aln) ? CUFF_REV : CUFF_FWD;
409 break;
410 case FR:
411 case RR:
412 return (antisense_aln) ? CUFF_FWD : CUFF_REV;
413 break;
414 }
415 }
416 assert(false);
417 return CUFF_STRAND_UNKNOWN;
418 }
419
get_hit_from_bam1t(const bam1_t * hit_buf,const bam_hdr_t * header,ReadHit & bh)420 bool BAMHitFactory::get_hit_from_bam1t(const bam1_t* hit_buf,
421 const bam_hdr_t* header,
422 ReadHit& bh)
423 {
424 uint32_t sam_flag = hit_buf->core.flag;
425
426 int text_offset = hit_buf->core.pos;
427 int text_mate_pos = hit_buf->core.mpos;
428 int target_id = hit_buf->core.tid;
429 int mate_target_id = hit_buf->core.mtid;
430
431 vector<CigarOp> cigar;
432 bool spliced_alignment = false;
433 int num_hits = 1;
434
435 //header->target_name[c->tid]
436
437 if (sam_flag & 0x4 || target_id < 0)
438 {
439 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
440 bh = create_hit(bam_get_qname(hit_buf),
441 "*",
442 0, // SAM files are 1-indexed
443 0,
444 CUFF_STRAND_UNKNOWN,
445 "*",
446 0,
447 0,
448 1,
449 1.0,
450 sam_flag);
451 return true;
452 }
453 if (target_id >= header->n_targets)
454 {
455 fprintf (stderr, "BAM error: file contains hits to sequences not in header SQ records (%s)\n", bam_get_qname(hit_buf));
456 return false;
457 }
458
459 string text_name = header->target_name[target_id];
460
461 for (int i = 0; i < hit_buf->core.n_cigar; ++i)
462 {
463 //char* t;
464
465 int length = bam_get_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
466 if (length <= 0)
467 {
468 fprintf (stderr, "BAM error: CIGAR op has zero length (%s)\n", bam_get_qname(hit_buf));
469 return false;
470 }
471
472 CigarOpCode opcode;
473 switch(bam_get_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
474 {
475 case BAM_CMATCH: opcode = MATCH; break;
476 case BAM_CINS: opcode = INS; break;
477 case BAM_CDEL: opcode = DEL; break;
478 case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
479 case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
480 case BAM_CPAD: opcode = PAD; break;
481 case BAM_CREF_SKIP:
482 opcode = REF_SKIP;
483 spliced_alignment = true;
484 if (length > (int)max_intron_length)
485 {
486 //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
487 return false;
488 }
489 break;
490 default:
491 //fprintf (stderr, "SAM error on line %d: invalid CIGAR operation\n", _line_num);
492 return false;
493 }
494 if (opcode != HARD_CLIP)
495 cigar.push_back(CigarOp(opcode, length));
496 }
497
498 string mrnm;
499 if (mate_target_id >= 0)
500 {
501 if (mate_target_id == target_id)
502 {
503 mrnm = header->target_name[mate_target_id];
504 // if (abs((int)text_mate_pos - (int)text_offset) > (int)max_intron_length)
505 // {
506 // //fprintf (stderr, "Mates are too distant, skipping\n");
507 // return false;
508 // }
509 }
510 else
511 {
512 //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
513 return false;
514 }
515 }
516 else
517 {
518 text_mate_pos = 0;
519 }
520
521 CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
522 unsigned char num_mismatches = 0;
523
524 uint8_t* ptr = bam_aux_get(hit_buf, "XS");
525 if (ptr)
526 {
527 char src_strand_char = bam_aux2A(ptr);
528 if (src_strand_char == '-')
529 source_strand = CUFF_REV;
530 else if (src_strand_char == '+')
531 source_strand = CUFF_FWD;
532 }
533
534 ptr = bam_aux_get(hit_buf, "NM");
535 if (ptr)
536 {
537 num_mismatches = bam_aux2i(ptr);
538 }
539
540 ptr = bam_aux_get(hit_buf, "NH");
541 if (ptr)
542 {
543 num_hits = bam_aux2i(ptr);
544 }
545
546 double mass = 1.0;
547 ptr = bam_aux_get(hit_buf, "ZF");
548 if (ptr)
549 {
550 mass = bam_aux2i(ptr);
551 if (mass <= 0.0)
552 mass = 1.0;
553 }
554
555 if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
556 source_strand = use_stranded_protocol(sam_flag, _rg_props.mate_strand_mapping());
557
558 if (!spliced_alignment)
559 {
560 //assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN);
561
562 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
563 bh = create_hit(bam_get_qname(hit_buf),
564 text_name,
565 text_offset, // BAM files are 0-indexed
566 cigar,
567 source_strand,
568 mrnm,
569 text_mate_pos,
570 num_mismatches,
571 num_hits,
572 mass,
573 sam_flag);
574 return true;
575
576 }
577 else
578 {
579 if (source_strand == CUFF_STRAND_UNKNOWN)
580 {
581 fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n");
582 }
583
584 bh = create_hit(bam_get_qname(hit_buf),
585 text_name,
586 text_offset, // BAM files are 0-indexed
587 cigar,
588 source_strand,
589 mrnm,
590 text_mate_pos,
591 num_mismatches,
592 num_hits,
593 mass,
594 sam_flag);
595 return true;
596 }
597
598
599 return true;
600 }
601
602
603
str_to_platform(const string pl_str)604 Platform str_to_platform(const string pl_str)
605 {
606 if (pl_str == "SOLiD")
607 {
608 return SOLID;
609 }
610 else if (pl_str == "Illumina")
611 {
612 return ILLUMINA;
613 }
614 else
615 {
616 return UNKNOWN_PLATFORM;
617 }
618 }
619
620 // Parses the header to determine platform and other properties
parse_header_string(const string & header_rec,ReadGroupProperties & rg_props)621 bool HitFactory::parse_header_string(const string& header_rec,
622 ReadGroupProperties& rg_props)
623 {
624 vector<string> columns;
625 tokenize(header_rec, "\t", columns);
626
627 if (columns[0] == "@RG")
628 {
629 for (size_t i = 1; i < columns.size(); ++i)
630 {
631 vector<string> fields;
632 tokenize(columns[i], ":", fields);
633 if (fields[0] == "PL")
634 {
635 if (rg_props.platform() == UNKNOWN_PLATFORM)
636 {
637 Platform p = str_to_platform(fields[1]);
638 rg_props.platform(p);
639 }
640 else
641 {
642 Platform p = str_to_platform(fields[1]);
643 if (p != rg_props.platform())
644 {
645 fprintf(stderr, "Error: Processing reads from different platforms is not currently supported\n");
646 return false;
647 }
648 }
649
650 }
651 }
652 }
653 else if (columns[0] == "@SQ")
654 {
655 _num_seq_header_recs++;
656 for (size_t i = 1; i < columns.size(); ++i)
657 {
658 vector<string> fields;
659 tokenize(columns[i], ":", fields);
660 if (fields[0] == "SN")
661 {
662 // Populate the RefSequenceTable with the sequence dictionary,
663 // to ensure that (for example) downstream GTF files are sorted
664 // in an order consistent with the header, and to enforce that
665 // BAM records appear in the order implied by the header
666 RefID _id = _ref_table.get_id(fields[1], NULL);
667
668 const RefSequenceTable::SequenceInfo* info = _ref_table.get_info(_id);
669
670 if (info->observation_order != _num_seq_header_recs)
671 {
672 if (info->name != fields[1])
673 {
674 fprintf(stderr, "Error: Hash collision between references '%s' and '%s'.\n", info->name, fields[1].c_str());
675 }
676 else
677 {
678 fprintf(stderr, "Error: sort order of reads in BAMs must be the same\n");
679 }
680 exit(1);
681 }
682 }
683 }
684 }
685
686 return true;
687 }
688
689 // Parses multiple header lines to determine platform and other properties
parse_header_lines(const string & h_text,ReadGroupProperties & _rg_props)690 void HitFactory::parse_header_lines(const string& h_text,
691 ReadGroupProperties& _rg_props)
692 {
693 size_t offset = 0;
694 while(offset < h_text.size())
695 {
696 size_t next_newline = h_text.find('\n', offset);
697 size_t line_length = next_newline == string::npos ? string::npos : next_newline - offset;
698 std::string this_line = h_text.substr(offset, line_length);
699 parse_header_string(this_line, _rg_props);
700
701 if(next_newline == string::npos)
702 break;
703 offset = next_newline + 1;
704 }
705 }
706
finalize_rg_props()707 void HitFactory::finalize_rg_props()
708 {
709 if (_rg_props.platform() == SOLID)
710 {
711 _rg_props.strandedness(STRANDED_PROTOCOL);
712 _rg_props.std_mate_orientation(MATES_POINT_TOWARD);
713 }
714 else
715 {
716 // Default to Illumina's unstranded protocol params for strandedness and
717 // mate orientation
718 _rg_props.strandedness(UNSTRANDED_PROTOCOL);
719 _rg_props.std_mate_orientation(MATES_POINT_TOWARD);
720 }
721 }
722
723 static const unsigned MAX_HEADER_LEN = 64 * 1024 * 1024; // 4 MB
724
read_next_hit(ReadHit & bh)725 bool SAMHitFactory::read_next_hit(ReadHit& bh)
726 {
727 bool new_rec = fgets(_hit_buf, _hit_buf_max_sz - 1, _hit_file);
728 if (!new_rec)
729 return false;
730 ++_line_num;
731 char* nl = strrchr(_hit_buf, '\n');
732 if (nl) *nl = 0;
733
734 const char* buf = _hit_buf;
735
736 // Are we still in the header region?
737 if (buf[0] == '@')
738 return false;
739
740 const char* _name = strsep((char**)&buf,"\t");
741 if (!_name)
742 return false;
743 char name[2048];
744 strncpy(name, _name, 2047);
745
746 const char* sam_flag_str = strsep((char**)&buf,"\t");
747 if (!sam_flag_str)
748 return false;
749
750 const char* text_name = strsep((char**)&buf,"\t");
751 if (!text_name)
752 return false;
753
754 const char* text_offset_str = strsep((char**)&buf,"\t");
755 if (!text_offset_str)
756 return false;
757
758 const char* map_qual_str = strsep((char**)&buf,"\t");
759 if (!map_qual_str)
760 return false;
761
762 const char* cigar_str = strsep((char**)&buf,"\t");
763 if (!cigar_str)
764 return false;
765
766 const char* mate_ref_name = strsep((char**)&buf,"\t");
767 if (!mate_ref_name)
768 return false;
769
770 const char* mate_pos_str = strsep((char**)&buf,"\t");
771 if (!mate_pos_str)
772 return false;
773
774 const char* inferred_insert_sz_str = strsep((char**)&buf,"\t");
775 if (!inferred_insert_sz_str)
776 return false;
777
778 const char* seq_str = strsep((char**)&buf,"\t");
779 if (!seq_str)
780 return false;
781
782 const char* qual_str = strsep((char**)&buf,"\t");
783 if (!qual_str)
784 return false;
785
786
787 int sam_flag = atoi(sam_flag_str);
788 int text_offset = atoi(text_offset_str);
789 int text_mate_pos = atoi(mate_pos_str);
790
791 const char* p_cig = cigar_str;
792 //int len = strlen(sequence);
793 vector<CigarOp> cigar;
794 bool spliced_alignment = false;
795 int num_hits = 1;
796
797 if ((sam_flag & 0x4) ||!strcmp(text_name, "*"))
798 {
799 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
800 bh = create_hit(name,
801 "*",
802 0, // SAM files are 1-indexed
803 0,
804 CUFF_STRAND_UNKNOWN,
805 "*",
806 0,
807 0,
808 1,
809 1.0,
810 sam_flag);
811 return true;
812 }
813 // Mostly pilfered direct from the SAM tools:
814 while (*p_cig)
815 {
816 char* t;
817 int length = (int)strtol(p_cig, &t, 10);
818 if (length <= 0)
819 {
820 fprintf (stderr, "SAM error on line %d: CIGAR op has zero length\n", _line_num);
821 fprintf (stderr,"%s\n", _hit_buf);
822 return false;
823 }
824 char op_char = toupper(*t);
825 CigarOpCode opcode;
826 if (op_char == 'M')
827 {
828 /*if (length > max_read_length)
829 {
830 fprintf(stderr, "SAM error on line %d: %s: MATCH op has length > %d\n", line_num, name, max_read_length);
831 return false;
832 }*/
833 opcode = MATCH;
834 }
835 else if (op_char == 'I') opcode = INS;
836 else if (op_char == 'D')
837 {
838 opcode = DEL;
839 }
840 else if (op_char == 'N')
841 {
842 opcode = REF_SKIP;
843 spliced_alignment = true;
844 if (length > (int)max_intron_length)
845 {
846 //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
847 return false;
848 }
849 }
850 else if (op_char == 'S') opcode = SOFT_CLIP;
851 else if (op_char == 'H') opcode = HARD_CLIP;
852 else if (op_char == 'P') opcode = PAD;
853 else
854 {
855 fprintf (stderr, "SAM error on line %d: invalid CIGAR operation\n", _line_num);
856 return false;
857 }
858 p_cig = t + 1;
859 //i += length;
860 if (opcode != HARD_CLIP)
861 cigar.push_back(CigarOp(opcode, length));
862 }
863 if (*p_cig)
864 {
865 fprintf (stderr, "SAM error on line %d: unmatched CIGAR operation\n", _line_num);
866 return false;
867 }
868
869 string mrnm;
870 if (strcmp(mate_ref_name, "*"))
871 {
872 if (!strcmp(mate_ref_name, "=") || !strcmp(mate_ref_name, text_name))
873 {
874 mrnm = text_name;
875 // if (abs((int)text_mate_pos - (int)text_offset) > (int)max_intron_length)
876 // {
877 // //fprintf (stderr, "Mates are too distant, skipping\n");
878 // return false;
879 // }
880 }
881 else
882 {
883 //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
884 return false;
885 }
886 }
887 else
888 {
889 text_mate_pos = 0;
890 }
891
892 CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
893 unsigned char num_mismatches = 0;
894
895 const char* tag_buf = buf;
896
897 double mass = 1.0;
898
899 while((tag_buf = strsep((char**)&buf,"\t")))
900 {
901
902 char* first_colon = (char*)strchr(tag_buf, ':');
903 if (first_colon)
904 {
905 *first_colon = 0;
906 ++first_colon;
907 char* second_colon = strchr(first_colon, ':');
908 if (second_colon)
909 {
910 *second_colon = 0;
911 ++second_colon;
912 const char* first_token = tag_buf;
913 //const char* second_token = first_colon;
914 const char* third_token = second_colon;
915 if (!strcmp(first_token, "XS"))
916 {
917 if (*third_token == '-')
918 source_strand = CUFF_REV;
919 else if (*third_token == '+')
920 source_strand = CUFF_FWD;
921 }
922 else if (!strcmp(first_token, "NM"))
923 {
924 num_mismatches = atoi(third_token);
925 }
926 else if (!strcmp(first_token, "NH"))
927 {
928 num_hits = atoi(third_token);
929 }
930 else if (!strcmp(first_token, "ZF"))
931 {
932 mass = atof(third_token);
933 if (mass <= 0.0)
934 mass = 1.0;
935 }
936 else
937 {
938
939 }
940 }
941 }
942 }
943
944 // Don't let the protocol setting override explicit XS tags
945 if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
946 source_strand = use_stranded_protocol(sam_flag, _rg_props.mate_strand_mapping());
947
948 if (!spliced_alignment)
949 {
950 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
951 bh = create_hit(name,
952 text_name,
953 text_offset - 1,
954 cigar,
955 source_strand,
956 mrnm,
957 text_mate_pos - 1,
958 num_mismatches,
959 num_hits,
960 mass,
961 sam_flag);
962 return true;
963
964 }
965 else
966 {
967 if (source_strand == CUFF_STRAND_UNKNOWN)
968 {
969 fprintf(stderr, "SAM error on line %d: found spliced alignment without XS attribute\n", _line_num);
970 }
971
972 bh = create_hit(name,
973 text_name,
974 text_offset - 1,
975 cigar,
976 source_strand,
977 mrnm,
978 text_mate_pos - 1,
979 num_mismatches,
980 num_hits,
981 mass,
982 sam_flag);
983 return true;
984 }
985 return false;
986 }
987
inspect_header()988 bool SAMHitFactory::inspect_header()
989 {
990 char pBuf[10 * 1024];
991
992 off_t curr_pos = ftello(_hit_file);
993 rewind(_hit_file);
994
995 while (fgets(pBuf, 10*1024, _hit_file))
996 {
997 if (pBuf[0] != '@')
998 {
999 break; // done with the header.
1000 }
1001 char* nl = strchr(pBuf, '\n');
1002 if (nl)
1003 {
1004 *nl = 0;
1005 parse_header_string(pBuf, _rg_props);
1006 }
1007 }
1008
1009 fseek(_hit_file, curr_pos, SEEK_SET);
1010
1011 finalize_rg_props();
1012 return true;
1013 }
1014
1015 //////////////////////////////////////////
1016
load_count_tables(const string & expression_file_name)1017 void PrecomputedExpressionHitFactory::load_count_tables(const string& expression_file_name)
1018 {
1019 //map<int, AbundanceGroup > ab_groups;
1020
1021
1022 std::ifstream ifs(expression_file_name.c_str());
1023 boost::archive::binary_iarchive ia(ifs);
1024
1025 //map<string, AbundanceGroup> single_sample_tracking;
1026
1027 size_t num_loci = 0;
1028 ia >> num_loci;
1029
1030 if (num_loci > 0)
1031 {
1032 pair<int, AbundanceGroup> first_locus;
1033 ia >> first_locus;
1034 boost::shared_ptr<AbundanceGroup> ab = boost::shared_ptr<AbundanceGroup>(new AbundanceGroup(first_locus.second));
1035
1036 // populate the cached count tables so we can make convincing fake bundles later on.
1037 ReadGroupProperties rg_props = **(ab->rg_props().begin());
1038
1039 int i = 0;
1040 BOOST_FOREACH(const LocusCount& c, rg_props.raw_compatible_counts())
1041 {
1042 compat_mass[i++] = c.count;
1043 //compat_mass[c.locus_desc] = c.count;
1044 }
1045
1046 i = 0;
1047 BOOST_FOREACH(const LocusCount& c, rg_props.raw_total_counts())
1048 {
1049 total_mass[i++] = c.count;
1050 //total_mass[c.locus_desc] = c.count;
1051 }
1052 }
1053 }
1054
load_checked_parameters(const string & expression_file_name)1055 void PrecomputedExpressionHitFactory::load_checked_parameters(const string& expression_file_name)
1056 {
1057 std::ifstream ifs(expression_file_name.c_str());
1058 boost::archive::binary_iarchive ia(ifs);
1059
1060 //map<string, AbundanceGroup> single_sample_tracking;
1061
1062 size_t num_loci = 0;
1063 ia >> num_loci;
1064
1065 if (num_loci > 0)
1066 {
1067 pair<int, AbundanceGroup> first_locus;
1068 ia >> first_locus;
1069 boost::shared_ptr<AbundanceGroup> ab = boost::shared_ptr<AbundanceGroup>(new AbundanceGroup(first_locus.second));
1070
1071 // populate the cached count tables so we can make convincing fake bundles later on.
1072 ReadGroupProperties rg_props = **(ab->rg_props().begin());
1073 _rg_props.checked_parameters(rg_props.checked_parameters());
1074 }
1075 }
1076
read_next_hit(ReadHit & buf)1077 bool PrecomputedExpressionHitFactory::read_next_hit(ReadHit& buf)
1078 {
1079 return false;
1080 }
1081
inspect_header()1082 bool PrecomputedExpressionHitFactory::inspect_header()
1083 {
1084
1085 std::ifstream ifs(_expression_file_name.c_str());
1086 boost::archive::binary_iarchive ia(ifs);
1087
1088 RefSequenceTable& rt = ref_table();
1089
1090 size_t num_loci = 0;
1091 ia >> num_loci;
1092
1093 for (size_t i = 0; i < num_loci; ++i)
1094 {
1095 pair<int, AbundanceGroup> locus;
1096
1097 ia >> locus;
1098 boost::shared_ptr<AbundanceGroup> ab = boost::shared_ptr<AbundanceGroup>(new AbundanceGroup(locus.second));
1099
1100 const string locus_tag = ab->locus_tag();
1101
1102 string::size_type idx = locus_tag.find(':');
1103 if (idx != string::npos)
1104 {
1105 string chrom_name = locus_tag.substr(0, idx);
1106 rt.get_id(chrom_name.c_str(), NULL); // make sure the chromosome names are added to the RefSequenceTable in the order that they occur in the expression files.
1107 }
1108 }
1109
1110 return true;
1111 }
1112
get_abundance_for_locus(int locus_id)1113 boost::shared_ptr<const AbundanceGroup> PrecomputedExpressionHitFactory::get_abundance_for_locus(int locus_id)
1114 {
1115 #if ENABLE_THREADS
1116 boost::mutex::scoped_lock lock(_factory_lock);
1117 #endif
1118 map<int, boost::shared_ptr<const AbundanceGroup> >::const_iterator itr = _curr_ab_groups.find(locus_id);
1119 if (itr != _curr_ab_groups.end())
1120 return itr->second;
1121 else
1122 return boost::shared_ptr<const AbundanceGroup>();
1123 }
1124
clear_abundance_for_locus(int locus_id)1125 void PrecomputedExpressionHitFactory::clear_abundance_for_locus(int locus_id)
1126 {
1127 #if ENABLE_THREADS
1128 boost::mutex::scoped_lock lock(_factory_lock);
1129 #endif
1130
1131 map<int, boost::shared_ptr<const AbundanceGroup> >::iterator itr = _curr_ab_groups.find(locus_id);
1132
1133 if (itr != _curr_ab_groups.end())
1134 _curr_ab_groups.erase(itr);
1135 }
1136
next_locus(int locus_id,bool cache_locus)1137 boost::shared_ptr<const AbundanceGroup> PrecomputedExpressionHitFactory::next_locus(int locus_id, bool cache_locus)
1138 {
1139 #if ENABLE_THREADS
1140 boost::mutex::scoped_lock lock(_factory_lock);
1141 #endif
1142 // if (locus_id == 7130)
1143 // {
1144 // fprintf(stderr, "Trying to get a chr13_random\n");
1145 // }
1146
1147 if (_last_locus_id >= locus_id)
1148 return boost::shared_ptr<const AbundanceGroup>(); // we already processed this one
1149
1150 boost::shared_ptr<const AbundanceGroup> sought_group;
1151
1152 map<int, boost::shared_ptr<const AbundanceGroup> >::iterator itr = _curr_ab_groups.find(locus_id);
1153
1154 if (itr != _curr_ab_groups.end())
1155 return itr->second;
1156
1157 for (;_curr_locus_idx < _num_loci; ++_curr_locus_idx)
1158 {
1159 pair<int, AbundanceGroup> p;
1160 *_ia >> p;
1161 _last_locus_id = p.first;
1162 boost::shared_ptr<AbundanceGroup> ab = boost::shared_ptr<AbundanceGroup>(new AbundanceGroup(p.second));
1163 if (_last_locus_id == locus_id)
1164 {
1165 sought_group = ab;
1166 break;
1167 }
1168 else // we don't want to lose this one...
1169 {
1170 if (cache_locus)
1171 _curr_ab_groups[_last_locus_id] = ab;
1172 }
1173 }
1174 if (cache_locus)
1175 _curr_ab_groups[locus_id] = sought_group;
1176
1177 return sought_group;
1178 }
1179
1180 #ifdef HAVE_HTSLIB
1181
read_next_hit(ReadHit & bh)1182 bool HTSHitFactory::read_next_hit(ReadHit& bh)
1183 {
1184 if(!records_remain())
1185 return false;
1186
1187 int read_ret = sam_read1(_hit_file, _file_header, _next_hit);
1188 if(read_ret < 0)
1189 {
1190 _eof_encountered = true;
1191 return false;
1192 }
1193
1194 return get_hit_from_bam1t(_next_hit, _file_header, bh);
1195 }
1196
inspect_header()1197 bool HTSHitFactory::inspect_header()
1198 {
1199 if (_file_header->text == NULL || _file_header->l_text == 0)
1200 {
1201 throw std::runtime_error("Warning: SAM/BAM/CRAM header has 0 length or is corrupted. Try using 'samtools reheader'");
1202 }
1203
1204 parse_header_lines(_file_header->text, _rg_props);
1205
1206 finalize_rg_props();
1207 return true;
1208 }
1209
createSamHitFactory(const string & hit_file_name,ReadTable & it,RefSequenceTable & rt)1210 HitFactory* createSamHitFactory(const string& hit_file_name, ReadTable& it, RefSequenceTable& rt)
1211 {
1212 try {
1213 return new HTSHitFactory(hit_file_name, it, rt);
1214 }
1215 catch (std::runtime_error& e)
1216 {
1217 fprintf(stderr, "Error: cannot open alignment file %s for reading\n",
1218 hit_file_name.c_str());
1219 fprintf(stderr, "Cause: %s\n", e.what());
1220 exit(1);
1221 }
1222 }
1223
1224 #else // ndef HAVE_HTSLIB
1225
inspect_header()1226 bool SamtoolsHitFactory::inspect_header()
1227 {
1228 bam_header_t* header = _hit_file->header;
1229
1230 if (header == NULL)
1231 {
1232 fprintf(stderr, "Warning: No BAM header\n");
1233 return false;
1234 }
1235
1236 // if (header->l_text >= MAX_HEADER_LEN)
1237 // {
1238 // fprintf(stderr, "Warning: BAM header too large\n");
1239 // return false;
1240 // }
1241
1242 if (header->l_text == 0 || header->text == NULL)
1243 {
1244 fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1245 return false;
1246 }
1247
1248 parse_header_lines(header->text, _rg_props);
1249
1250 finalize_rg_props();
1251 return true;
1252 }
1253
1254 // populate a bam_t This will
read_next_hit(ReadHit & bh)1255 bool SamtoolsHitFactory::read_next_hit(ReadHit& bh)
1256 {
1257 if (_next_hit.data)
1258 {
1259 free(_next_hit.data);
1260 _next_hit.data = NULL;
1261 }
1262
1263 if (records_remain() == false)
1264 return false;
1265
1266 memset(&_next_hit, 0, sizeof(_next_hit));
1267
1268 int bytes_read = samread(_hit_file, &_next_hit);
1269 if (bytes_read < 0)
1270 {
1271 _eof_encountered = true;
1272 return false;
1273 }
1274
1275 return get_hit_from_bam1t(&_next_hit, _hit_file->header, bh);
1276 }
1277
createSamHitFactory(const string & hit_file_name,ReadTable & it,RefSequenceTable & rt)1278 HitFactory* createSamHitFactory(const string& hit_file_name, ReadTable& it, RefSequenceTable& rt)
1279 {
1280 try
1281 {
1282 return new SamtoolsHitFactory(hit_file_name, it, rt);
1283 }
1284 catch (std::runtime_error& e)
1285 {
1286 fprintf(stderr, "File %s doesn't appear to be a valid BAM file, trying SAM...\n",
1287 hit_file_name.c_str());
1288 try
1289 {
1290 return new SAMHitFactory(hit_file_name, it, rt);
1291 }
1292 catch (std::runtime_error& e)
1293 {
1294 fprintf(stderr, "Error: cannot open alignment file %s for reading\n",
1295 hit_file_name.c_str());
1296 exit(1);
1297 }
1298 }
1299 }
1300
1301 #endif
1302