1 /*
2  *  hits.cpp
3  *  Cufflinks
4  *
5  *  Created by Cole Trapnell on 3/23/09.
6  *  Copyright 2009 Cole Trapnell. All rights reserved.
7  *
8  */
9 
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13 
14 #include <cassert>
15 #include <cstdlib>
16 #include <cstring>
17 #include <iostream>
18 #include <set>
19 #include <vector>
20 
21 #include "common.h"
22 #include "hits.h"
23 #include "tokenize.h"
24 
25 #include "abundances.h"
26 
27 using namespace std;
28 
29 #if ENABLE_THREADS
30 boost::mutex RefSequenceTable::table_lock;
31 #endif
32 
33 int num_deleted = 0;
34 
trim(int trimmed_length)35 void ReadHit::trim(int trimmed_length)
36 {
37     bool antisense_aln = _sam_flag & 0x10;
38 
39     vector<CigarOp> new_cigar;
40     int new_left = 0;
41 
42     if (!antisense_aln)
43     {
44         int pos = _left;
45         new_left = _left;
46         int length = 0;
47 		for (vector<CigarOp>::iterator i = _cigar.begin(); i < _cigar.end(); ++i)
48 		{
49 			const CigarOp& op = *i;
50 
51             if (length < trimmed_length)
52             {
53                 switch(op.opcode)
54                 {
55                     case REF_SKIP:
56                         //gaps_out.push_back(make_pair(pos, pos + op.length - 1));
57                         pos += op.length;
58                         new_cigar.push_back(op);
59                         break;
60                     case SOFT_CLIP:
61                         assert(false); // not sure if this case is right
62                         pos += op.length;
63                         length += op.length;
64                         new_cigar.push_back(op);
65                         break;
66                     case HARD_CLIP:
67                         new_cigar.push_back(op);
68                         break;
69                     case MATCH:
70                         if (length + op.length < trimmed_length)
71                         {
72                             pos += op.length;
73                             length += op.length;
74                             new_cigar.push_back(op);
75                         }
76                         else
77                         {
78                             new_cigar.push_back(CigarOp(MATCH, trimmed_length - length));
79                             pos += trimmed_length - length;
80                             length += trimmed_length - length;
81                         }
82                         break;
83                     case INS:
84                         assert(false); // not sure if this case is right
85                         pos -= op.length;
86                         length -= op.length;
87                         new_cigar.push_back(op);
88                         break;
89                     case DEL:
90                         assert(false); // not sure if this case is right
91                         pos += op.length;
92                         length += op.length;
93                         new_cigar.push_back(op);
94                         break;
95                     default:
96                         break;
97                 }
98             }
99 		}
100     }
101     else
102     {
103         int pos = _right;
104         int length = 0;
105 		for (vector<CigarOp>::reverse_iterator i = _cigar.rbegin(); i < _cigar.rend(); ++i)
106 		{
107 			const CigarOp& op = *i;
108 
109             if (length < trimmed_length)
110             {
111                 switch(op.opcode)
112                 {
113                     case REF_SKIP:
114                         //gaps_out.push_back(make_pair(pos, pos + op.length - 1));
115                         pos -= op.length;
116                         new_cigar.push_back(op);
117                         break;
118                     case SOFT_CLIP:
119                         assert(false); // not sure if this case is right
120                         pos -= op.length;
121                         length += op.length;
122                         new_cigar.push_back(op);
123                         break;
124                     case HARD_CLIP:
125                         new_cigar.push_back(op);
126                         break;
127                     case MATCH:
128                         if (length + op.length < trimmed_length)
129                         {
130                             pos -= op.length;
131                             length += op.length;
132                             new_cigar.push_back(op);
133                         }
134                         else
135                         {
136                             new_cigar.push_back(CigarOp(MATCH, trimmed_length - length));
137                             pos -= trimmed_length - length;
138                             length += trimmed_length - length;
139                         }
140                         break;
141                     case INS:
142                         assert(false); // not sure if this case is right
143                         pos += op.length;
144                         length -= op.length;
145                         new_cigar.push_back(op);
146                         break;
147                     case DEL:
148                         assert(false); // not sure if this case is right
149                         pos -= op.length;
150                         length += op.length;
151                         new_cigar.push_back(op);
152                         break;
153                     default:
154                         break;
155                 }
156             }
157 		}
158         _left = pos;
159     }
160     _cigar = new_cigar;
161     _right = get_right();
162     assert (trimmed_length == read_len());
163 }
164 
165 //static const int max_read_length = 1024;
166 
hit_insert_id_lt(const ReadHit & h1,const ReadHit & h2)167 bool hit_insert_id_lt(const ReadHit& h1, const ReadHit& h2)
168 {
169 	return h1.insert_id() < h2.insert_id();
170 }
171 
hits_eq_mod_id(const ReadHit & lhs,const ReadHit & rhs)172 bool hits_eq_mod_id(const ReadHit& lhs, const ReadHit& rhs)
173 {
174 	return (lhs.ref_id() == rhs.ref_id() &&
175 			lhs.antisense_align() == rhs.antisense_align() &&
176 			lhs.left() == rhs.left() &&
177 			lhs.source_strand() == rhs.source_strand() &&
178 			lhs.cigar() == rhs.cigar());
179 }
180 
181 // Compares for structural equality, but won't declare multihits equal to one another
hits_eq_non_multi(const MateHit & lhs,const MateHit & rhs)182 bool hits_eq_non_multi(const MateHit& lhs, const MateHit& rhs)
183 {
184 	if ((lhs.is_multi() || rhs.is_multi() ) && lhs.insert_id() != rhs.insert_id())
185 		return false;
186 	return hits_equals(lhs, rhs);
187 }
188 
189 // Compares for structural equality, but won't declare multihits equal to one another
190 // and won't return true for hits from different read groups (e.g. replicate samples)
hits_eq_non_multi_non_replicate(const MateHit & lhs,const MateHit & rhs)191 bool hits_eq_non_multi_non_replicate(const MateHit& lhs, const MateHit& rhs)
192 {
193 	if (((lhs.is_multi() || rhs.is_multi()) && lhs.insert_id() != rhs.insert_id()) || lhs.read_group_props() != rhs.read_group_props())
194 		return false;
195 	return hits_equals(lhs, rhs);
196 }
197 
198 // Does NOT care about the read group this hit came from.
hits_equals(const MateHit & lhs,const MateHit & rhs)199 bool hits_equals(const MateHit& lhs, const MateHit& rhs)
200 {
201 	if (lhs.ref_id() != rhs.ref_id())
202 		return false;
203 
204 	if ((lhs.left_alignment() == NULL) != (rhs.left_alignment() == NULL))
205 		return false;
206 	if ((lhs.right_alignment() == NULL) != (rhs.right_alignment() == NULL))
207 		return false;
208 	if (lhs.left_alignment())
209 	{
210 		if (!(hits_eq_mod_id(*lhs.left_alignment(),*(rhs.left_alignment()))))
211 			return false;
212 	}
213 	if (lhs.right_alignment())
214 	{
215 		if (!(hits_eq_mod_id(*lhs.right_alignment(),*(rhs.right_alignment()))))
216 			return false;
217 	}
218 	return true;
219 }
220 
has_no_collapse_mass(const MateHit & hit)221 bool has_no_collapse_mass(const MateHit& hit)
222 {
223 	return hit.collapse_mass() == 0;
224 }
225 
226 // Assumes hits are sorted by mate_hit_lt
227 // Does not collapse hits that are multi-reads
collapse_hits(const vector<MateHit> & hits,vector<MateHit> & non_redundant)228 void collapse_hits(const vector<MateHit>& hits,
229 				   vector<MateHit>& non_redundant)
230 {
231 	copy(hits.begin(), hits.end(), back_inserter(non_redundant));
232 	vector<MateHit>::iterator new_end = unique(non_redundant.begin(),
233 											   non_redundant.end(),
234 											   hits_eq_non_multi_non_replicate);
235 	non_redundant.erase(new_end, non_redundant.end());
236     non_redundant.resize(non_redundant.size());
237 
238 	BOOST_FOREACH(MateHit& hit, non_redundant)
239 		hit.collapse_mass(0);
240 
241 	size_t curr_aln = 0;
242 	size_t curr_unique_aln = 0;
243 	while (curr_aln < hits.size())
244 	{
245 		if (hits_eq_non_multi_non_replicate(non_redundant[curr_unique_aln], hits[curr_aln]) || hits_eq_non_multi_non_replicate(non_redundant[++curr_unique_aln], hits[curr_aln]))
246 		{
247             double more_mass = hits[curr_aln].internal_scale_mass();
248 			//assert(non_redundant[curr_unique_aln].collapse_mass() == 0 || !non_redundant[curr_unique_aln].is_multi());
249 			non_redundant[curr_unique_aln].incr_collapse_mass(more_mass);
250 		}
251 		else
252 			assert(false);
253 
254 		++curr_aln;
255 	}
256 
257 	//BOOST_FOREACH(MateHit& hit, non_redundant)
258 		//assert(hit.collapse_mass() <= 1 || !hit.is_multi());
259 
260 	//non_redundant.erase(remove_if(non_redundant.begin(),non_redundant.end(),has_no_collapse_mass), non_redundant.end());
261 
262 }
263 
264 // Places multi-reads to the right of reads they match
mate_hit_lt(const MateHit & lhs,const MateHit & rhs)265 bool mate_hit_lt(const MateHit& lhs, const MateHit& rhs)
266 {
267 	if (lhs.left() != rhs.left())
268 		return lhs.left() < rhs.left();
269 	if (lhs.right() != rhs.right())
270 		return lhs.right() > rhs.right();
271 
272 	if ((lhs.left_alignment() == NULL) != (rhs.left_alignment() == NULL))
273 		return (lhs.left_alignment() == NULL) < (rhs.left_alignment() == NULL);
274 
275 	if ((lhs.right_alignment() == NULL) != (rhs.right_alignment() == NULL))
276 		return (lhs.right_alignment() == NULL) < (rhs.right_alignment() == NULL);
277 
278 	assert ((lhs.right_alignment() == NULL) == (rhs.right_alignment() == NULL));
279 	assert ((lhs.left_alignment() == NULL) == (rhs.left_alignment() == NULL));
280 
281 	const ReadHit* lhs_l = lhs.left_alignment();
282 	const ReadHit* lhs_r = lhs.right_alignment();
283 
284 	const ReadHit* rhs_l = rhs.left_alignment();
285 	const ReadHit* rhs_r = rhs.right_alignment();
286 
287 	if (lhs_l && rhs_l)
288 	{
289 		if (lhs_l->cigar().size() != rhs_l->cigar().size())
290 			return lhs_l->cigar().size() < rhs_l->cigar().size();
291 		for (size_t i = 0; i < lhs_l->cigar().size(); ++i)
292 		{
293 			if (lhs_l->cigar()[i].opcode != rhs_l->cigar()[i].opcode)
294 				return lhs_l->cigar()[i].opcode < rhs_l->cigar()[i].opcode;
295 			if (lhs_l->cigar()[i].length != rhs_l->cigar()[i].length)
296 				return lhs_l->cigar()[i].length < rhs_l->cigar()[i].length;
297 		}
298 	}
299 
300 	if (lhs_r && rhs_r)
301 	{
302 		if (lhs_r->cigar().size() != rhs_r->cigar().size())
303 			return lhs_r->cigar().size() < rhs_r->cigar().size();
304 		for (size_t i = 0; i < lhs_r->cigar().size(); ++i)
305 		{
306 			if (lhs_r->cigar()[i].opcode != rhs_r->cigar()[i].opcode)
307 				return lhs_r->cigar()[i].opcode < rhs_r->cigar()[i].opcode;
308 			if (lhs_r->cigar()[i].length != rhs_r->cigar()[i].length)
309 				return lhs_r->cigar()[i].length < rhs_r->cigar()[i].length;
310 		}
311 	}
312 
313 	if (lhs.is_multi() != rhs.is_multi())
314 	{
315 		return rhs.is_multi();
316 	}
317 
318     if (lhs.read_group_props() != rhs.read_group_props())
319 	{
320 		return lhs.read_group_props() < rhs.read_group_props();
321 	}
322 
323 	return false;
324 }
325 
326 
create_hit(const string & insert_name,const string & ref_name,int left,const vector<CigarOp> & cigar,CuffStrand source_strand,const string & partner_ref,int partner_pos,unsigned int edit_dist,int num_hits,float base_mass,uint32_t sam_flag)327 ReadHit HitFactory::create_hit(const string& insert_name,
328 							   const string& ref_name,
329 							   int left,
330 							   const vector<CigarOp>& cigar,
331 							   CuffStrand source_strand,
332 							   const string& partner_ref,
333 							   int partner_pos,
334 							   unsigned int edit_dist,
335 							   int num_hits,
336                                float base_mass,
337                                uint32_t sam_flag)
338 {
339 	InsertID insert_id = _insert_table.get_id(insert_name);
340 	RefID reference_id = _ref_table.get_id(ref_name, NULL);
341 	RefID partner_ref_id = _ref_table.get_id(partner_ref, NULL);
342 
343 	return ReadHit(reference_id,
344 				   insert_id,
345 				   left,
346 				   cigar,
347 				   source_strand,
348 				   partner_ref_id,
349 				   partner_pos,
350 				   edit_dist,
351 				   num_hits,
352                    base_mass,
353                    sam_flag);
354 }
355 
create_hit(const string & insert_name,const string & ref_name,uint32_t left,uint32_t read_len,CuffStrand source_strand,const string & partner_ref,int partner_pos,unsigned int edit_dist,int num_hits,float base_mass,uint32_t sam_flag)356 ReadHit HitFactory::create_hit(const string& insert_name,
357 							   const string& ref_name,
358 							   uint32_t left,
359 							   uint32_t read_len,
360 							   CuffStrand source_strand,
361 							   const string& partner_ref,
362 							   int partner_pos,
363 							   unsigned int edit_dist,
364 							   int num_hits,
365                                float base_mass,
366                                uint32_t sam_flag)
367 {
368 	InsertID insert_id = _insert_table.get_id(insert_name);
369 	RefID reference_id = _ref_table.get_id(ref_name, NULL);
370 	RefID partner_ref_id = _ref_table.get_id(partner_ref, NULL);
371 
372 	return ReadHit(reference_id,
373 				   insert_id,
374 				   left,
375 				   read_len,
376 				   source_strand,
377 				   partner_ref_id,
378 				   partner_pos,
379 				   edit_dist,
380 				   num_hits,
381                    base_mass,
382                    sam_flag);
383 }
384 
use_stranded_protocol(uint32_t sam_flag,MateStrandMapping msm)385 CuffStrand use_stranded_protocol(uint32_t sam_flag,  MateStrandMapping msm)
386 {
387     bool antisense_aln = sam_flag & 0x10;
388 	if (((sam_flag & BAM_FPAIRED) && (sam_flag & BAM_FREAD1)) || !(sam_flag & BAM_FPAIRED)) // first-in-pair or single-end
389 	{
390 		switch(msm)
391 		{
392 			case FF:
393 			case FR:
394 				return (antisense_aln) ? CUFF_REV : CUFF_FWD;
395 				break;
396 			case RF:
397 			case RR:
398 				return (antisense_aln) ? CUFF_FWD : CUFF_REV;
399 				break;
400 		}
401 	}
402 	else // second-in-pair read
403 	{
404 		switch (msm)
405 		{
406 			case FF:
407 			case RF:
408 				return (antisense_aln) ? CUFF_REV : CUFF_FWD;
409 				break;
410 			case FR:
411 			case RR:
412 				return (antisense_aln) ? CUFF_FWD : CUFF_REV;
413 				break;
414 		}
415 	}
416 	assert(false);
417     return CUFF_STRAND_UNKNOWN;
418 }
419 
get_hit_from_bam1t(const bam1_t * hit_buf,const bam_hdr_t * header,ReadHit & bh)420 bool BAMHitFactory::get_hit_from_bam1t(const bam1_t* hit_buf,
421 									   const bam_hdr_t* header,
422 									   ReadHit& bh)
423 {
424 	uint32_t sam_flag = hit_buf->core.flag;
425 
426 	int text_offset = hit_buf->core.pos;
427 	int text_mate_pos = hit_buf->core.mpos;
428 	int target_id = hit_buf->core.tid;
429 	int mate_target_id = hit_buf->core.mtid;
430 
431 	vector<CigarOp> cigar;
432 	bool spliced_alignment = false;
433 	int num_hits = 1;
434 
435 	//header->target_name[c->tid]
436 
437 	if (sam_flag & 0x4 || target_id < 0)
438 	{
439 		//assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
440 		bh = create_hit(bam_get_qname(hit_buf),
441 						"*",
442 						0, // SAM files are 1-indexed
443 						0,
444 						CUFF_STRAND_UNKNOWN,
445 						"*",
446 						0,
447 						0,
448 						1,
449                         1.0,
450                         sam_flag);
451 		return true;
452 	}
453 	if (target_id >= header->n_targets)
454     {
455         fprintf (stderr, "BAM error: file contains hits to sequences not in header SQ records (%s)\n", bam_get_qname(hit_buf));
456         return false;
457     }
458 
459 	string text_name = header->target_name[target_id];
460 
461 	for (int i = 0; i < hit_buf->core.n_cigar; ++i)
462 	{
463 		//char* t;
464 
465 		int length = bam_get_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
466 		if (length <= 0)
467 		{
468 		  fprintf (stderr, "BAM error: CIGAR op has zero length (%s)\n", bam_get_qname(hit_buf));
469 			return false;
470 		}
471 
472 		CigarOpCode opcode;
473 		switch(bam_get_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
474 		{
475 			case BAM_CMATCH: opcode  = MATCH; break;
476 			case BAM_CINS: opcode  = INS; break;
477 			case BAM_CDEL: opcode  = DEL; break;
478 			case BAM_CSOFT_CLIP: opcode  = SOFT_CLIP; break;
479 			case BAM_CHARD_CLIP: opcode  = HARD_CLIP; break;
480 			case BAM_CPAD: opcode  = PAD; break;
481 			case BAM_CREF_SKIP:
482                 opcode = REF_SKIP;
483 				spliced_alignment = true;
484 				if (length > (int)max_intron_length)
485 				{
486 					//fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
487 					return false;
488 				}
489 				break;
490 			default:
491 				//fprintf (stderr, "SAM error on line %d: invalid CIGAR operation\n", _line_num);
492 				return false;
493 		}
494 		if (opcode != HARD_CLIP)
495 			cigar.push_back(CigarOp(opcode, length));
496 	}
497 
498 	string mrnm;
499 	if (mate_target_id >= 0)
500 	{
501 		if (mate_target_id == target_id)
502 		{
503 			mrnm = header->target_name[mate_target_id];
504 //			if (abs((int)text_mate_pos - (int)text_offset) > (int)max_intron_length)
505 //			{
506 //				//fprintf (stderr, "Mates are too distant, skipping\n");
507 //				return false;
508 //			}
509 		}
510 		else
511 		{
512 			//fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
513 			return false;
514 		}
515 	}
516 	else
517 	{
518 		text_mate_pos = 0;
519 	}
520 
521 	CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
522 	unsigned char num_mismatches = 0;
523 
524 	uint8_t* ptr = bam_aux_get(hit_buf, "XS");
525 	if (ptr)
526 	{
527 		char src_strand_char = bam_aux2A(ptr);
528 		if (src_strand_char == '-')
529 			source_strand = CUFF_REV;
530 		else if (src_strand_char == '+')
531 			source_strand = CUFF_FWD;
532 	}
533 
534 	ptr = bam_aux_get(hit_buf, "NM");
535 	if (ptr)
536 	{
537 		num_mismatches = bam_aux2i(ptr);
538 	}
539 
540 	ptr = bam_aux_get(hit_buf, "NH");
541 	if (ptr)
542 	{
543 		num_hits = bam_aux2i(ptr);
544 	}
545 
546     double mass = 1.0;
547     ptr = bam_aux_get(hit_buf, "ZF");
548 	if (ptr)
549 	{
550 		mass = bam_aux2i(ptr);
551         if (mass <= 0.0)
552             mass = 1.0;
553 	}
554 
555     if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
556 		source_strand = use_stranded_protocol(sam_flag, _rg_props.mate_strand_mapping());
557 
558 	if (!spliced_alignment)
559 	{
560 		//assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN);
561 
562 		//assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
563 		bh = create_hit(bam_get_qname(hit_buf),
564 						text_name,
565 						text_offset,  // BAM files are 0-indexed
566 						cigar,
567 						source_strand,
568 						mrnm,
569 						text_mate_pos,
570 						num_mismatches,
571 						num_hits,
572                         mass,
573                         sam_flag);
574 		return true;
575 
576 	}
577 	else
578 	{
579 		if (source_strand == CUFF_STRAND_UNKNOWN)
580 		{
581 			fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n");
582 		}
583 
584 		bh = create_hit(bam_get_qname(hit_buf),
585 						text_name,
586 						text_offset,  // BAM files are 0-indexed
587 						cigar,
588 						source_strand,
589 						mrnm,
590 						text_mate_pos,
591 						num_mismatches,
592 						num_hits,
593                         mass,
594                         sam_flag);
595 		return true;
596 	}
597 
598 
599 	return true;
600 }
601 
602 
603 
str_to_platform(const string pl_str)604 Platform str_to_platform(const string pl_str)
605 {
606     if (pl_str == "SOLiD")
607     {
608         return SOLID;
609     }
610     else if (pl_str == "Illumina")
611     {
612         return ILLUMINA;
613     }
614     else
615     {
616         return UNKNOWN_PLATFORM;
617     }
618 }
619 
620 // Parses the header to determine platform and other properties
parse_header_string(const string & header_rec,ReadGroupProperties & rg_props)621 bool HitFactory::parse_header_string(const string& header_rec,
622                                      ReadGroupProperties& rg_props)
623 {
624     vector<string> columns;
625     tokenize(header_rec, "\t", columns);
626 
627     if (columns[0] == "@RG")
628     {
629         for (size_t i = 1; i < columns.size(); ++i)
630         {
631             vector<string> fields;
632             tokenize(columns[i], ":", fields);
633             if (fields[0] == "PL")
634             {
635                 if (rg_props.platform() == UNKNOWN_PLATFORM)
636                 {
637                     Platform p = str_to_platform(fields[1]);
638                     rg_props.platform(p);
639                 }
640                 else
641                 {
642                     Platform p = str_to_platform(fields[1]);
643                     if (p != rg_props.platform())
644                     {
645                         fprintf(stderr, "Error: Processing reads from different platforms is not currently supported\n");
646                         return false;
647                     }
648                 }
649 
650             }
651         }
652     }
653     else if (columns[0] == "@SQ")
654     {
655         _num_seq_header_recs++;
656         for (size_t i = 1; i < columns.size(); ++i)
657         {
658             vector<string> fields;
659             tokenize(columns[i], ":", fields);
660             if (fields[0] == "SN")
661             {
662                 // Populate the RefSequenceTable with the sequence dictionary,
663                 // to ensure that (for example) downstream GTF files are sorted
664                 // in an order consistent with the header, and to enforce that
665                 // BAM records appear in the order implied by the header
666                 RefID _id = _ref_table.get_id(fields[1], NULL);
667 
668                 const RefSequenceTable::SequenceInfo* info = _ref_table.get_info(_id);
669 
670                 if (info->observation_order != _num_seq_header_recs)
671                 {
672                     if (info->name != fields[1])
673                     {
674                         fprintf(stderr, "Error: Hash collision between references '%s' and '%s'.\n", info->name, fields[1].c_str());
675                     }
676                     else
677                     {
678                         fprintf(stderr, "Error: sort order of reads in BAMs must be the same\n");
679 					}
680                     exit(1);
681                 }
682             }
683         }
684     }
685 
686     return true;
687 }
688 
689 // Parses multiple header lines to determine platform and other properties
parse_header_lines(const string & h_text,ReadGroupProperties & _rg_props)690 void HitFactory::parse_header_lines(const string& h_text,
691 									ReadGroupProperties& _rg_props)
692 {
693 	size_t offset = 0;
694 	while(offset < h_text.size())
695 	{
696 		size_t next_newline = h_text.find('\n', offset);
697 		size_t line_length = next_newline == string::npos ? string::npos : next_newline - offset;
698 		std::string this_line = h_text.substr(offset, line_length);
699 		parse_header_string(this_line, _rg_props);
700 
701 		if(next_newline == string::npos)
702 			break;
703 		offset = next_newline + 1;
704 	}
705 }
706 
finalize_rg_props()707 void HitFactory::finalize_rg_props()
708 {
709     if (_rg_props.platform() == SOLID)
710     {
711         _rg_props.strandedness(STRANDED_PROTOCOL);
712         _rg_props.std_mate_orientation(MATES_POINT_TOWARD);
713     }
714     else
715     {
716         // Default to Illumina's unstranded protocol params for strandedness and
717         // mate orientation
718         _rg_props.strandedness(UNSTRANDED_PROTOCOL);
719         _rg_props.std_mate_orientation(MATES_POINT_TOWARD);
720     }
721 }
722 
723 static const unsigned MAX_HEADER_LEN = 64 * 1024 * 1024; // 4 MB
724 
read_next_hit(ReadHit & bh)725 bool SAMHitFactory::read_next_hit(ReadHit& bh)
726 {
727 	bool new_rec = fgets(_hit_buf,  _hit_buf_max_sz - 1, _hit_file);
728 	if (!new_rec)
729 		return false;
730 	++_line_num;
731 	char* nl = strrchr(_hit_buf, '\n');
732 	if (nl) *nl = 0;
733 
734 	const char* buf = _hit_buf;
735 
736 	// Are we still in the header region?
737 	if (buf[0] == '@')
738 		return false;
739 
740 	const char* _name = strsep((char**)&buf,"\t");
741 	if (!_name)
742 		return false;
743 	char name[2048];
744 	strncpy(name, _name, 2047);
745 
746 	const char* sam_flag_str = strsep((char**)&buf,"\t");
747 	if (!sam_flag_str)
748 		return false;
749 
750 	const char* text_name = strsep((char**)&buf,"\t");
751 	if (!text_name)
752 		return false;
753 
754 	const char* text_offset_str = strsep((char**)&buf,"\t");
755 	if (!text_offset_str)
756 		return false;
757 
758 	const char* map_qual_str =  strsep((char**)&buf,"\t");
759 	if (!map_qual_str)
760 		return false;
761 
762 	const char* cigar_str = strsep((char**)&buf,"\t");
763 	if (!cigar_str)
764 		return false;
765 
766 	const char* mate_ref_name =  strsep((char**)&buf,"\t");
767 	if (!mate_ref_name)
768 		return false;
769 
770 	const char* mate_pos_str =  strsep((char**)&buf,"\t");
771 	if (!mate_pos_str)
772 		return false;
773 
774 	const char* inferred_insert_sz_str =  strsep((char**)&buf,"\t");
775 	if (!inferred_insert_sz_str)
776 		return false;
777 
778 	const char* seq_str =  strsep((char**)&buf,"\t");
779 	if (!seq_str)
780 		return false;
781 
782 	const char* qual_str =  strsep((char**)&buf,"\t");
783 	if (!qual_str)
784 		return false;
785 
786 
787 	int sam_flag = atoi(sam_flag_str);
788 	int text_offset = atoi(text_offset_str);
789 	int text_mate_pos = atoi(mate_pos_str);
790 
791 	const char* p_cig = cigar_str;
792 	//int len = strlen(sequence);
793 	vector<CigarOp> cigar;
794 	bool spliced_alignment = false;
795 	int num_hits = 1;
796 
797     if ((sam_flag & 0x4) ||!strcmp(text_name, "*"))
798 	{
799 		//assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
800 		bh = create_hit(name,
801 						"*",
802 						0, // SAM files are 1-indexed
803 						0,
804 						CUFF_STRAND_UNKNOWN,
805 						"*",
806 						0,
807 						0,
808 						1,
809                         1.0,
810                         sam_flag);
811 		return true;
812 	}
813 	// Mostly pilfered direct from the SAM tools:
814 	while (*p_cig)
815 	{
816 		char* t;
817 		int length = (int)strtol(p_cig, &t, 10);
818 		if (length <= 0)
819 		{
820 			fprintf (stderr, "SAM error on line %d: CIGAR op has zero length\n", _line_num);
821             fprintf (stderr,"%s\n", _hit_buf);
822 			return false;
823 		}
824 		char op_char = toupper(*t);
825 		CigarOpCode opcode;
826 		if (op_char == 'M')
827 		{
828 			/*if (length > max_read_length)
829 			 {
830 			 fprintf(stderr, "SAM error on line %d:  %s: MATCH op has length > %d\n", line_num, name, max_read_length);
831 			 return false;
832 			 }*/
833 			opcode = MATCH;
834 		}
835 		else if (op_char == 'I') opcode = INS;
836 		else if (op_char == 'D')
837         {
838             opcode = DEL;
839         }
840 		else if (op_char == 'N')
841 		{
842 			opcode = REF_SKIP;
843 			spliced_alignment = true;
844 			if (length > (int)max_intron_length)
845 			{
846 				//fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
847 				return false;
848 			}
849 		}
850 		else if (op_char == 'S') opcode = SOFT_CLIP;
851 		else if (op_char == 'H') opcode = HARD_CLIP;
852 		else if (op_char == 'P') opcode = PAD;
853 		else
854 		{
855 			fprintf (stderr, "SAM error on line %d: invalid CIGAR operation\n", _line_num);
856 			return false;
857 		}
858 		p_cig = t + 1;
859 		//i += length;
860 		if (opcode != HARD_CLIP)
861 			cigar.push_back(CigarOp(opcode, length));
862 	}
863 	if (*p_cig)
864 	{
865 		fprintf (stderr, "SAM error on line %d: unmatched CIGAR operation\n", _line_num);
866 		return false;
867 	}
868 
869 	string mrnm;
870 	if (strcmp(mate_ref_name, "*"))
871 	{
872 		if (!strcmp(mate_ref_name, "=") || !strcmp(mate_ref_name, text_name))
873 		{
874 			mrnm = text_name;
875 //			if (abs((int)text_mate_pos - (int)text_offset) > (int)max_intron_length)
876 //			{
877 //				//fprintf (stderr, "Mates are too distant, skipping\n");
878 //				return false;
879 //			}
880 		}
881 		else
882 		{
883 			//fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
884 			return false;
885 		}
886 	}
887 	else
888 	{
889 		text_mate_pos = 0;
890 	}
891 
892 	CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
893 	unsigned char num_mismatches = 0;
894 
895 	const char* tag_buf = buf;
896 
897     double mass = 1.0;
898 
899 	while((tag_buf = strsep((char**)&buf,"\t")))
900 	{
901 
902 		char* first_colon = (char*)strchr(tag_buf, ':');
903 		if (first_colon)
904 		{
905 			*first_colon = 0;
906 			++first_colon;
907 			char* second_colon = strchr(first_colon, ':');
908 			if (second_colon)
909 			{
910 				*second_colon = 0;
911 				++second_colon;
912 				const char* first_token = tag_buf;
913 				//const char* second_token = first_colon;
914 				const char* third_token = second_colon;
915 				if (!strcmp(first_token, "XS"))
916 				{
917 					if (*third_token == '-')
918 						source_strand = CUFF_REV;
919 					else if (*third_token == '+')
920 						source_strand = CUFF_FWD;
921 				}
922 				else if (!strcmp(first_token, "NM"))
923 				{
924 					num_mismatches = atoi(third_token);
925 				}
926 				else if (!strcmp(first_token, "NH"))
927 				{
928                     num_hits = atoi(third_token);
929 				}
930                 else if (!strcmp(first_token, "ZF"))
931 				{
932 					mass = atof(third_token);
933                     if (mass <= 0.0)
934                         mass = 1.0;
935 				}
936 				else
937 				{
938 
939 				}
940 			}
941 		}
942 	}
943 
944     // Don't let the protocol setting override explicit XS tags
945 	if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
946 		source_strand = use_stranded_protocol(sam_flag, _rg_props.mate_strand_mapping());
947 
948 	if (!spliced_alignment)
949 	{
950 		//assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
951 		bh = create_hit(name,
952 						text_name,
953 						text_offset - 1,
954 						cigar,
955 						source_strand,
956 						mrnm,
957 						text_mate_pos - 1,
958 						num_mismatches,
959 						num_hits,
960                         mass,
961                         sam_flag);
962 		return true;
963 
964 	}
965 	else
966 	{
967 		if (source_strand == CUFF_STRAND_UNKNOWN)
968 		{
969 			fprintf(stderr, "SAM error on line %d: found spliced alignment without XS attribute\n", _line_num);
970 		}
971 
972 		bh = create_hit(name,
973 						text_name,
974 						text_offset - 1,
975 						cigar,
976 						source_strand,
977 						mrnm,
978 						text_mate_pos - 1,
979 						num_mismatches,
980 						num_hits,
981                         mass,
982                         sam_flag);
983 		return true;
984 	}
985 	return false;
986 }
987 
inspect_header()988 bool SAMHitFactory::inspect_header()
989 {
990     char pBuf[10 * 1024];
991 
992     off_t curr_pos = ftello(_hit_file);
993     rewind(_hit_file);
994 
995     while (fgets(pBuf, 10*1024, _hit_file))
996     {
997         if (pBuf[0] != '@')
998         {
999             break; // done with the header.
1000         }
1001         char* nl = strchr(pBuf, '\n');
1002         if (nl)
1003         {
1004             *nl = 0;
1005             parse_header_string(pBuf, _rg_props);
1006         }
1007     }
1008 
1009     fseek(_hit_file, curr_pos, SEEK_SET);
1010 
1011     finalize_rg_props();
1012     return true;
1013 }
1014 
1015 //////////////////////////////////////////
1016 
load_count_tables(const string & expression_file_name)1017 void PrecomputedExpressionHitFactory::load_count_tables(const string& expression_file_name)
1018 {
1019     //map<int, AbundanceGroup > ab_groups;
1020 
1021 
1022     std::ifstream ifs(expression_file_name.c_str());
1023     boost::archive::binary_iarchive ia(ifs);
1024 
1025     //map<string, AbundanceGroup> single_sample_tracking;
1026 
1027     size_t num_loci = 0;
1028     ia >> num_loci;
1029 
1030     if (num_loci > 0)
1031     {
1032         pair<int, AbundanceGroup> first_locus;
1033         ia >> first_locus;
1034         boost::shared_ptr<AbundanceGroup> ab = boost::shared_ptr<AbundanceGroup>(new AbundanceGroup(first_locus.second));
1035 
1036         // populate the cached count tables so we can make convincing fake bundles later on.
1037         ReadGroupProperties rg_props = **(ab->rg_props().begin());
1038 
1039         int i = 0;
1040         BOOST_FOREACH(const LocusCount& c, rg_props.raw_compatible_counts())
1041         {
1042             compat_mass[i++] = c.count;
1043             //compat_mass[c.locus_desc] = c.count;
1044         }
1045 
1046         i = 0;
1047         BOOST_FOREACH(const LocusCount& c, rg_props.raw_total_counts())
1048         {
1049             total_mass[i++] = c.count;
1050             //total_mass[c.locus_desc] = c.count;
1051         }
1052     }
1053 }
1054 
load_checked_parameters(const string & expression_file_name)1055 void PrecomputedExpressionHitFactory::load_checked_parameters(const string& expression_file_name)
1056 {
1057     std::ifstream ifs(expression_file_name.c_str());
1058     boost::archive::binary_iarchive ia(ifs);
1059 
1060     //map<string, AbundanceGroup> single_sample_tracking;
1061 
1062     size_t num_loci = 0;
1063     ia >> num_loci;
1064 
1065     if (num_loci > 0)
1066     {
1067         pair<int, AbundanceGroup> first_locus;
1068         ia >> first_locus;
1069         boost::shared_ptr<AbundanceGroup> ab = boost::shared_ptr<AbundanceGroup>(new AbundanceGroup(first_locus.second));
1070 
1071         // populate the cached count tables so we can make convincing fake bundles later on.
1072         ReadGroupProperties rg_props = **(ab->rg_props().begin());
1073         _rg_props.checked_parameters(rg_props.checked_parameters());
1074     }
1075 }
1076 
read_next_hit(ReadHit & buf)1077 bool PrecomputedExpressionHitFactory::read_next_hit(ReadHit& buf)
1078 {
1079 	return false;
1080 }
1081 
inspect_header()1082 bool PrecomputedExpressionHitFactory::inspect_header()
1083 {
1084 
1085     std::ifstream ifs(_expression_file_name.c_str());
1086     boost::archive::binary_iarchive ia(ifs);
1087 
1088     RefSequenceTable& rt = ref_table();
1089 
1090     size_t num_loci = 0;
1091     ia >> num_loci;
1092 
1093     for (size_t i = 0; i < num_loci; ++i)
1094     {
1095         pair<int, AbundanceGroup> locus;
1096 
1097         ia >> locus;
1098         boost::shared_ptr<AbundanceGroup> ab = boost::shared_ptr<AbundanceGroup>(new AbundanceGroup(locus.second));
1099 
1100         const string locus_tag = ab->locus_tag();
1101 
1102         string::size_type idx = locus_tag.find(':');
1103         if (idx != string::npos)
1104         {
1105             string chrom_name = locus_tag.substr(0, idx);
1106             rt.get_id(chrom_name.c_str(), NULL); // make sure the chromosome names are added to the RefSequenceTable in the order that they occur in the expression files.
1107         }
1108     }
1109 
1110     return true;
1111 }
1112 
get_abundance_for_locus(int locus_id)1113 boost::shared_ptr<const AbundanceGroup> PrecomputedExpressionHitFactory::get_abundance_for_locus(int locus_id)
1114 {
1115 #if ENABLE_THREADS
1116     boost::mutex::scoped_lock lock(_factory_lock);
1117 #endif
1118     map<int, boost::shared_ptr<const AbundanceGroup> >::const_iterator itr = _curr_ab_groups.find(locus_id);
1119     if (itr != _curr_ab_groups.end())
1120         return itr->second;
1121     else
1122         return boost::shared_ptr<const AbundanceGroup>();
1123 }
1124 
clear_abundance_for_locus(int locus_id)1125 void PrecomputedExpressionHitFactory::clear_abundance_for_locus(int locus_id)
1126 {
1127 #if ENABLE_THREADS
1128     boost::mutex::scoped_lock lock(_factory_lock);
1129 #endif
1130 
1131     map<int, boost::shared_ptr<const AbundanceGroup> >::iterator itr = _curr_ab_groups.find(locus_id);
1132 
1133     if (itr != _curr_ab_groups.end())
1134         _curr_ab_groups.erase(itr);
1135 }
1136 
next_locus(int locus_id,bool cache_locus)1137 boost::shared_ptr<const AbundanceGroup> PrecomputedExpressionHitFactory::next_locus(int locus_id, bool cache_locus)
1138 {
1139 #if ENABLE_THREADS
1140     boost::mutex::scoped_lock lock(_factory_lock);
1141 #endif
1142 //    if (locus_id == 7130)
1143 //    {
1144 //        fprintf(stderr, "Trying to get a chr13_random\n");
1145 //    }
1146 
1147     if (_last_locus_id >= locus_id)
1148         return boost::shared_ptr<const AbundanceGroup>(); // we already processed this one
1149 
1150     boost::shared_ptr<const AbundanceGroup> sought_group;
1151 
1152     map<int, boost::shared_ptr<const AbundanceGroup> >::iterator itr = _curr_ab_groups.find(locus_id);
1153 
1154     if (itr != _curr_ab_groups.end())
1155         return itr->second;
1156 
1157     for (;_curr_locus_idx < _num_loci; ++_curr_locus_idx)
1158     {
1159         pair<int, AbundanceGroup> p;
1160         *_ia >> p;
1161         _last_locus_id = p.first;
1162         boost::shared_ptr<AbundanceGroup> ab = boost::shared_ptr<AbundanceGroup>(new AbundanceGroup(p.second));
1163         if (_last_locus_id == locus_id)
1164         {
1165             sought_group = ab;
1166             break;
1167         }
1168         else // we don't want to lose this one...
1169         {
1170             if (cache_locus)
1171                 _curr_ab_groups[_last_locus_id] = ab;
1172         }
1173     }
1174     if (cache_locus)
1175         _curr_ab_groups[locus_id] = sought_group;
1176 
1177     return sought_group;
1178 }
1179 
1180 #ifdef HAVE_HTSLIB
1181 
read_next_hit(ReadHit & bh)1182 bool HTSHitFactory::read_next_hit(ReadHit& bh)
1183 {
1184 	if(!records_remain())
1185 		return false;
1186 
1187 	int read_ret = sam_read1(_hit_file, _file_header, _next_hit);
1188 	if(read_ret < 0)
1189 	{
1190 		_eof_encountered = true;
1191 		return false;
1192 	}
1193 
1194 	return get_hit_from_bam1t(_next_hit, _file_header, bh);
1195 }
1196 
inspect_header()1197 bool HTSHitFactory::inspect_header()
1198 {
1199 	if (_file_header->text == NULL || _file_header->l_text == 0)
1200 	{
1201 		throw std::runtime_error("Warning: SAM/BAM/CRAM header has 0 length or is corrupted.  Try using 'samtools reheader'");
1202 	}
1203 
1204 	parse_header_lines(_file_header->text, _rg_props);
1205 
1206     finalize_rg_props();
1207     return true;
1208 }
1209 
createSamHitFactory(const string & hit_file_name,ReadTable & it,RefSequenceTable & rt)1210 HitFactory* createSamHitFactory(const string& hit_file_name, ReadTable& it, RefSequenceTable& rt)
1211 {
1212 	try {
1213 		return new HTSHitFactory(hit_file_name, it, rt);
1214 	}
1215 	catch (std::runtime_error& e)
1216 	{
1217 		fprintf(stderr, "Error: cannot open alignment file %s for reading\n",
1218 				hit_file_name.c_str());
1219 		fprintf(stderr, "Cause: %s\n", e.what());
1220 		exit(1);
1221 	}
1222 }
1223 
1224 #else // ndef HAVE_HTSLIB
1225 
inspect_header()1226 bool SamtoolsHitFactory::inspect_header()
1227 {
1228     bam_header_t* header = _hit_file->header;
1229 
1230     if (header == NULL)
1231     {
1232         fprintf(stderr, "Warning: No BAM header\n");
1233         return false;
1234     }
1235 
1236 //    if (header->l_text >= MAX_HEADER_LEN)
1237 //    {
1238 //        fprintf(stderr, "Warning: BAM header too large\n");
1239 //        return false;
1240 //    }
1241 
1242 	if (header->l_text == 0 || header->text == NULL)
1243 	{
1244 		fprintf(stderr, "Warning: BAM header has 0 length or is corrupted.  Try using 'samtools reheader'.\n");
1245         return false;
1246 	}
1247 
1248 	parse_header_lines(header->text, _rg_props);
1249 
1250     finalize_rg_props();
1251     return true;
1252 }
1253 
1254 // populate a bam_t This will
read_next_hit(ReadHit & bh)1255 bool SamtoolsHitFactory::read_next_hit(ReadHit& bh)
1256 {
1257     if (_next_hit.data)
1258     {
1259         free(_next_hit.data);
1260         _next_hit.data = NULL;
1261     }
1262 
1263     if (records_remain() == false)
1264         return false;
1265 
1266     memset(&_next_hit, 0, sizeof(_next_hit));
1267 
1268 	int bytes_read = samread(_hit_file, &_next_hit);
1269 	if (bytes_read < 0)
1270     {
1271         _eof_encountered = true;
1272 		return false;
1273     }
1274 
1275 	return get_hit_from_bam1t(&_next_hit, _hit_file->header, bh);
1276 }
1277 
createSamHitFactory(const string & hit_file_name,ReadTable & it,RefSequenceTable & rt)1278 HitFactory* createSamHitFactory(const string& hit_file_name, ReadTable& it, RefSequenceTable& rt)
1279 {
1280 	try
1281 	{
1282 		return new SamtoolsHitFactory(hit_file_name, it, rt);
1283 	}
1284 	catch (std::runtime_error& e)
1285 	{
1286 		fprintf(stderr, "File %s doesn't appear to be a valid BAM file, trying SAM...\n",
1287 				hit_file_name.c_str());
1288         try
1289         {
1290             return new SAMHitFactory(hit_file_name, it, rt);
1291         }
1292         catch (std::runtime_error& e)
1293         {
1294             fprintf(stderr, "Error: cannot open alignment file %s for reading\n",
1295                     hit_file_name.c_str());
1296             exit(1);
1297         }
1298 	}
1299 }
1300 
1301 #endif
1302