1 /*
2 * bundles.cpp
3 * cufflinks
4 *
5 * Created by Cole Trapnell on 9/6/09.
6 * Copyright 2009 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #include <list>
11 #include <map>
12 #include <numeric>
13 #include <boost/math/distributions/binomial.hpp>
14 #include <boost/crc.hpp>
15
16 #include "common.h"
17 #include "bundles.h"
18 #include "scaffolds.h"
19
20 #include "abundances.h"
21
22 using namespace std;
23 using boost::math::binomial;
24
25 //struct ScaffoldSorter
26 //{
27 // ScaffoldSorter(RefSequenceTable& _rt) : rt(_rt) {}
28 // bool operator()(boost::shared_ptr<Scaffold const> lhs, boost::shared_ptr<Scaffold const> rhs)
29 // {
30 // assert (lhs);
31 // assert (rhs);
32 // const char* lhs_name = rt.get_name(lhs->ref_id());
33 // const char* rhs_name = rt.get_name(rhs->ref_id());
34 // int c = strcmp(lhs_name, rhs_name);
35 // if (c != 0)
36 // {
37 // return c < 0;
38 // }
39 // if (lhs->left() != rhs->left())
40 // {
41 // return lhs->left() < rhs->left();
42 // }
43 // return false;
44 // }
45 //
46 // RefSequenceTable& rt;
47 //};
48
49 struct ScaffoldSorter
50 {
51 ScaffoldSorter(RefSequenceTable& _rt) : rt(_rt) {}
52 bool operator()(boost::shared_ptr<Scaffold const> lhs, boost::shared_ptr<Scaffold const> rhs)
53 {
54 //assert (lhs);
55 //assert (rhs);
56 if (!lhs || !rhs)
57 return false;
58 int lhs_order = rt.observation_order(lhs->ref_id());
59 assert (lhs_order != -1);
60 int rhs_order = rt.observation_order(rhs->ref_id());
61 assert (rhs_order != -1);
62
63 if (lhs_order != rhs_order)
64 {
65 return lhs_order < rhs_order;
66 }
67 if (lhs->left() != rhs->left())
68 {
69 return lhs->left() < rhs->left();
70 }
71 return false;
72 }
73
74 RefSequenceTable& rt;
75 };
76
77 //FIXME: needs refactoring
78 void load_ref_rnas(FILE* ref_mRNA_file,
79 RefSequenceTable& rt,
80 vector<boost::shared_ptr<Scaffold> >& ref_mRNAs,
81 boost::crc_32_type& gtf_crc_result,
82 bool loadSeqs,
83 bool loadFPKM)
84 {
85 if (loadSeqs)
86 ProgressBar p_bar("Loading reference annotation and sequence.",0);
87 else
88 ProgressBar p_bar("Loading reference annotation.",0);
89
90 GList<GSeqData> ref_rnas;
91
92 // If the RefSequenceTable already has entries, we will sort the GTF records
93 // according to their observation order. Otherwise, we will sort the
94 // RefSequenceTable's records lexicographically.
95 bool reorder_GTF_recs_lexicographically = false;
96 if (rt.size() == 0)
97 {
98 reorder_GTF_recs_lexicographically = true;
99 }
100
101 if (ref_mRNA_file)
102 {
103 gtf_tracking_verbose=cuff_verbose;
104 read_transcripts(ref_mRNA_file, ref_rnas, gtf_crc_result, true);
105 }
106
107 int last_gseq_id = -1;
108 GFaSeqGet* faseq = NULL;
109 GFastaHandler gfasta(fasta_dir.c_str());
110 // Geo groups them by chr.
111 if (ref_rnas.Count()>0) //if any ref data was loaded
112 {
113 for (int j = 0; j < ref_rnas.Count(); ++j)
114 { //ref data is grouped by genomic sequence
115 //const char* name = ref_rnas[j]->gseq_name;
116
117 int f = 0;
118 int r = 0;
119 int u = 0;
120 GffObj* rna_p;
121 RefID ref_id = rt.get_id(ref_rnas[j]->gseq_name, NULL);
122 int f_count = ref_rnas[j]->mrnas_f.Count();
123 int r_count = ref_rnas[j]->mrnas_r.Count();
124 int u_count = ref_rnas[j]->umrnas.Count();
125
126 while(!(f==f_count && r==r_count && u==u_count))
127 {
128 CuffStrand strand;
129
130 if (f < f_count)
131 {
132 rna_p = ref_rnas[j]->mrnas_f[f++];
133 strand = CUFF_FWD;
134 }
135 else if (r < r_count)
136 {
137 rna_p = ref_rnas[j]->mrnas_r[r++];
138 strand = CUFF_REV;
139 }
140 else
141 {
142 rna_p = ref_rnas[j]->umrnas[u++];
143 strand = CUFF_STRAND_UNKNOWN;
144 }
145
146 GffObj& rna = *rna_p;
147
148 if (loadSeqs && rna.gseq_id != last_gseq_id) //next chromosome
149 {
150 delete faseq;
151 faseq = NULL;
152 last_gseq_id = rna.gseq_id;
153 faseq = gfasta.fetch(last_gseq_id);
154 if (faseq==NULL)
155 {
156 fprintf(stderr,"This contig will not be bias corrected.\n");
157 }
158 }
159
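// Convert the GTF exon list to Cufflinks' internal representation.
// GffExon coordinates are 1-based and inclusive, while AugmentedCuffOp
// takes a 0-based genomic offset plus a length: each exon becomes
// CUFF_MATCH(ex.start - 1, ex.end - ex.start + 1), and the gap to the
// next exon becomes CUFF_INTRON(ex.end, next_ex.start - ex.end - 1).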
160 vector<AugmentedCuffOp> ops;
161 for (int e = 0; e < rna.exons.Count(); ++e)
162 {
163 GffExon& ex = *(rna.exons[e]);
164 ops.push_back(AugmentedCuffOp(CUFF_MATCH, ex.start - 1, ex.end - ex.start + 1));
165
166 if (e + 1 < rna.exons.Count())
167 {
168 GffExon& next_ex = *(rna.exons[e+1]);
169 ops.push_back(AugmentedCuffOp(CUFF_INTRON, ex.end, next_ex.start - ex.end - 1));
170 }
171 }
172
173 Scaffold ref_scaff(ref_id, strand, ops, true);
174
175 char* rna_seq = 0;
176 int seqlen=0;
177 if (loadSeqs && faseq){
178 rna_seq = rna.getSpliced(faseq, false, &seqlen);
179 }
180
181 if (rna.getID())
182 ref_scaff.annotated_trans_id(rna.getID());
183
184
185 if (rna.getGeneID())
186 ref_scaff.annotated_gene_id(rna.getGeneID());
187
188 if (rna.getGeneName())
189 ref_scaff.annotated_gene_name(rna.getGeneName());
190
191
192 char* nearest_ref_match = rna.getAttr("nearest_ref");
193 char* class_code = rna.getAttr("class_code");
194
195 if (nearest_ref_match && class_code)
196 {
197 ref_scaff.nearest_ref_id(nearest_ref_match);
198 ref_scaff.nearest_ref_classcode(*class_code);
199 }
200
201 char* protein_id = rna.getAttr("p_id");
202 if (protein_id)
203 ref_scaff.annotated_protein_id(protein_id);
204
205
206 char* tss_id = rna.getAttr("tss_id");
207 if (tss_id)
208 ref_scaff.annotated_tss_id(tss_id);
209
210
211 if (loadFPKM)
212 {
213 const char* expr = rna.getAttr("FPKM");
214 if (expr!=NULL) {
215 if (expr[0]=='"') expr++;
216 ref_scaff.fpkm(strtod(expr, NULL));
217 }
218 }
219
220 if (loadSeqs)
221 {
222 string rs = (rna_seq) ? rna_seq:"";
223 std::transform(rs.begin(), rs.end(), rs.begin(), (int (*)(int))std::toupper);
224 ref_scaff.seq(rs);
225 GFREE(rna_seq);
226 }
227
228 boost::shared_ptr<Scaffold> scaff(new Scaffold());
229 *scaff = ref_scaff;
230 assert (scaff);
231 ref_mRNAs.push_back(scaff);
232 }
233 }
234
235 BOOST_FOREACH (boost::shared_ptr<Scaffold> s, ref_mRNAs)
236 {
237 assert (s);
238 }
239
240 if (reorder_GTF_recs_lexicographically)
241 {
242 rt.order_recs_lexicographically();
243 }
244
245 ScaffoldSorter sorter(rt);
246 sort(ref_mRNAs.begin(), ref_mRNAs.end(), sorter);
247
248 }
249 delete faseq;
250 }
251
252
253 int HitBundle::_next_id = 0;
254
255 bool HitBundle::add_hit(const MateHit& hit)
256 {
257 if (_final)
258 {
259 return false;
260 }
261
262 // Update the bounds on the span
263 if (hit.left() < _leftmost)
264 _leftmost = hit.left();
265 if (hit.right() > _rightmost)
266 _rightmost = hit.right();
267
268
269 _hits.push_back(hit);
270 return true;
271 }
272
273 struct HitlessScaffold
274 {
275 bool operator()(boost::shared_ptr<Scaffold> x)
276 {
277 return x->mate_hits().empty();
278 }
279 };
280
281 bool unmapped_hit(const MateHit& x)
282 {
283 return !(x.is_mapped());
284 }
285
286
287 bool HitBundle::add_open_hit(boost::shared_ptr<ReadGroupProperties const> rg_props,
288 const ReadHit* bh,
289 bool expand_by_partner)
290 {
291 assert (bh != NULL);
292
293 _leftmost = min(_leftmost, bh->left());
294 _ref_id = bh->ref_id();
295
296 if (bh->is_singleton() || no_read_pairs)
297 {
298 _rightmost = max(_rightmost, bh->right());
299 MateHit m(rg_props, bh->ref_id(), bh, NULL);
300 if (m.right() - m.left() > max_gene_length)
301 {
302 fprintf(stderr, "Warning: hit is longer than max_gene_length, skipping\n");
303 return false;
304 }
305 add_hit(m);
306 }
307 else
308 {
309 if (abs(bh->right() - bh->partner_pos()+1) > max_gene_length)
310 {
311 fprintf(stderr, "Warning: hit is longer than max_gene_length, skipping\n");
312 return false;
313 }
314 if (expand_by_partner)
315 _rightmost = max(max(_rightmost, bh->right()), bh->partner_pos()+1);
316
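// Mate pairing scheme: an open (half-complete) hit is filed under the key
// insert_id XOR expected_partner_position, so an arriving mate can look it
// up as insert_id XOR its own left position. The XOR key is cheap but not
// unique, which is why the loop below re-checks partner_pos() and
// insert_id() explicitly before accepting a match.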
317 uint64_t search_key = bh->insert_id() ^ bh->left();
318 std::pair<OpenMates::iterator, OpenMates::iterator> its = _open_mates.equal_range(search_key);
319
320 // Does this hit close an open mate?
321 bool found_partner = false;
322 for(OpenMates::iterator it = its.first; it != its.second; ++it) {
323
324 MateHit& pm = it->second;
325
326 if(pm.left_alignment()->partner_pos() != bh->left())
327 continue;
328
329 if(pm.insert_id() != bh->insert_id())
330 continue;
331
332 {
333
334 Scaffold L(MateHit(rg_props, bh->ref_id(), pm.left_alignment(), NULL));
335 Scaffold R(MateHit(rg_props, bh->ref_id(), bh, NULL));
336
337 bool strand_agree = L.strand() == CUFF_STRAND_UNKNOWN ||
338 R.strand() == CUFF_STRAND_UNKNOWN ||
339 L.strand() == R.strand();
340
341 //bool orientation_agree = pm.left_alignment()->antisense_align() != bh->antisense_align();
342
343 if (strand_agree &&
344 (!Scaffold::overlap_in_genome(L, R, olap_radius) ||
345 Scaffold::compatible(L,R)))
346 {
347 pm.right_alignment(bh);
348 add_hit(pm);
349 _open_mates.erase(it);
350 // Boost unordered_multimap ordinarily never shrinks.
351 // Compel it to do so if significantly underloaded.
352 if(_open_mates.size() > _rehash_size_threshold && _open_mates.load_factor() < _rehash_threshold) {
353 _open_mates.rehash(0);
354 if(_open_mates.load_factor() < _rehash_threshold) {
355 // It didn't shrink -- looks like this Boost implementation has a minimum
356 // size limit, or other reason to keep the table large. Don't keep rehashing with every remove op:
357 _rehash_size_threshold = _open_mates.size();
358 }
359 }
360 found_partner = true;
361 break;
362 }
363
364 }
365
366 }
367
368 if(!found_partner) {
369
370 // Add it to the list of open mates, unless we would
371 // already have seen its partner
372 if(bh->left() <= bh->partner_pos())
373 {
374 MateHit open_hit(rg_props,
375 bh->ref_id(),
376 bh,
377 NULL);
378
379 uint64_t insert_key = bh->insert_id() ^ bh->partner_pos();
380 _open_mates.insert(make_pair(insert_key, open_hit));
381 }
382 else
383 {
384 // This should never happen during hit_driven or ref_guided bundling, and in the case of
385 // ref_driven, this read clearly shouldn't map to any of the transcripts anyways.
386 // Adding this hit would cause problems with multi-reads that straddle boundaries after assembly.
387 // add_hit(MateHit(rg_props,bh->ref_id(), bh, NULL));
388 return false;
389 }
390
391 }
392
393 }
394 return true;
395 }
396
397 void HitBundle::collapse_hits()
398 {
399 ::collapse_hits(_hits, _non_redundant);
400 //_non_redundant = _hits;
401 }
402
403 void HitBundle::finalize_open_mates()
404 {
405 // We don't want to split reads across boundaries, since this would only occur
406 // in ref_driven mode and the read shouldn't map to any of the references in this case.
407
408 for(OpenMates::iterator itr = _open_mates.begin(); itr != _open_mates.end(); ++itr)
409 {
410 delete itr->second.left_alignment();
411 delete itr->second.right_alignment();
412 }
413 _open_mates.clear();
414 }
415
416 void HitBundle::remove_hitless_scaffolds()
417 {
418 vector<boost::shared_ptr<Scaffold> >::iterator new_end = remove_if(_ref_scaffs.begin(),
419 _ref_scaffs.end(),
420 HitlessScaffold());
421 _ref_scaffs.erase(new_end, _ref_scaffs.end());
422 }
423
424
425
426 void HitBundle::combine(const vector<HitBundle*>& in_bundles,
427 HitBundle& out_bundle)
428 {
429 out_bundle._hits.clear();
430 out_bundle._non_redundant.clear();
431 out_bundle._ref_scaffs.clear();
432
433 for (size_t i = 1; i < in_bundles.size(); ++i)
434 {
435 assert(in_bundles[i]->ref_id() == in_bundles[i-1]->ref_id());
436 }
437
438 // Merge hits
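// K-way merge: each input bundle's hit vector is already sorted by
// mate_hit_lt, so repeatedly taking the smallest head element across the
// inputs yields a globally sorted _hits vector for the combined bundle.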
439 vector<size_t> indices(in_bundles.size(),0);
440 while(true)
441 {
442 int next_bundle = -1;
443 const MateHit* next_hit=NULL;
444 for(size_t i = 0; i < in_bundles.size(); ++i)
445 {
446 const vector<MateHit>& curr_hits = in_bundles[i]->hits();
447
448 if (indices[i] == curr_hits.size())
449 continue;
450
451 const MateHit* curr_hit = &curr_hits[indices[i]];
452
453 if (next_bundle == -1 || mate_hit_lt(*curr_hit, *next_hit))
454 {
455 next_bundle = i;
456 next_hit = curr_hit;
457 }
458 }
459
460 if(next_bundle==-1)
461 break;
462
463 out_bundle._hits.push_back(*next_hit);
464 indices[next_bundle]++;
465 }
466
467 // Merge collapsed hits
468 indices = vector<size_t>(in_bundles.size(), 0);
469 while(true)
470 {
471 int next_bundle = -1;
472 const MateHit* next_hit = NULL;
473 for(size_t i = 0; i < in_bundles.size(); ++i)
474 {
475 const vector<MateHit>& curr_non_redundant_hits = in_bundles[i]->non_redundant_hits();
476
477 if (indices[i] == curr_non_redundant_hits.size())
478 continue;
479
480 const MateHit* curr_hit = &curr_non_redundant_hits[indices[i]];
481
482 if (next_bundle == -1 || mate_hit_lt(*curr_hit, *next_hit))
483 {
484 next_bundle = i;
485 next_hit = curr_hit;
486 }
487 }
488
489 if(next_bundle==-1)
490 break;
491
492 out_bundle._non_redundant.push_back(*next_hit);
493 indices[next_bundle]++;
494 }
495
496 for(size_t i = 0; i < in_bundles.size(); ++i)
497 {
498 for (size_t j = 0; j < in_bundles[i]->_ref_scaffs.size(); ++j)
499 {
500 in_bundles[i]->_ref_scaffs[j]->clear_hits();
501 }
502 }
503
504 /*
505 // Merge ref scaffolds
506 indices = vector<size_t>(in_bundles.size(), 0);
507 while(true)
508 {
509 int next_bundle = -1;
510 boost::shared_ptr<Scaffold> next_scaff;
511 for(size_t i = 0; i < in_bundles.size(); ++i)
512 {
513 const vector<boost::shared_ptr<Scaffold> >& curr_scaffs = in_bundles[i]->_ref_scaffs;
514
515 if (indices[i] == curr_scaffs.size())
516 continue;
517
518 boost::shared_ptr<Scaffold> curr_scaff = curr_scaffs[indices[i]];
519
520 if (next_bundle == -1 || scaff_lt_rt_oplt(*curr_scaff, *next_scaff))
521 {
522 next_bundle = i;
523 next_scaff = curr_scaff;
524 }
525 }
526
527 if(next_bundle==-1)
528 break;
529
530 if (out_bundle._ref_scaffs.size()==0 || out_bundle._ref_scaffs.back()->annotated_trans_id() != next_scaff->annotated_trans_id())
531 out_bundle.add_ref_scaffold(next_scaff);
532 indices[next_bundle]++;
533 }
534 */
535
536 for (size_t i = 0; i < in_bundles.size(); ++i)
537 {
538 for (size_t j = 0; j < in_bundles[i]->ref_scaffolds().size(); ++j)
539 {
540 out_bundle.add_ref_scaffold(in_bundles[i]->ref_scaffolds()[j]);
541 }
542 }
543
544 sort(out_bundle._ref_scaffs.begin(), out_bundle._ref_scaffs.end(), scaff_lt_rt_oplt_sp);
545 vector<boost::shared_ptr<Scaffold> >::iterator new_end = unique(out_bundle._ref_scaffs.begin(),
546 out_bundle._ref_scaffs.end(),
547 StructurallyEqualScaffolds());
548 out_bundle._ref_scaffs.erase(new_end, out_bundle._ref_scaffs.end());
549 vector<boost::shared_ptr<Scaffold> >(out_bundle._ref_scaffs).swap(out_bundle._ref_scaffs);
550
551 out_bundle.finalize(true); // true means everything is already sorted, etc.
552 out_bundle._num_replicates = (int)in_bundles.size();
553 }
554
555
556 void HitBundle::finalize(bool is_combined)
557 {
558 _final = true;
559 if (!is_combined)
560 {
561 // only perform read skipping on primary bundles
562 // (i.e. don't do it on bundles we're making by combining two or more other bundles)
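// Subsampling works by shuffling the hits, freeing the alignments in the
// tail, then truncating; because of the shuffle, every hit survives with
// equal probability.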
563 size_t num_skipped = _hits.size() * read_skip_fraction;
564 if (num_skipped > 0 && num_skipped < _hits.size())
565 {
566 random_shuffle(_hits.begin(), _hits.end());
567 for (int i = (int)_hits.size() - num_skipped; i >= 0 && i < (int)_hits.size(); ++i)
568 {
569 delete _hits[i].left_alignment();
570 _hits[i].left_alignment(NULL);
571
572 delete _hits[i].right_alignment();
573 _hits[i].right_alignment(NULL);
574 }
575 _hits.resize(_hits.size() - num_skipped);
576 is_combined = false;
577 }
578 else if (num_skipped >= _hits.size())
579 {
580 for (size_t i = 0; i < _hits.size(); ++i)
581 {
582 delete _hits[i].left_alignment();
583 delete _hits[i].right_alignment();
584 }
585 _hits.clear();
586 }
587
588 sort(_hits.begin(), _hits.end(), mate_hit_lt);
589 if (cond_prob_collapse)
590 {
591 collapse_hits();
592 }
593 else
594 {
595 BOOST_FOREACH (MateHit& hit, _hits)
596 {
597 hit.incr_collapse_mass(hit.internal_scale_mass());
598 }
599 _non_redundant = _hits;
600
601 }
602 sort(_ref_scaffs.begin(), _ref_scaffs.end(), scaff_lt_rt_oplt_sp);
603 vector<boost::shared_ptr<Scaffold> >::iterator new_end = unique(_ref_scaffs.begin(),
604 _ref_scaffs.end(),
605 StructurallyEqualScaffolds());
606 _ref_scaffs.erase(new_end, _ref_scaffs.end());
607 vector<boost::shared_ptr<Scaffold> >(_ref_scaffs).swap(_ref_scaffs);
608 }
609
610 for (size_t j = 0; j < _ref_scaffs.size(); ++j)
611 {
612 _ref_scaffs[j]->clear_hits();
613 }
614
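// Recompute the bundle's compatible mass: the sum of internal-scale masses
// over hits contained by (and structurally compatible with) at least one
// reference scaffold.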
615 _compatible_mass = 0.0;
616
617 for (size_t i = 0; i < _hits.size(); ++i)
618 {
619 MateHit& hit = _hits[i];
620
621 Scaffold hs(hit);
622
623 if (i >= 1)
624 {
625 assert (hit.ref_id() == _hits[i-1].ref_id());
626 }
627 hit.is_mapped(false);
628 for (size_t j = 0; j < _ref_scaffs.size(); ++j)
629 {
630 // add hit only adds if the hit is structurally compatible
631 if (_ref_scaffs[j]->contains(hs))
632 {
633 bool added = _ref_scaffs[j]->add_hit(&hit);
634 if (added)
635 hit.is_mapped(true);
636 }
637 }
638 if (hit.is_mapped())
639 {
640 _compatible_mass += hit.internal_scale_mass();
641 }
642 }
643
644 }
645
646 void print_sort_error(const char* last_chr_name,
647 int last_chr_pos,
648 const char* bh_name,
649 int bh_pos)
650 {
651 fprintf(stderr, "\nError: this SAM file doesn't appear to be correctly sorted!\n");
652 fprintf(stderr, "\tcurrent hit is at %s:%d, last one was at %s:%d\n",
653 bh_name,
654 bh_pos,
655 last_chr_name,
656 last_chr_pos);
657 fprintf(stderr, "Cufflinks requires that if your file has SQ records in\nthe SAM header that they appear in the same order as the chromosomes names \nin the alignments.\nIf there are no SQ records in the header, or if the header is missing,\nthe alignments must be sorted lexicographically by chromsome\nname and by position.\n \n");
658 }
659
660
661 double BundleFactory::next_valid_alignment(const ReadHit*& bh)
662 {
663 bh = NULL;
664
665 // Keep track of mass of hits we skip
666 double raw_mass = 0;
667
668 while (_hit_fac->records_remain())
669 {
670 ReadHit tmp;
671 if (!_hit_fac->next_hit(tmp))
672 continue;
673
674 if (tmp.ref_id() == 12638153115695167477) // corresponds to SAM "*" under FNV hash. unaligned read record
675 continue;
676
677 raw_mass += tmp.mass();
678
679 if (_hit_fac->ref_table().get_name(tmp.ref_id())==NULL) // unaligned read record (!?)
680 continue;
681
682 if (spans_bad_intron(tmp))
683 continue;
684
685 // Check for reads with no matching CIGAR entries. Generally such a read should have been rejected
686 // as unmapped, but such records have been seen in the wild. If they were allowed to stay they would cause
687 // trouble when converted to Scaffolds, yielding an invalid aug_ops vector.
688 const vector<CigarOp>& cig = tmp.cigar();
689 bool found_match = false;
690 for(vector<CigarOp>::const_iterator it = cig.begin(), itend = cig.end(); it != itend && !found_match; ++it)
691 if(it->opcode == MATCH)
692 found_match = true;
693
694 if(!found_match) {
695 fprintf(stderr, "Skipping hit with no Match operators in its CIGAR string\n");
696 continue;
697 }
698
699 int order = _hit_fac->ref_table().observation_order(tmp.ref_id());
700 if (_prev_pos != 0)
701 {
702 int prev_order = _hit_fac->ref_table().observation_order(_prev_ref_id);
703
704 if (prev_order > order || (prev_order == order && _prev_pos > tmp.left()))
705 {
706 const char* bh_chr_name = _hit_fac->ref_table().get_name(tmp.ref_id());
707 const char* last_bh_chr_name = _hit_fac->ref_table().get_name(_prev_ref_id);
708
709 print_sort_error(last_bh_chr_name,
710 _prev_pos,
711 bh_chr_name,
712 tmp.left());
713 exit(1);
714 }
715 }
716
717 _prev_ref_id = tmp.ref_id();
718 _prev_pos = tmp.left();
719
720 bool hit_within_mask = false;
721
722 // We want to skip stuff that overlaps masked GTF records, so
723 // sync up the masking chromosome
724 if (!mask_gtf_recs.empty() &&
725 next_mask_scaff != mask_gtf_recs.end() &&
726 (*next_mask_scaff)->ref_id() != tmp.ref_id())
727 {
728 bool found_scaff = false;
729 vector<boost::shared_ptr<Scaffold> >::iterator curr_mask_scaff = mask_gtf_recs.begin();
730 for (size_t i = 0; i < _mask_scaff_offsets.size(); ++i)
731 {
732 if (_mask_scaff_offsets[i].first == tmp.ref_id())
733 {
734 curr_mask_scaff = _mask_scaff_offsets[i].second;
735 found_scaff = true;
736 break;
737 }
738 }
739
740 next_mask_scaff = curr_mask_scaff;
741 }
742
743 //check that we aren't sitting in the middle of a masked scaffold
744 while (next_mask_scaff != mask_gtf_recs.end() &&
745 (*next_mask_scaff)->ref_id() == tmp.ref_id() &&
746 (*next_mask_scaff)->right() <= tmp.left())
747 {
748 if ((*next_mask_scaff)->left() >= tmp.left())
749 {
750 break;
751 }
752
753 next_mask_scaff++;
754 }
755
756 if (next_mask_scaff != mask_gtf_recs.end() &&
757 (*next_mask_scaff)->ref_id() == tmp.ref_id() &&
758 (*next_mask_scaff)->left() <= tmp.left() &&
759 (*next_mask_scaff)->right() >= tmp.right())
760 {
761 hit_within_mask = true;
762 }
763
764 if (hit_within_mask)
765 continue;
766
767 // if the user's asked for read trimming, do it here.
768 if (trim_read_length > 0)
769 {
770 tmp.trim(trim_read_length);
771 }
772
773 bh = new ReadHit(tmp);
774
775 break;
776 }
777
778 return raw_mass;
779 }
780
781 double BundleFactory::rewind_hit(const ReadHit* rh)
782 {
783 double mass = rh->mass();
784 delete rh;
785 _hit_fac->undo_hit();
786 return mass;
787 }
788
789 bool BundleFactory::next_bundle_hit_driven(HitBundle& bundle)
790 {
791 const ReadHit* bh = NULL;
792
793 bool skip_read = false;
794
795 while(bh == NULL)
796 {
797 if (!_hit_fac->records_remain())
798 {
799 return false;
800 }
801
802 // If we are randomly throwing out reads, check to see
803 // whether this one should be kept.
804 if (bundle.hits().size() >= max_frags_per_bundle)
805 {
806 skip_read = true;
807 next_valid_alignment(bh);
808 }
809 else
810 {
811 double raw_mass = next_valid_alignment(bh);
812 if (bh && bh->num_hits() > max_frag_multihits)
813 {
814 skip_read = true;
815 }
816 else
817 {
818 bundle.add_raw_mass(raw_mass);
819 }
820 }
821 }
822
823 if ((skip_read || !bundle.add_open_hit(read_group_properties(), bh)) && bh != NULL)
824 {
825 delete bh;
826 bh = NULL;
827 }
828 _expand_by_hits(bundle);
829
830 assert(bundle.left() != -1);
831 bundle.finalize_open_mates();
832 bundle.finalize();
833 assert(bundle.right() != -1);
834
835 return true;
836 }
837
838 bool BundleFactory::next_bundle_ref_driven(HitBundle& bundle)
839 {
840 if (next_ref_scaff == ref_mRNAs.end())
841 {
842 const ReadHit* bh = NULL;
843 while(_hit_fac->records_remain())
844 {
845 double raw_mass = next_valid_alignment(bh);
846 if (bundle.hits().size() < max_frags_per_bundle)
847 {
848 if (bh && bh->num_hits() > max_frag_multihits)
849 {
850
851 }
852 else
853 {
854 bundle.add_raw_mass(raw_mass);
855 }
856 if (bh) { delete bh; }
857 }
858 else
859 {
860 delete bh;
861 bh = NULL;
862 }
863
864 }
865 bundle.finalize();
866 return false;
867 }
868
869 bundle.add_ref_scaffold(*next_ref_scaff);
870
871 ++next_ref_scaff;
872
873 _expand_by_refs(bundle);
874
875 // for (size_t i = 0; i < bundle.ref_scaffolds().size(); ++i)
876 // {
877 // boost::shared_ptr<Scaffold> s = bundle.ref_scaffolds()[i];
878 // if (s->annotated_gene_id() == "ERCC-00002")
879 // {
880 // int a = 4;
881 // }
882 // }
883
884
885 // The most recent RefID and position we've seen in the hit stream
886 RefID last_hit_ref_id_seen = 0;
887 int last_hit_pos_seen = 0;
888
889 // include hits that lie within the bundle interval
890 while(true)
891 {
892 const ReadHit* bh = NULL;
893
894 bool skip_read = false;
895 // If we are randomly throwing out reads, check to see
896 // whether this one should be kept.
897 if (bundle.hits().size() >= max_frags_per_bundle)
898 {
899 next_valid_alignment(bh);
900 skip_read = true;
901 }
902 else
903 {
904 double raw_mass = next_valid_alignment(bh);
905 if (bh && bh->num_hits() > max_frag_multihits)
906 {
907 skip_read = true;
908 }
909 else
910 {
911 bundle.add_raw_mass(raw_mass);
912 }
913 }
914
915 if (bh == NULL)
916 {
917 if (_hit_fac->records_remain())
918 continue;
919 else
920 break;
921 }
922
923 last_hit_ref_id_seen = bh->ref_id();
924 last_hit_pos_seen = bh->left();
925
926 // test if the hit stream needs to catch up or has gone too far based on ref_id
927 if (bh->ref_id() != bundle.ref_id())
928 {
929 int bh_chr_order = _hit_fac->ref_table().observation_order(bh->ref_id());
930 int bundle_chr_order = _hit_fac->ref_table().observation_order(bundle.ref_id());
931
932 if (bh_chr_order < bundle_chr_order) // the hit stream has not caught up, skip
933 {
934 delete bh;
935 bh = NULL;
936 continue;
937 }
938 else // the hit stream has gone too far, rewind and break
939 {
940 rewind_hit(bh);
941 bh = NULL;
942 break;
943 }
944 }
945
946 if (bh == NULL) // the hit stream has gone too far, break
947 break;
948
949 if (bh->left() >= bundle.left() && bh->right() <= bundle.right())
950 {
951 if (skip_read)
952 {
953 delete bh;
954 bh = NULL;
955 }
956 else
957 {
958 if (!bundle.add_open_hit(read_group_properties(), bh, false))
959 {
960 delete bh;
961 bh = NULL;
962 }
963 }
964 }
965 else if (bh->left() >= bundle.right())
966 {
967 if (skip_read == false)
968 {
969 bundle.rem_raw_mass(rewind_hit(bh));
970 bh = NULL;
971 }
972 else
973 {
974 delete bh;
975 bh = NULL;
976 }
977 break;
978 }
979 else
980 {
981 // It's not within the bundle bounds, but it's also not past the
982 // right end, so skip it.
983 delete bh;
984 bh = NULL;
985 }
986
987 if (skip_read == true && bh != NULL)
988 {
989 delete bh;
990 bh = NULL;
991 }
992 }
993
994 assert(bundle.left() != -1);
995 bundle.finalize_open_mates();
996 bundle.finalize();
997 assert(bundle.right() != -1);
998
999 return true;
1000 }
1001
1002 // NOTE: does not yet support read skipping or max hits per bundle.
1003 bool BundleFactory::next_bundle_ref_guided(HitBundle& bundle)
1004 {
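// Ref-guided bundling interleaves the two simpler strategies: fall back to
// hit-driven bundling once the reference scaffolds are exhausted, to
// ref-driven bundling once the hit stream is exhausted, and otherwise use
// chromosome observation order to decide which stream is lagging.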
1005
1006 if (next_ref_scaff == ref_mRNAs.end())
1007 {
1008 return next_bundle_hit_driven(bundle);
1009 }
1010
1011 const ReadHit* bh = NULL;
1012 while(bh == NULL)
1013 {
1014 if (!_hit_fac->records_remain())
1015 {
1016 return next_bundle_ref_driven(bundle);
1017 }
1018 bundle.add_raw_mass(next_valid_alignment(bh));
1019 }
1020
1021 if (bh->ref_id() != (*next_ref_scaff)->ref_id())
1022 {
1023 int bh_chr_order = _hit_fac->ref_table().observation_order(bh->ref_id());
1024 int scaff_chr_order = _hit_fac->ref_table().observation_order((*next_ref_scaff)->ref_id());
1025
1026 bundle.rem_raw_mass(rewind_hit(bh));
1027 bh = NULL;
1028
1029 if (bh_chr_order < scaff_chr_order)
1030 {
1031 return next_bundle_hit_driven(bundle);
1032 }
1033 else
1034 {
1035 return next_bundle_ref_driven(bundle);
1036 }
1037 }
1038
1039 if (bh->left() < (*next_ref_scaff)->left())
1040 {
1041 if (!bundle.add_open_hit(read_group_properties(), bh))
1042 {
1043 delete bh;
1044 bh = NULL;
1045 }
1046 }
1047 else
1048 {
1049 bundle.rem_raw_mass(rewind_hit(bh));
1050 bh = NULL;
1051
1052 bundle.add_ref_scaffold(*next_ref_scaff);
1053 next_ref_scaff++;
1054 _expand_by_refs(bundle);
1055 }
1056
1057 while(_expand_by_hits(bundle) ||
1058 _expand_by_refs(bundle)) {}
1059
1060 assert(bundle.left() != -1);
1061 bundle.finalize_open_mates();
1062 bundle.finalize();
1063 assert(bundle.right() != -1);
1064
1065 return true;
1066 }
1067
1068 // expand the bundle interval as far as needed to include the overlapping
1069 // chain of reference transcripts that also overlap the initial bundle
1070 // interval
1071 bool BundleFactory::_expand_by_refs(HitBundle& bundle)
1072 {
1073 int initial_right = bundle.right();
1074 while(next_ref_scaff < ref_mRNAs.end())
1075 {
1076 assert(bundle.ref_id() != (*next_ref_scaff)->ref_id() || (*next_ref_scaff)->left() >= bundle.left());
1077 // if (*next_ref_scaff && (*next_ref_scaff)->annotated_gene_id() == "XLOC_009372")
1078 // {
1079 // int a = 5;
1080 // }
1081 if (bundle.ref_id() == (*next_ref_scaff)->ref_id()
1082 && overlap_in_genome((*next_ref_scaff)->left(),(*next_ref_scaff)->right(),bundle.left(), bundle.right()))
1083 {
1084 bundle.add_ref_scaffold(*next_ref_scaff);
1085 next_ref_scaff++;
1086 }
1087 else
1088 {
1089 break;
1090 }
1091 }
1092
1093
1094 return (bundle.right() > initial_right);
1095 }
1096
1097 // expand bundle by chaining overlapping hits
1098 bool BundleFactory::_expand_by_hits(HitBundle& bundle)
1099 {
1100 int initial_right = bundle.right();
1101 while(true)
1102 {
1103 bool skip_read = false;
1104 const ReadHit* bh = NULL;
1105
1106 double raw_mass = next_valid_alignment(bh);
1107 if (bh && bh->num_hits() > max_frag_multihits)
1108 {
1109 skip_read = true;
1110 }
1111 else
1112 {
1113 bundle.add_raw_mass(raw_mass);
1114 }
1115
1116 if (bh == NULL)
1117 {
1118 if (_hit_fac->records_remain())
1119 {
1120 continue;
1121 }
1122 else
1123 {
1124 break;
1125 }
1126 }
1127
1128 if (bh->ref_id() == bundle.ref_id() && bh->left() < bundle.right() + olap_radius)
1129 {
1130 if (skip_read || !bundle.add_open_hit(read_group_properties(), bh))
1131 {
1132 delete bh;
1133 bh = NULL;
1134 }
1135 }
1136 else
1137 {
1138 bundle.rem_raw_mass(rewind_hit(bh));
1139
1140 break;
1141 }
1142 }
1143
1144 return (bundle.right() > initial_right);
1145 }
1146
1147 bool BundleFactory::next_bundle(HitBundle& bundle, bool cache_bundle)
1148 {
1149 #if ENABLE_THREADS
1150 boost::mutex::scoped_lock lock(_factory_lock);
1151 #endif
1152 bool got_bundle = false;
1153 switch(_bundle_mode)
1154 {
1155 case HIT_DRIVEN:
1156 _curr_bundle++;
1157 got_bundle = next_bundle_hit_driven(bundle);
1158 bundle.id(_curr_bundle);
1159 break;
1160 case REF_DRIVEN:
1161 _curr_bundle++;
1162 got_bundle = next_bundle_ref_driven(bundle);
1163 bundle.id(_curr_bundle);
1164 break;
1165 case REF_GUIDED:
1166 _curr_bundle++;
1167 got_bundle = next_bundle_ref_guided(bundle);
1168 bundle.id(_curr_bundle);
1169 break;
1170 }
1171 return got_bundle;
1172 }
1173
1174
1175 struct IntronSpanCounter
1176 {
1177 IntronSpanCounter() : left_reads(0), little_reads(0), total_reads(0), multimap_reads(0), fwd_strand_frags(0) {}
1178 size_t left_reads;
1179 size_t little_reads; // small span overhang
1180 size_t total_reads;
1181 size_t multimap_reads;
1182 size_t fwd_strand_frags;
1183 vector<size_t> hist;
1184 };
1185
1186 typedef map<AugmentedCuffOp, IntronSpanCounter> IntronCountTable;
1187
1188 void count_introns_in_read(const ReadHit& read,
1189 IntronCountTable& intron_counts)
1190 {
1191 const vector<CigarOp>& cig = read.cigar();
1192
1193 int read_len = read.read_len();
1194 int small_anchor = (int)floor(read_len * small_anchor_fraction);
1195
1196 int r_left = 0;
1197 int g_left = read.left();
1198
1199 for (size_t i = 0; i < cig.size(); ++i)
1200 {
1201 assert(cig[i].length >= 0);
1202 switch(cig[i].opcode)
1203 {
1204 case MATCH:
1205 //ops.push_back(AugmentedCuffOp(CUFF_MATCH, g_left, cig[i].length));
1206 g_left += cig[i].length;
1207 r_left += cig[i].length;
1208 break;
1209
1210 case REF_SKIP:
1211 {
1212 AugmentedCuffOp intron(CUFF_INTRON, g_left, cig[i].length);
1213 pair<IntronCountTable::iterator, bool> ins_itr;
1214 ins_itr = intron_counts.insert(make_pair(intron, IntronSpanCounter()));
1215 IntronCountTable::iterator itr = ins_itr.first;
1216 itr->second.total_reads++;
1217
1218 if (read.num_hits() > 10)
1219 {
1220 itr->second.multimap_reads++;
1221 }
1222
1223 if ( r_left <= small_anchor || (read_len - r_left) < small_anchor)
1224 {
1225 itr->second.little_reads++;
1226 }
1227
1228 if (read.source_strand() == CUFF_FWD)
1229 {
1230 //itr->second.fwd_strand_frags;
1231 }
1232 else
1233 {
1234 assert(read.source_strand() == CUFF_REV);
1235 }
1236
1237
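// hist[r] counts junction reads whose splice sits r bases from the read's
// left end. If this read is longer than the current histogram, grow the
// histogram to the read length, padding half of the new bins on each side.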
1238 vector<size_t>& hist = itr->second.hist;
1239 if (hist.size() < (size_t)read_len)
1240 {
1241 size_t num_new_bins = read_len - hist.size();
1242 size_t new_left_bins = (size_t)floor(num_new_bins / 2.0);
1243 size_t new_right_bins = (size_t)ceil(num_new_bins / 2.0);
1244 hist.insert(hist.begin(), new_left_bins, 0);
1245 hist.insert(hist.end(), new_right_bins, 0);
1246 }
1247
1248 assert (r_left < hist.size());
1249 hist[r_left]++;
1250 //ops.push_back(AugmentedCuffOp(CUFF_INTRON, g_left, cig[i].length));
1251 g_left += cig[i].length;
1252 break;
1253 }
1254
1255 case SOFT_CLIP:
1256 g_left += cig[i].length;
1257 break;
1258 case HARD_CLIP:
1259 break;
1260 case INS:
1261 g_left -= cig[i].length;
1262 break;
1263 case DEL:
1264 g_left += cig[i].length;
1265 break;
1266 default:
1267 assert(false);
1268 break;
1269 }
1270 }
1271 }
1272
1273 void minor_introns(int bundle_length,
1274 int bundle_left,
1275 const IntronCountTable& intron_counts,
1276 vector<AugmentedCuffOp>& bad_introns,
1277 double fraction)
1278
1279 {
1280 for(IntronCountTable::const_iterator itr = intron_counts.begin();
1281 itr != intron_counts.end();
1282 ++itr)
1283 {
1284 pair<AugmentedCuffOp, IntronSpanCounter> itr_cnt_pair = *itr;
1285 const IntronSpanCounter itr_spans = itr_cnt_pair.second;
1286
1287 double doc = itr_spans.total_reads;
1288
1289 for (IntronCountTable::const_iterator itr2 = intron_counts.begin();
1290 itr2 != intron_counts.end();
1291 ++itr2)
1292 {
1293 if (itr == itr2 ||
1294 !AugmentedCuffOp::overlap_in_genome(itr->first, itr2->first))
1295 {
1296 continue;
1297 }
1298
1299 pair<AugmentedCuffOp, IntronSpanCounter> itr2_cnt_pair = *itr2;
1300 const IntronSpanCounter itr2_spans = itr2_cnt_pair.second;
1301
1302 double thresh = itr2_spans.total_reads * fraction;
1303 if (doc < thresh)
1304 {
1305 //#if verbose_msg
1306 // fprintf(stderr, "\t Filtering intron (due to overlap) %d - %d: %f thresh %f\n", itr->first.first, itr->first.second, doc, bundle_avg_thresh);
1307 //#endif
1308 bool exists = binary_search(bad_introns.begin(),
1309 bad_introns.end(),
1310 itr->first);
1311 if (!exists)
1312 {
1313 verbose_msg("Filtering intron %d-%d spanned by %lu reads based on overlap with much more abundant intron: %d-%d spanned by %lu reads\n",
1314 itr->first.g_left(),
1315 itr->first.g_right(),
1316 itr->second.total_reads,
1317 itr2->first.g_left(),
1318 itr2->first.g_right(),
1319 itr2->second.total_reads);
1320
1321 bad_introns.push_back(itr->first);
1322 sort(bad_introns.begin(), bad_introns.end());
1323 }
1324 }
1325
1326 // if ((itr->second.fwd_strand_frags == 0 &&
1327 // itr2->second.fwd_strand_frags != 0) ||
1328 // (itr2->second.fwd_strand_frags == 0 &&
1329 // itr->second.fwd_strand_frags != 0))
1330 // {
1331 // int itr1_L = itr->first.g_left();
1332 // int itr1_R = itr->first.g_right();
1333 // int itr2_L = itr2->first.g_left();
1334 // int itr2_R = itr2->first.g_right();
1335 //
1336 // if (abs(itr1_L - itr2_L) < 25 && abs(itr1_R - itr2_R) < 25)
1337 // {
1338 // int a = 3;
1339 // }
1340 // }
1341 }
1342 }
1343 }
1344
1345 void multimapping_introns(int bundle_length,
1346 int bundle_left,
1347 const IntronCountTable& intron_counts,
1348 vector<AugmentedCuffOp>& bad_introns,
1349 double fraction)
1350
1351 {
1352 for(IntronCountTable::const_iterator itr = intron_counts.begin();
1353 itr != intron_counts.end();
1354 ++itr)
1355 {
1356 pair<AugmentedCuffOp, IntronSpanCounter> itr_cnt_pair = *itr;
1357 const IntronSpanCounter itr_spans = itr_cnt_pair.second;
1358
1359 double doc = itr_spans.total_reads;
1360 double multi = itr_spans.multimap_reads;
1361
1362 double multi_fraction = multi / doc;
1363
1364 if (multi_fraction > fraction)
1365 {
1366 bool exists = binary_search(bad_introns.begin(),
1367 bad_introns.end(),
1368 itr->first);
1369 if (!exists)
1370 {
1371 verbose_msg("Filtering intron %d-%d spanned by %lu reads because %lg percent are multireads.\n",
1372 itr->first.g_left(),
1373 itr->first.g_right(),
1374 itr->second.total_reads,
1375 multi_fraction * 100);
1376
1377 bad_introns.push_back(itr->first);
1378 sort(bad_introns.begin(), bad_introns.end());
1379 }
1380 }
1381 }
1382 }
1383
1384
1385 void identify_bad_splices(const HitBundle& bundle,
1386 BadIntronTable& bad_splice_ops)
1387 {
1388 // Tracks, for each intron, how many reads span it and where the junction falls within each read
1389 IntronCountTable intron_counts;
1390
1391 RefID ref_id = bundle.ref_id();
1392
1393 pair<BadIntronTable::iterator, bool> ins_itr;
1394 ins_itr = bad_splice_ops.insert(make_pair(ref_id, vector<AugmentedCuffOp>()));
1395 vector<AugmentedCuffOp>& bad_introns = ins_itr.first->second;
1396
1397 BOOST_FOREACH (const MateHit& hit, bundle.hits())
1398 {
1399 if (hit.left_alignment())
1400 {
1401 count_introns_in_read(*hit.left_alignment(), intron_counts);
1402 }
1403 if (hit.right_alignment())
1404 {
1405 count_introns_in_read(*hit.right_alignment(), intron_counts);
1406 }
1407 }
1408
1409 minor_introns(bundle.length(), bundle.left(), intron_counts, bad_introns, min_isoform_fraction);
1410 // [Geo] disable filtering of multi-mapped introns:
1411 // multimapping_introns(bundle.length(), bundle.left(), intron_counts, bad_introns, 0.5);
1412 for (IntronCountTable::iterator itr = intron_counts.begin();
1413 itr != intron_counts.end();
1414 ++itr)
1415 {
1416 if (binary_search(bad_introns.begin(),
1417 bad_introns.end(),
1418 itr->first))
1419 {
1420 continue;
1421 }
1422 pair<AugmentedCuffOp, IntronSpanCounter> cnt_pair = *itr;
1423 try
1424 {
1425 const IntronSpanCounter spans = cnt_pair.second;
1426
1427 // binomial read_half_dist(spans.total_reads, success_fraction);
1428 // double left_side_p = cdf(read_half_dist, spans.total_reads - spans.left_reads);
1429 // double right_side_p = cdf(complement(read_half_dist, spans.left_reads));
1430
1431
1432 double success = 2 * small_anchor_fraction;
1433
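// Illustrative numbers (not the actual defaults): with
// small_anchor_fraction = 0.1, success = 0.2, so an unbiased junction
// spanned by 100 reads should yield about 20 small-overhang reads;
// right_side_p = P(X >= little_reads) under Binomial(100, 0.2) then
// measures how surprising the observed count is.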
1434 binomial read_half_dist(spans.total_reads, success);
1435 double right_side_p;
1436
1437 // right_side_p describes the chance that we'd observe at least
1438 // this many small overhang reads by chance with an unbiased
1439 // distribution over a normal (i.e. non-artifact) junction
1440 if (spans.little_reads > 0)
1441 {
1442 right_side_p = 1.0 - cdf(read_half_dist, spans.little_reads - 1);
1443 }
1444 else
1445 {
1446 right_side_p = 1.0;
1447 }
1448
1449 double left_side_p = 0;
1450 double expected = success * spans.total_reads;
1451
1452 //double excess = spans.little_reads - expected;
1453
1454 // left_side_p describes the chance that we'd observe this few or
1455 // fewer small overhang reads by chance with an unbiased
1456 // distribution over a normal (i.e. non-artifact) junction
1457 if (spans.little_reads > 0)
1458 {
1459 left_side_p = cdf(read_half_dist, spans.little_reads);
1460 }
1461 else
1462 {
1463 left_side_p = cdf(read_half_dist, 0);
1464 }
1465
1466 //double alpha = 0.05;
1467 //double right_side_p = 0;
1468
1469 // Two-tailed binomial test:
1470 // if (left_side_p < (binomial_junc_filter_alpha / 2.0) ||
1471 // right_side_p < (binomial_junc_filter_alpha / 2.0))
1472 // One-tailed binomial test
1473
1474 bool filtered = false;
1475
1476 const IntronSpanCounter& counter = itr->second;
1477
1478 if (right_side_p < (binomial_junc_filter_alpha))
1479 {
1480 double overhang_ratio = counter.little_reads / (double) counter.total_reads;
1481 if (counter.total_reads < 100 || overhang_ratio >= 0.50)
1482 {
1483 verbose_msg("Filtering intron %d-%d spanned by %lu reads (%lu low overhang, %lg expected) left P = %lg, right P = %lg\n",
1484 itr->first.g_left(),
1485 itr->first.g_right(),
1486 itr->second.total_reads,
1487 itr->second.little_reads,
1488 expected,
1489 left_side_p,
1490 right_side_p);
1491 filtered = true;
1492
1493 bool exists = binary_search(bad_introns.begin(),
1494 bad_introns.end(),
1495 itr->first);
1496 if (!exists)
1497 {
1498 bad_introns.push_back(itr->first);
1499 sort(bad_introns.begin(), bad_introns.end());
1500 }
1501 }
1502 }
1503
1504 vector<size_t> hist = itr->second.hist;
1505 if (itr->second.total_reads > 1000)
1506 {
1507 sort(hist.begin(), hist.end());
1508 size_t median = (size_t)floor(hist.size() / 2);
1509 if (median < hist.size() && hist[median] == 0)
1510 {
1511 verbose_msg("Filtering intron %d-%d spanned by %lu reads (%lu low overhang, %lg expected) left P = %lg, right P = %lg\n",
1512 itr->first.g_left(),
1513 itr->first.g_right(),
1514 itr->second.total_reads,
1515 itr->second.little_reads,
1516 expected,
1517 left_side_p,
1518 right_side_p);
1519
1520 filtered = true;
1521
1522 bool exists = binary_search(bad_introns.begin(),
1523 bad_introns.end(),
1524 itr->first);
1525 if (!exists)
1526 {
1527 bad_introns.push_back(itr->first);
1528 sort(bad_introns.begin(), bad_introns.end());
1529 }
1530 }
1531 }
1532
1533 if (!filtered)
1534 {
1535 verbose_msg("Accepting intron %d-%d spanned by %lu reads (%lu low overhang, %lg expected) left P = %lg, right P = %lg\n",
1536 itr->first.g_left(),
1537 itr->first.g_right(),
1538 itr->second.total_reads,
1539 itr->second.little_reads,
1540 expected,
1541 left_side_p,
1542 right_side_p);
1543
1544 }
1545 }
1546
1547
1548 catch(const std::exception& e)
1549 {
1550 //
1551 /*
1552 It is always essential to include try & catch blocks here because
1553 the default Boost.Math policies are to throw exceptions on
1554 arguments that are out of domain or that cause errors like
1555 numeric overflow.
1556
1557 Lacking try & catch blocks, the program will abort, whereas the
1558 message below from the thrown exception will give some helpful
1559 clues as to the cause of the problem.
1560 */
1561 std::cout <<
1562 "\n""Message from thrown exception was:\n " << e.what() << std::endl;
1563 }
1564
1565 }
1566 }
1567
1568 bool BundleFactory::spans_bad_intron(const ReadHit& read)
1569 {
1570
1571 const vector<CigarOp>& cig = read.cigar();
1572
1573 size_t g_left = read.left();
1574 BadIntronTable::const_iterator itr = _bad_introns.find(read.ref_id());
1575 if (itr == _bad_introns.end())
1576 return false;
1577
1578 const vector<AugmentedCuffOp>& bi = itr->second;
1579 for (size_t i = 0; i < cig.size(); ++i)
1580 {
1581 assert(cig[i].length >= 0);
1582 switch(cig[i].opcode)
1583 {
1584 case MATCH:
1585 //ops.push_back(AugmentedCuffOp(CUFF_MATCH, g_left, cig[i].length));
1586 g_left += cig[i].length;
1587 break;
1588
1589 case REF_SKIP:
1590 {
1591 AugmentedCuffOp intron(CUFF_INTRON, g_left, cig[i].length);
1592 if (binary_search(bi.begin(), bi.end(), intron))
1593 {
1594 return true;
1595 }
1596
1597 //ops.push_back(AugmentedCuffOp(CUFF_INTRON, g_left, cig[i].length));
1598 g_left += cig[i].length;
1599 break;
1600 }
1601
1602 case SOFT_CLIP:
1603 g_left += cig[i].length;
1604 break;
1605
1606 case HARD_CLIP:
1607 break;
1608 case INS:
1609 g_left -= cig[i].length;
1610 break;
1611 case DEL:
1612 g_left += cig[i].length;
1613 break;
1614 default:
1615 assert(false);
1616 break;
1617 }
1618 }
1619
1620 return false;
1621 }
1622
1623 void inspect_map(boost::shared_ptr<BundleFactory> bundle_factory,
1624 BadIntronTable* bad_introns,
1625 vector<LocusCount>& compatible_count_table,
1626 vector<LocusCount>& total_count_table,
1627 IdToLocusMap& id_to_locus_map,
1628 bool progress_bar,
1629 bool show_stats)
1630 {
1631
1632 ProgressBar p_bar;
1633 if (progress_bar)
1634 p_bar = ProgressBar("Inspecting reads and determining fragment length distribution.",bundle_factory->ref_table().size());
1635 RefID last_chrom = 0;
1636
1637 long double map_mass = 0.0;
1638 long double norm_map_mass = 0.0;
1639
1640 int min_len = numeric_limits<int>::max();
1641 int max_len = def_max_frag_len;
1642 vector<double> frag_len_hist(def_max_frag_len+1,0);
1643 bool has_pairs = false;
1644
1645 int num_bundles = 0;
1646 size_t total_hits = 0;
1647 size_t total_non_redundant_hits = 0;
1648
1649 //To be used for quartile normalization
1650 vector<long double> mass_dist;
1651
1652 // Store the maximum read length for "first" and "second" reads to report to user.
1653 int max_1 = 0;
1654 int max_2 = 0;
1655
1656 boost::shared_ptr<MultiReadTable> mrt(new MultiReadTable());
1657
1658 while(true)
1659 {
1660 HitBundle* bundle_ptr = new HitBundle();
1661
1662 bool valid_bundle = bundle_factory->next_bundle(*bundle_ptr, false);
1663 HitBundle& bundle = *bundle_ptr;
1664
1665 if (use_compat_mass) //only count hits that are compatible with ref transcripts
1666 {
1667 // Take raw mass even if bundle is "empty", since we could be out of refs
1668 // with remaining hits
1669 map_mass += bundle.compatible_mass();
1670 // if (lib_norm_method == QUARTILE && bundle.compatible_mass() > 0)
1671 // {
1672 // mass_dist.push_back(bundle.compatible_mass());
1673 // }
1674 }
1675 else if (use_total_mass) //use all raw mass
1676 {
1677
1678 // Take raw mass even if bundle is "empty", since we could be out of refs
1679 // with remaining hits
1680 map_mass += bundle.raw_mass();
1681 // if (lib_norm_method == QUARTILE && bundle.raw_mass() > 0)
1682 // {
1683 // mass_dist.push_back(bundle.raw_mass());
1684 // }
1685 }
1686 else
1687 {
1688 fprintf(stderr, "Error: hit counting scheme for normalization is not set!\n");
1689 assert(false);
1690 exit(1);
1691 }
1692
1693 const RefSequenceTable& rt = bundle_factory->ref_table();
1694 const char* chrom = rt.get_name(bundle.ref_id());
1695 char bundle_label_buf[2048];
1696 if (chrom)
1697 {
1698 sprintf(bundle_label_buf, "%s:%d-%d", chrom, bundle.left(), bundle.right());
1699 verbose_msg("Inspecting bundle %s with %lu reads\n", bundle_label_buf, bundle.hits().size());
1700
1701 vector<string> gene_ids;
1702 vector<string> gene_short_names;
1703 BOOST_FOREACH(boost::shared_ptr<Scaffold> s, bundle.ref_scaffolds())
1704 {
1705 if (s->annotated_gene_id() != "")
1706 gene_ids.push_back(s->annotated_gene_id());
1707 if (s->annotated_gene_name() != "")
1708 gene_short_names.push_back(s->annotated_gene_name());
1709
1710 // if (s->annotated_gene_id() == "ENSG00000268467.1")
1711 // {
1712 // int a = 4;
1713 // }
1714
1715 }
1716 compatible_count_table.push_back(LocusCount(bundle_label_buf, floor(bundle.compatible_mass()), bundle.ref_scaffolds().size(), gene_ids, gene_short_names));
1717 total_count_table.push_back(LocusCount(bundle_label_buf, floor(bundle.raw_mass()), bundle.ref_scaffolds().size(), gene_ids, gene_short_names));
1718 }
1719
1720 if (!valid_bundle)
1721 {
1722 delete bundle_ptr;
1723 break;
1724 }
1725 num_bundles++;
1726
1727 BOOST_FOREACH(boost::shared_ptr<Scaffold> s, bundle.ref_scaffolds()){
1728 id_to_locus_map.register_locus_to_id(s->annotated_trans_id(), bundle_label_buf);
1729 id_to_locus_map.register_locus_to_id(s->annotated_gene_id(), bundle_label_buf);
1730
1731 if (s->annotated_tss_id().empty() == false)
1732 {
1733 id_to_locus_map.register_locus_to_id(s->annotated_tss_id(), bundle_label_buf);
1734 }
1735
1736 if (s->annotated_protein_id().empty() == false)
1737 {
1738 id_to_locus_map.register_locus_to_id(s->annotated_protein_id(), bundle_label_buf);
1739 }
1740 }
1741
1742 if (progress_bar)
1743 {
1744 double inc_amt = last_chrom == bundle.ref_id() ? 0.0 : 1.0;
1745 p_bar.update(bundle_label_buf, inc_amt);
1746 last_chrom = bundle.ref_id();
1747 }
1748
1749 if (bad_introns != NULL)
1750 {
1751 identify_bad_splices(bundle, *bad_introns);
1752 }
1753
1754 const vector<MateHit>& hits = bundle.non_redundant_hits();
1755 if (hits.empty())
1756 {
1757 delete bundle_ptr;
1758 continue;
1759 }
1760
1761 list<pair<int, int> > open_ranges;
1762 int curr_range_start = hits[0].left();
1763 int curr_range_end = numeric_limits<int>::max();
1764 int next_range_start = -1;
1765
1766 total_non_redundant_hits += bundle.non_redundant_hits().size();
1767 total_hits += bundle.hits().size();
1768
1769 // This first loop calculates the map mass and finds ranges with no introns
1770 // Note that we are actually looking at non-redundant hits, which is why we use collapse_mass
1771 // This loop will also add multi-reads to the MultiReads table
1772 for (size_t i = 0; i < hits.size(); ++i)
1773 {
1774 assert(hits[i].left_alignment());
1775
1776 // Add to table if multi-read
1777 if (hits[i].is_multi())
1778 {
1779 mrt->add_hit(hits[i]);
1780 }
1781
1782 // Find left length
1783 int left_len = hits[i].left_alignment()->right()-hits[i].left_alignment()->left();
1784 min_len = min(min_len, left_len);
1785 if (!hits[i].left_alignment()->contains_splice())
1786 {
1787 if (hits[i].left_alignment()->is_first())
1788 max_1 = max(max_1, left_len);
1789 else
1790 max_2 = max(max_2, left_len);
1791 }
1792
1793 // Find right length
1794 if (hits[i].right_alignment())
1795 {
1796 int right_len = hits[i].right_alignment()->right()-hits[i].right_alignment()->left();
1797 min_len = min(min_len, right_len);
1798 if (!hits[i].right_alignment()->contains_splice())
1799 {
1800 if (hits[i].right_alignment()->is_first())
1801 max_1 = max(max_1, right_len);
1802 else
1803 max_2 = max(max_2, right_len);
1804 }
1805 has_pairs = true;
1806 }
1807
1808 // Find fragment length
1809 if (bundle.ref_scaffolds().size()==1 && hits[i].is_pair())
1810 // Annotation provided and single isoform gene
1811 {
1812 int start, end, mate_length;
1813 boost::shared_ptr<Scaffold> scaff = bundle.ref_scaffolds()[0];
1814 if (scaff->map_frag(hits[i], start, end, mate_length))
1815 {
1816 if (mate_length >= min_len && mate_length <= max_len)
1817 frag_len_hist[mate_length] += hits[i].collapse_mass();
1818 }
1819 }
1820 else if (bundle.ref_scaffolds().empty())
1821 // No annotation provided. Look for ranges.
1822 {
1823 if (hits[i].left() > curr_range_end)
1824 {
1825 if (curr_range_end - curr_range_start > max_len)
1826 open_ranges.push_back(make_pair(curr_range_start, curr_range_end));
1827 curr_range_start = next_range_start;
1828 curr_range_end = numeric_limits<int>::max();
1829 }
1830 if (hits[i].left_alignment()->contains_splice())
1831 {
1832 if (hits[i].left() - curr_range_start > max_len)
1833 open_ranges.push_back(make_pair(curr_range_start, hits[i].left()-1));
1834 curr_range_start = max(next_range_start, hits[i].left_alignment()->right());
1835 }
1836 if (hits[i].right_alignment() && hits[i].right_alignment()->contains_splice())
1837 {
1838 assert(hits[i].right_alignment()->left() >= hits[i].left());
1839 curr_range_end = min(curr_range_end, hits[i].right_alignment()->left()-1);
1840 next_range_start = max(next_range_start, hits[i].right());
1841 }
1842 }
1843 }
1844
1845 if (bundle.ref_scaffolds().empty() && has_pairs) // No annotation provided
1846 {
1847 pair<int, int> curr_range(-1,-1);
1848
1849 // This second loop uses the ranges found above to find the estimated frag length distribution
1850 // It also finds the minimum read length to use in the linear interpolation
1851 for (size_t i = 0; i < hits.size(); ++i)
1852 {
1853 if (hits[i].left() > curr_range.second && open_ranges.empty())
1854 break;
1855
1856 if (hits[i].left() > curr_range.second)
1857 {
1858 curr_range = open_ranges.front();
1859 open_ranges.pop_front();
1860 }
1861
1862 if (hits[i].left() >= curr_range.first && hits[i].right() <= curr_range.second && hits[i].is_pair())
1863 {
1864 int mate_len = hits[i].right()-hits[i].left();
1865 if (mate_len <= max_len)
1866 frag_len_hist[mate_len] += hits[i].collapse_mass();
1867 }
1868 }
1869 }
1870
1871 open_ranges.clear();
1872 delete bundle_ptr;
1873 }
1874
1875 norm_map_mass = map_mass;
1876
1877 // if (lib_norm_method == QUARTILE && mass_dist.size() > 0)
1878 // {
1879 // sort(mass_dist.begin(),mass_dist.end());
1880 // int upper_quart_index = mass_dist.size() * 0.75;
1881 // norm_map_mass = mass_dist[upper_quart_index];
1882 // }
1883
1884 if (bad_introns != NULL)
1885 {
1886 size_t alloced = 0;
1887 size_t used = 0;
1888 size_t num_introns = 0;
1889 for (BadIntronTable::const_iterator itr = bad_introns->begin();
1890 itr != bad_introns->end();
1891 ++itr)
1892 {
1893 alloced += itr->second.capacity() * sizeof(AugmentedCuffOp);
1894 used += itr->second.size() * sizeof(AugmentedCuffOp);
1895 num_introns += itr->second.size();
1896 }
1897
1898 verbose_msg( "Bad intron table has %lu introns: (%lu alloc'd, %lu used)\n", num_introns, alloced, used);
1899 verbose_msg( "Map has %lu hits, %lu are non-redundant\n", total_hits, total_non_redundant_hits);
1900 }
1901
1902 if (progress_bar)
1903 p_bar.complete();
1904
1905 vector<double> frag_len_pdf(max_len+1, 0.0);
1906 vector<double> frag_len_cdf(max_len+1, 0.0);
1907 long double tot_count = accumulate(frag_len_hist.begin(), frag_len_hist.end(), 0.0 );
1908 bool empirical = false;
1909
1910 if (user_provided_fld && has_pairs && tot_count >= 10000)
1911 {
1912 fprintf(stderr, "Warning: Overriding empirical fragment length distribution with user-specified parameters is not recommended.\n");
1913 }
1914
1915 if (!has_pairs || tot_count < 10000)
1916 {
1917 if (has_pairs && !user_provided_fld)
1918 {
1919 fprintf(stderr, "Warning: Using default Gaussian distribution due to insufficient paired-end reads in open ranges. It is recommended that correct parameters (--frag-len-mean and --frag-len-std-dev) be provided.\n");
1920 }
1921 tot_count = 0;
1922 normal frag_len_norm(def_frag_len_mean, def_frag_len_std_dev);
1923 max_len = def_frag_len_mean + 3*def_frag_len_std_dev;
1924 for(int i = min_len; i <= max_len; i++)
1925 {
1926 frag_len_hist[i] = cdf(frag_len_norm, i+0.5)-cdf(frag_len_norm, i-0.5);
1927 tot_count += frag_len_hist[i];
1928 }
1929 }
1930 else
1931 // Calculate the max frag length and interpolate all zeros between min read len and max frag len
1932 {
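// Gaps between observed lengths are filled by linear interpolation:
// between nonzero bins at positions last_nonzero and i, bin
// last_nonzero + x gets m*x + b, where b = hist[last_nonzero] and
// m = (hist[i] - b) / (i - last_nonzero). The scan stops once 99.99%
// of the total mass has been covered, which sets max_len.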
1933 empirical = true;
1934 double curr_total = 0;
1935 size_t last_nonzero = min_len-1;
1936 for(size_t i = last_nonzero+1; i < frag_len_hist.size(); i++)
1937 {
1938 if (frag_len_hist[i] > 0)
1939 {
1940 if (last_nonzero != i-1)
1941 {
1942 double b = frag_len_hist[last_nonzero];
1943 double m = (frag_len_hist[i] - b)/(i-last_nonzero);
1944 for (size_t x = 1; x < i - last_nonzero; x++)
1945 {
1946 frag_len_hist[last_nonzero+x] = m * x + b;
1947 tot_count += frag_len_hist[last_nonzero+x];
1948 curr_total += frag_len_hist[last_nonzero+x];
1949 }
1950 }
1951 last_nonzero = i;
1952 }
1953
1954 curr_total += frag_len_hist[i];
1955
1956 if (curr_total/tot_count > 0.9999)
1957 {
1958 max_len = i;
1959 tot_count = curr_total;
1960 break;
1961 }
1962 }
1963 }
1964
1965 double mean = 0.0;
1966
1967 if (output_fld)
1968 {
1969 FILE* fhist = fopen(string(output_dir + "/frag_len_hist.csv").c_str(),"w");
1970 fprintf(fhist, "Length,Count\n");
1971 for(size_t i = 1; i < frag_len_hist.size(); i++)
1972 {
1973 fprintf(fhist, "%zu,%f\n", i, frag_len_hist[i]);
1974 }
1975 fclose(fhist);
1976 }
1977
1978 // Convert histogram to pdf and cdf, calculate mean
1979 int frag_len_mode = 0;
1980 for(size_t i = min_len; i <= (size_t)max_len; i++)
1981 {
1982 frag_len_pdf[i] = frag_len_hist[i]/tot_count;
1983 frag_len_cdf[i] = frag_len_cdf[i-1] + frag_len_pdf[i];
1984
1985 if (frag_len_pdf[i] > frag_len_pdf[frag_len_mode])
1986 frag_len_mode = i;
1987 mean += frag_len_pdf[i] * i;
1988 }
1989
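// Standard deviation of the empirical fragment length distribution:
// sqrt( sum_i p(i) * (i - mean)^2 ).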
1990 double std_dev = 0.0;
1991 for(size_t i = 1; i < frag_len_hist.size(); i++)
1992 {
1993 std_dev += frag_len_pdf[i] * ((i - mean) * (i - mean));
1994 }
1995
1996 std_dev = sqrt(std_dev);
1997
1998 boost::shared_ptr<ReadGroupProperties> rg_props = bundle_factory->read_group_properties();
1999
2000 FLDSource source = DEFAULT;
2001 if (empirical)
2002 {
2003 source = LEARNED;
2004 }
2005 else if (user_provided_fld)
2006 {
2007 source = USER;
2008 }
2009
2010 boost::shared_ptr<EmpDist const> fld(new EmpDist(frag_len_pdf, frag_len_cdf, frag_len_mode, mean, std_dev, min_len, max_len, source));
2011 rg_props->multi_read_table(mrt);
2012 rg_props->frag_len_dist(fld);
2013 rg_props->normalized_map_mass(norm_map_mass);
2014 rg_props->total_map_mass(map_mass);
2015
2016 if (show_stats)
2017 {
2018 fprintf(stderr, "> Map Properties:\n");
2019 //if (lib_norm_method == QUARTILE)
2020 // fprintf(stderr, ">\tUpper Quartile: %.2Lf\n", norm_map_mass);
2021 fprintf(stderr, ">\tNormalized Map Mass: %.2Lf\n", norm_map_mass);
2022 fprintf(stderr, ">\tRaw Map Mass: %.2Lf\n", map_mass);
2023 if (corr_multi)
2024 fprintf(stderr,">\tNumber of Multi-Reads: %zu (with %zu total hits)\n", mrt->num_multireads(), mrt->num_multihits());
2025 // if (has_pairs)
2026 // fprintf(stderr, ">\tRead Type: %dbp x %dbp\n", max_1, max_2);
2027 // else
2028 // fprintf(stderr, ">\tRead Type: %dbp single-end\n", max(max_1,max_2));
2029
2030 if (empirical)
2031 {
2032 fprintf(stderr, ">\tFragment Length Distribution: Empirical (learned)\n");
2033 fprintf(stderr, ">\t Estimated Mean: %.2f\n", mean);
2034 fprintf(stderr, ">\t Estimated Std Dev: %.2f\n", std_dev);
2035 }
2036 else
2037 {
2038 if (user_provided_fld)
2039 {
2040 fprintf(stderr, ">\tFragment Length Distribution: Truncated Gaussian (user-specified)\n");
2041 }
2042 else
2043 {
2044 fprintf(stderr, ">\tFragment Length Distribution: Truncated Gaussian (default)\n");
2045 }
2046 fprintf(stderr, ">\t Default Mean: %d\n", def_frag_len_mean);
2047 fprintf(stderr, ">\t Default Std Dev: %d\n", def_frag_len_std_dev);
2048 }
2049 }
2050 bundle_factory->num_bundles(num_bundles);
2051 bundle_factory->reset();
2052 return;
2053 }
2054
2055 //////////////////////
2056
2057
2058 bool PrecomputedExpressionBundleFactory::next_bundle(HitBundle& bundle, bool cache_bundle)
2059 {
2060 #if ENABLE_THREADS
2061 boost::mutex::scoped_lock lock(_factory_lock);
2062 #endif
2063 bool got_bundle = BundleFactory::next_bundle(bundle, cache_bundle);
2064 if (got_bundle)
2065 {
2066 RefSequenceTable& rt = ref_table();
2067
2068 char bundle_label_buf[2048];
2069 sprintf(bundle_label_buf, "%s:%d-%d", rt.get_name(bundle.ref_id()), bundle.left(), bundle.right());
2070
2071 boost::shared_ptr<const AbundanceGroup> ab = _hit_fac->next_locus(bundle.id(), cache_bundle);
2072 if (ab)
2073 {
2074 double compatible_mass = _hit_fac->get_compat_mass(bundle.id());
2075 double total_mass = _hit_fac->get_total_mass(bundle.id());
2076
2077 /*
2078 double compatible_mass = _hit_fac->get_compat_mass(bundle_label_buf);
2079 double total_mass = _hit_fac->get_total_mass(bundle_label_buf);
2080 */
2081
2082 bundle.finalize();
2083 bundle.add_raw_mass(total_mass);
2084 bundle.compatible_mass(compatible_mass);
2085
2086 //fprintf (stderr, "Reconstituting bundle %s (%d) with mass %lf\n", bundle_label_buf, bundle.id(), compatible_mass);
2087 if (bundle.ref_scaffolds().size() != ab->abundances().size())
2088 {
2089 fprintf (stderr, "Error in file %s: reconstituted expression bundle %s (%lu transcripts) does not match GTF (%lu transcripts):\n", read_group_properties()->file_path().c_str(), bundle_label_buf, ab->abundances().size(), bundle.ref_scaffolds().size());
2090 fprintf(stderr, "Reconstituted:\n");
2091 for (size_t i = 0; i < ab->abundances().size(); ++i)
2092 {
2093 fprintf(stderr, "%s\n", ab->abundances()[i]->description().c_str());
2094 }
2095 fprintf(stderr, "GTF:\n");
2096 for (size_t i = 0; i < bundle.ref_scaffolds().size(); ++i)
2097 {
2098 fprintf(stderr, "%s\n", bundle.ref_scaffolds()[i]->annotated_trans_id().c_str());
2099 }
2100 exit(1);
2101 }
2102 }
2103 else
2104 {
2105 fprintf (stderr, "Error: no abundance info for locus %s\n", bundle_label_buf);
2106 }
2107
2108 }
2109 return got_bundle;
2110 }
2111
2112 boost::shared_ptr<const AbundanceGroup> PrecomputedExpressionBundleFactory::get_abundance_for_locus(int locus_id)
2113 {
2114 #if ENABLE_THREADS
2115 boost::mutex::scoped_lock lock(_factory_lock);
2116 #endif
2117 return _hit_fac->get_abundance_for_locus(locus_id);
2118 }
2119
2120 void PrecomputedExpressionBundleFactory::clear_abundance_for_locus(int locus_id)
2121 {
2122 #if ENABLE_THREADS
2123 boost::mutex::scoped_lock lock(_factory_lock);
2124 #endif
2125 _hit_fac->clear_abundance_for_locus(locus_id);
2126 }
2127