1 /*
2 * filters.cpp
3 * cufflinks
4 *
5 * Created by Cole Trapnell on 10/27/09.
6 * Copyright 2009 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #include "filters.h"
11 #include <algorithm>
12 #include <numeric>
13 #include <boost/graph/adjacency_list.hpp>
14 #include <boost/graph/depth_first_search.hpp>
15 #include <boost/graph/visitors.hpp>
16 #include <boost/graph/graph_traits.hpp>
17 #include <boost/graph/connected_components.hpp>
18
19 using namespace std;
20 using namespace boost;
21
// Removes likely-spurious spliced alignments from a bundle.  An intron's
// support is its depth of coverage (doc); hits whose introns are covered
// well below the bundle-wide (or major-isoform intron) average are tossed.
//
// @param bundle_length            length of the bundle in genomic bases
// @param bundle_left              genomic coordinate of the bundle's left edge
// @param hits                     scaffolds in the bundle; filtered in place
// @param fraction                 minimum doc, as a fraction of the average,
//                                 for an intron to be kept
// @param filter_on_intron_overlap also toss an intron when it overlaps a much
//                                 better-supported intron
// @param filter_with_intron_doc   threshold against the major-isoform intron
//                                 doc instead of the whole-bundle average doc
void filter_introns(int bundle_length,
                    int bundle_left,
                    vector<Scaffold>& hits,
                    double fraction,
                    bool filter_on_intron_overlap,
                    bool filter_with_intron_doc)
{
    vector<float> depth_of_coverage(bundle_length,0);
    vector<double> scaff_doc;
    map<pair<int,int>, float> intron_doc;
    vector<Scaffold> filtered_hits;
    vector<bool> toss(hits.size(), false);

    // Per-base coverage and per-intron junction doc over the whole bundle.
    double bundle_avg_doc = compute_doc(bundle_left,
                                        hits,
                                        depth_of_coverage,
                                        intron_doc,
                                        false);

    double bundle_avg_thresh = bundle_avg_doc * fraction;

    if (filter_with_intron_doc && !intron_doc.empty())
    {
        // Use the dominant isoform's intron doc as the reference level.
        bundle_avg_doc = major_isoform_intron_doc(intron_doc);
        bundle_avg_thresh = fraction * bundle_avg_doc;
        verbose_msg("\tFiltering bundle introns, avg (intron) doc = %lf, thresh = %f\n", bundle_avg_doc, bundle_avg_thresh);
    }
    else
    {
        verbose_msg("\tFiltering bundle introns, avg bundle doc = %lf, thresh = %f\n", bundle_avg_doc, bundle_avg_thresh);
    }

    // NOTE(review): the outer iterator is immediately shadowed by the inner
    // 'itr' declared below, so this outer loop repeats the identical per-hit
    // scan once per distinct intron; the outcome is fixed after one pass.
    for(map<pair<int, int>, float>::const_iterator itr = intron_doc.begin();
        itr != intron_doc.end();
        ++itr)
    {
        for (size_t j = 0; j < hits.size(); ++j)
        {
            // Reference scaffolds are never filtered.
            if(hits[j].is_ref())
                continue;

            const vector<AugmentedCuffOp>& ops = hits[j].augmented_ops();

            for (size_t i = 0; i < ops.size(); ++i)
            {
                if (ops[i].opcode == CUFF_INTRON)
                {
                    // Shadows the outer 'itr'.  The lookup is presumed to
                    // succeed because compute_doc() records every intron it
                    // sees in 'hits' -- TODO confirm; a failed find would
                    // dereference end().
                    map<pair<int, int>, float>::const_iterator itr;
                    itr = intron_doc.find(make_pair(ops[i].g_left(), ops[i].g_right()));

                    double doc = itr->second;
                    // Toss hits whose intron is supported well below average;
                    // 'continue' moves on to this hit's next op.
                    if (doc < bundle_avg_thresh)
                    {
                        toss[j] = true;
                        verbose_msg("\t Filtering intron %d - %d: %f thresh %f\n", itr->first.first, itr->first.second, doc, bundle_avg_thresh);
                        continue;
                    }

                    if (!filter_on_intron_overlap)
                        continue;

                    // Also toss when this intron overlaps another intron that
                    // is much better supported (relative threshold).
                    for (map<pair<int,int>, float>::const_iterator itr2 = intron_doc.begin();
                         itr2 != intron_doc.end();
                         ++itr2)
                    {
                        if (itr == itr2 ||
                            !overlap_in_genome(itr->first.first,
                                               itr->first.second,
                                               itr2->first.first,
                                               itr2->first.second))
                            continue;

                        double thresh = itr2->second * fraction;
                        if (doc < thresh)
                        {
                            verbose_msg("\t Filtering intron (due to overlap) %d - %d: %f thresh %f\n", itr->first.first, itr->first.second, doc, bundle_avg_thresh);
                            toss[j] = true;
                        }
                    }
                }
            }
        }
    }

    // Keep everything not marked for tossing; log tossed spliced scaffolds.
    for (size_t j = 0; j < hits.size(); ++j)
    {
        if (!toss[j])
        {
            filtered_hits.push_back(hits[j]);
        }
        else
        {
            if (hits[j].has_intron())
            {
                verbose_msg("\tFiltering intron scaff [%d-%d]\n", hits[j].left(), hits[j].right());
            }
        }
    }

    verbose_msg("\tIntron filtering pass finished: excluded %d fragments\n", (int)hits.size() - (int)filtered_hits.size());
    hits = filtered_hits;
}
134
// Estimates the "background" read depth over the window [left, right) as the
// median of the non-zero per-base coverage values inside it.
//
// @param depth_of_coverage  per-base coverage for the bundle (taken by value
//                           to match the declaration in filters.h)
// @param left   window start, clamped to 0
// @param right  window end, clamped to depth_of_coverage.size()
// @return median non-zero coverage in the window, or 0 when the window is
//         empty, inverted, or contains only zero-coverage positions.
double background_rate(const std::vector<float> depth_of_coverage,
                       int left,
                       int right)
{
    std::vector<float> tmp;

    size_t r_bound = (size_t)std::min(right, (int) depth_of_coverage.size());
    size_t l_bound = (size_t)std::max(left, 0);

    // Guard against an inverted window (e.g. left beyond the bundle end),
    // which would otherwise hand insert() an invalid iterator range.
    if (l_bound >= r_bound)
        return 0;

    tmp.insert(tmp.end(),
               depth_of_coverage.begin() + l_bound,
               depth_of_coverage.begin() + r_bound);

    if (tmp.empty())
        return 0;

    // Drop zero-coverage positions before taking the median.
    std::vector<float>::iterator new_end = std::remove(tmp.begin(), tmp.end(), 0);
    tmp.erase(new_end, tmp.end());

    // Fix: the window may become empty *after* removing the zeros; indexing
    // tmp[median] then was out-of-bounds (undefined behavior) in the
    // original code.
    if (tmp.empty())
        return 0;

    std::sort(tmp.begin(), tmp.end());

    size_t median = tmp.size() / 2;
    double median_doc = tmp[median];
    return median_doc;
}
159
pre_mrna_filter(int bundle_length,int bundle_left,vector<Scaffold> & hits)160 void pre_mrna_filter(int bundle_length,
161 int bundle_left,
162 vector<Scaffold>& hits)
163 {
164 vector<float> depth_of_coverage(bundle_length,0);
165 vector<double> scaff_doc;
166 map<pair<int,int>, float> intron_doc;
167 vector<Scaffold> filtered_hits;
168 vector<bool> toss(hits.size(), false);
169 vector<float> through_introns; //for each location, how many introns pass through
170
171 vector<int> scaff_intron_status;
172 // Make sure the avg only uses stuff we're sure isn't pre-mrna fragments
173 double bundle_avg_doc = compute_doc(bundle_left,
174 hits,
175 depth_of_coverage,
176 intron_doc,
177 true,
178 &through_introns,
179 &scaff_intron_status);
180 verbose_msg("Pre-mRNA flt: bundle average doc = %lf\n", bundle_avg_doc);
181 /*
182 //2nd call not needed, the vectors won't change, only the return value
183 compute_doc(bundle_left,
184 hits,
185 depth_of_coverage,
186 intron_doc,
187 false);
188 */
189 record_doc_for_scaffolds(bundle_left,
190 hits,
191 depth_of_coverage,
192 intron_doc,
193 scaff_doc);
194
195 for(map<pair<int, int>, float >::const_iterator itr = intron_doc.begin();
196 itr != intron_doc.end();
197 ++itr)
198 {
199 int i_left = itr->first.first;
200 int i_right = itr->first.second;
201 int i_doc = itr->second;
202 double intron_background = background_rate(depth_of_coverage,
203 i_left - bundle_left,
204 i_right - bundle_left);
205 double cumul_cov = 0;
206 for (int i = 0; i < i_right - i_left; ++i)
207 {
208 size_t pos = (i_left - bundle_left) + i;
209 cumul_cov += depth_of_coverage[pos];
210 }
211 cumul_cov /= i_right - i_left;
212 verbose_msg("Pre-mRNA flt: intron %d-%d : background: %lf, inner coverage: %lf, junction coverage: %f\n",
213 i_left, i_right, intron_background, cumul_cov, i_doc);
214 if (cumul_cov / bundle_avg_doc >= pre_mrna_fraction)
215 {
216 //fprintf(stderr, "\tskipping\n");
217 continue;
218 }
219
220 ////double thresh = (1.0/pre_mrna_fraction) * intron_background;
221 double thresh = pre_mrna_fraction * intron_background;
222
223
224 for (size_t j = 0; j < hits.size(); ++j)
225 {
226 if (hits[j].left()>i_right) break;
227 if (hits[j].is_ref())
228 continue;
229 if (toss[j])
230 continue;
231 //find maximum intron support in the hit region
232
233 int len = 0;
234 double doc = 0.0;
235 size_t curr_op = 0;
236 const vector<AugmentedCuffOp>& ops = hits[j].augmented_ops();
237 while (curr_op != ops.size())
238 {
239 const AugmentedCuffOp& op = ops[curr_op];
240 if (op.opcode == CUFF_MATCH)
241 {
242 int op_len = 0;
243 double op_doc = 0.0;
244 int left_off = op.g_left();
245 if (left_off + op.genomic_length > i_left && left_off < i_right)
246 {
247 if (left_off > i_left)
248 {
249 if (left_off + op.genomic_length <= i_right + 1)
250 {
251 op_len += op.genomic_length;
252 int L = left_off - bundle_left;
253 int R = L + op.genomic_length;
254 op_doc += accumulate(depth_of_coverage.begin() + L, depth_of_coverage.begin() + R, 0.0);
255 }
256 else
257 {
258 op_len += i_right - left_off;
259 int L = left_off - bundle_left;
260 int R = L + (i_right - left_off);
261 op_doc += accumulate(depth_of_coverage.begin() + L, depth_of_coverage.begin() + R, 0.0);
262 }
263 }
264 else
265 {
266 if (left_off + op.genomic_length <= i_right + 1)
267 {
268 op_len += (left_off + op.genomic_length - i_left);
269 int L = left_off - bundle_left;
270 int R = L + (left_off + op.genomic_length - i_left);
271 op_doc += accumulate(depth_of_coverage.begin() + L, depth_of_coverage.begin() + R, 0.0);
272 }
273 else
274 {
275 op_len = i_right - i_left;
276 int L = left_off - bundle_left;
277 int R = L + (i_right - i_left);
278 op_doc = accumulate(depth_of_coverage.begin() + L, depth_of_coverage.begin() + R, 0.0);
279 }
280 }
281 }
282
283 len += op_len;
284 doc += op_doc;
285 }
286
287 if (op.g_left() >= i_right)
288 break;
289 ++curr_op;
290 }
291
292 if (len)
293 {
294 double hit_doc_in_region = doc / len;
295 if (hit_doc_in_region < thresh)
296 {
297 toss[j] = true;
298 if (hits[j].has_intron())
299 {
300 // fprintf(stderr, "\t$$$ Filtering intron scaff [%d-%d]\n", hits[j].left(), hits[j].right());
301
302 verbose_msg("\t@@@ Filtering intron scaff [%d-%d] (scaff_doc=%lf, doc_in_region=%lf)\n",
303 hits[j].left(), hits[j].right(), scaff_doc[j], hit_doc_in_region);
304 }
305 }
306 }
307 } //for each scaffold
308 } //for each intron
309 for (size_t j = 0; j < hits.size(); ++j)
310 {
311 if (!toss[j])
312 {
313 filtered_hits.push_back(hits[j]);
314 }
315 /*else
316 {
317 if (hits[j].has_intron())
318 {
319
320 verbose_msg( "\t@@@ Filtering intron scaff [%d-%d]\n", hits[j].left(), hits[j].right());
321 }
322 }
323 */
324 }
325
326 if (cuff_verbose && hits.size()>filtered_hits.size())
327 verbose_msg("\tPre-mRNA flt tossed %lu fragments\n", hits.size() - filtered_hits.size());
328
329 hits = filtered_hits;
330 }
331
filter_hits(int bundle_length,int bundle_left,vector<Scaffold> & hits)332 void filter_hits(int bundle_length,
333 int bundle_left,
334 vector<Scaffold>& hits)
335 {
336
337 pre_mrna_filter(bundle_length, bundle_left, hits);
338
339 vector<float> depth_of_coverage(bundle_length+1,0);
340 vector<double> scaff_doc;
341 map<pair<int,int>, float> intron_doc;
342 vector<Scaffold> filtered_hits;
343 vector<bool> toss(hits.size(), false);
344
345 // Make sure the avg only uses stuff we're sure isn't pre-mrna fragments
346 double bundle_avg_doc = compute_doc(bundle_left,
347 hits,
348 depth_of_coverage,
349 intron_doc,
350 true);
351
352 // recompute the real DoCs
353 /* not needed, vectors are not changed
354 compute_doc(bundle_left,
355 hits,
356 depth_of_coverage,
357 intron_doc,
358 false);
359 */
360
361 record_min_doc_for_scaffolds(bundle_left,
362 hits,
363 depth_of_coverage,
364 intron_doc,
365 scaff_doc);
366
367 //double bundle_avg_thresh = min_isoform_fraction * bundle_avg_doc;
368
369 if (!intron_doc.empty())
370 {
371 double intron_avg_doc = major_isoform_intron_doc(intron_doc);
372 double intron_multiplier = intron_avg_doc / bundle_avg_doc;
373
374 // we don't want this to be more than 1.0 ...
375 intron_multiplier = min(intron_avg_doc, 1.0);
376 //bundle_avg_thresh = min_isoform_fraction * bundle_avg_doc;
377
378 set<pair<int, int> > tossed_introns;
379 for(map<pair<int, int>, float>::const_iterator itr = intron_doc.begin();
380 itr != intron_doc.end();
381 ++itr)
382 {
383 for (size_t j = 0; j < hits.size(); ++j)
384 {
385 if (hits[j].is_ref())
386 {
387 continue;
388 }
389 int i_left = itr->first.first;
390 int i_right = itr->first.second;
391 int j_match_len = hits[j].match_length(i_left, i_right);
392 if (j_match_len > 0)
393 {
394 double idoc = itr->second;
395 double doc = scaff_doc[j];
396
397 if (!hits[j].has_intron() &&
398 doc < pre_mrna_fraction * (idoc * intron_multiplier))
399 {
400 toss[j] = true;
401 }
402
403 const vector<AugmentedCuffOp>& ops = hits[j].augmented_ops();
404
405 unsigned int num_mismatches = 0;
406 assert (hits[j].mate_hits().size() == 1);
407 const MateHit& hit = **(hits[j].mate_hits().begin());
408 num_mismatches = hit.edit_dist();
409
410 double percent_mismatches = num_mismatches / (double)hits[j].length();
411
412 bool intron_pokin_read = false;
413
414 const AugmentedCuffOp& first = ops.front();
415 // intron =========================
416 // hit ******************
417 if (first.g_left() < i_left && first.g_right() > i_left && first.g_right() < i_right)
418 {
419 intron_pokin_read = true;
420 }
421
422 // intron =========================
423 // hit ******************
424 if (first.g_left() < i_right && first.g_right() > i_right && first.g_left() > i_left)
425 {
426 intron_pokin_read = true;
427 }
428
429 const AugmentedCuffOp& last = ops.back();
430 // intron =========================
431 // hit ******************
432 if (last.g_left() < i_left && last.g_right() > i_left && last.g_right() < i_right)
433 {
434 intron_pokin_read = true;
435 }
436
437 // intron =========================
438 // hit ******************
439 if (last.g_left() < i_right && last.g_right() > i_right && last.g_left() > i_left)
440 {
441 intron_pokin_read = true;
442 }
443
444 if (intron_pokin_read)
445 {
446 double fraction;
447 // if (!hits[j].has_intron())
448 // {
449 // fraction = (3 * pre_mrna_fraction) + percent_mismatches;
450 // }
451 // else
452 {
453 fraction = pre_mrna_fraction + percent_mismatches;
454 }
455 double thresh = fraction * (intron_avg_doc * intron_multiplier);
456 if (doc < thresh)
457 {
458 toss[j] = true;
459 // if (hits[j].has_intron())
460 // {
461 // fprintf(stderr, "\t^^^Filtering intron scaff [%d-%d]\n", hits[j].left(), hits[j].right());
462 // }
463 }
464 }
465 }
466 }
467 }
468 }
469
470 for (size_t j = 0; j < hits.size(); ++j)
471 {
472 if (!toss[j])
473 {
474 filtered_hits.push_back(hits[j]);
475 //#if verbose_msg
476 // if (hits[j].has_intron())
477 // {
478 //
479 // fprintf(stderr, "KEEPING intron scaff [%d-%d]\n", hits[j].left(), hits[j].right());
480 // }
481 //#endif
482 }
483 else
484 {
485 if (hits[j].has_intron())
486 {
487
488 verbose_msg("\t!!!Filtering intron scaff [%d-%d]\n", hits[j].left(), hits[j].right());
489 }
490 }
491 }
492
493 //#if verbose_msg
494 // fprintf(stderr, "\tInitial filter pass complete\n");
495 //#endif
496
497 hits = filtered_hits;
498
499 scaff_doc.clear();
500 filtered_hits.clear();
501
502 toss = vector<bool>(hits.size(), false);
503
504 map<pair<int, int>, float> dummy;
505 bundle_avg_doc = compute_doc(bundle_left,
506 hits,
507 depth_of_coverage,
508 dummy,
509 false);
510
511 //#if verbose_msg
512 // fprintf(stderr, "\tUpdated avg bundle doc = %lf\n", bundle_avg_doc);
513 //#endif
514
515 record_doc_for_scaffolds(bundle_left,
516 hits,
517 depth_of_coverage,
518 intron_doc,
519 scaff_doc);
520
521
522
523 //#if verbose_msg
524 // double bundle_thresh = pre_mrna_fraction * bundle_avg_doc;
525 // fprintf(stderr, "\tthreshold is = %lf\n", bundle_thresh);
526 //#endif
527
528 if (!intron_doc.empty())
529 {
530 // filter_introns(bundle_length,
531 // bundle_left,
532 // hits,
533 // min_isoform_fraction,
534 // true,
535 // true);
536 if (bundle_avg_doc > 3000)
537 {
538 filter_introns(bundle_length,
539 bundle_left,
540 hits,
541 min_isoform_fraction,
542 true,
543 false);
544 }
545 }
546
547 for (size_t j = 0; j < hits.size(); ++j)
548 {
549 if (!toss[j])
550 {
551 filtered_hits.push_back(hits[j]);
552 //#if verbose_msg
553 // if (hits[j].has_intron())
554 // {
555 //
556 // fprintf(stderr, "KEEPING intron scaff [%d-%d]\n", hits[j].left(), hits[j].right());
557 // }
558 //#endif
559 }
560 else
561 {
562 if (hits[j].has_intron())
563 {
564 verbose_msg("\t***Filtering intron scaff [%d-%d]\n", hits[j].left(), hits[j].right());
565 }
566 }
567 }
568
569 //fprintf(stderr, "\tTossed %d hits as noise\n", (int)hits.size() - filtered_hits.size());
570
571 hits = filtered_hits;
572 }
573
574
filter_junk_isoforms(vector<boost::shared_ptr<Abundance>> & transcripts,vector<double> & abundances,const vector<boost::shared_ptr<Abundance>> & mapped_transcripts,double locus_mass)575 void filter_junk_isoforms(vector<boost::shared_ptr<Abundance> >& transcripts,
576 vector<double>& abundances,
577 const vector<boost::shared_ptr<Abundance> >& mapped_transcripts,
578 double locus_mass)
579 {
580 // vector<double>::iterator max_ab = std::max_element(abundances.begin(),
581 // abundances.end());
582 double max_fwd_ab = -1.0;
583 double max_rev_ab = -1.0;
584
585 for (size_t t = 0; t < transcripts.size(); ++t)
586 {
587 boost::shared_ptr<Scaffold> scaff = transcripts[t]->transfrag();
588 if (scaff->strand() == CUFF_FWD || scaff->strand() == CUFF_STRAND_UNKNOWN)
589 {
590 if (abundances[t] > max_fwd_ab)
591 max_fwd_ab = abundances[t];
592 }
593 if (scaff->strand() == CUFF_REV || scaff->strand() == CUFF_STRAND_UNKNOWN)
594 {
595 if (abundances[t] > max_rev_ab)
596 max_rev_ab = abundances[t];
597 }
598 }
599
600 // Try to categorize the crap transcripts for suppression
601 vector<bool> pre_mrna_junk(transcripts.size(), false); //intra-intron, much lower abundance than container
602 vector<bool> chaff(transcripts.size(), false); // only a single MateHit, impossible to reliably quantitate
603 vector<bool> repeats(transcripts.size(), false); // too many low-quality hits
604 vector<bool> too_rare(transcripts.size(), false); // too rare to be reliably quantitated, could be error
605
606 //cerr << "Chucked : ";
607 for (size_t t = 0; t < transcripts.size(); ++t)
608 {
609 boost::shared_ptr<Scaffold> scaff = transcripts[t]->transfrag();
610
611 if (!(scaff->is_ref()) && allow_junk_filtering)
612 {
613 const vector<const MateHit*> hits = scaff->mate_hits();
614
615 const vector<AugmentedCuffOp>& ops = scaff->augmented_ops();
616
617 if (ops.size() == 1 && ops[0].opcode == CUFF_MATCH)
618 {
619 for (size_t j = 0; j < transcripts.size(); ++j)
620 {
621 const vector<AugmentedCuffOp>& j_ops = scaff->augmented_ops();
622 for (size_t L = 0; L < j_ops.size(); L++)
623 {
624 if (AugmentedCuffOp::overlap_in_genome(ops[0], j_ops[L]) &&
625 j_ops[L].opcode == CUFF_INTRON)
626 {
627 pre_mrna_junk[t] = true;
628 }
629 }
630 }
631 }
632
633 if (library_type != "transfrags")
634 {
635 double multi_hits = 0.0;
636 //static const double low_qual_err_prob = high_phred_err_prob; // hits with error_prob() above this are low quality;
637 //static const double low_qual_thresh = 0.75; // hits with more than this fraction of low qual hits are repeats
638 for (vector<const MateHit*>::const_iterator itr = hits.begin();
639 itr != hits.end();
640 ++itr)
641 {
642 if ((*itr)->is_multi())
643 multi_hits += 1.0;
644 }
645
646 double low_qual_frac = multi_hits / (double)hits.size();
647 if (low_qual_frac > max_multiread_fraction)
648 repeats[t] = true;
649 }
650 if (scaff->strand() == CUFF_FWD &&
651 (abundances[t] / max_fwd_ab) < min_isoform_fraction)
652 too_rare[t] = true;
653 if ((scaff->strand() == CUFF_REV || scaff->strand() == CUFF_STRAND_UNKNOWN) &&
654 (abundances[t] / max_rev_ab) < min_isoform_fraction)
655 too_rare[t] = true;
656
657 const vector<double>* cond_probs = (mapped_transcripts[t]->cond_probs());
658 if (cond_probs)
659 {
660 assert (library_type != "transfrags");
661 double supporting_hits = abundances[t] * locus_mass;
662 if (supporting_hits < min_frags_per_transfrag)
663 chaff[t] = true;
664 }
665 }
666 }
667
668 vector<boost::shared_ptr<Abundance> > non_junk_transcripts;
669 vector<double> non_junk_abundances;
670 for (size_t t = 0; t < transcripts.size(); ++t)
671 {
672 if (!repeats[t] && !pre_mrna_junk[t] && !too_rare[t] && !chaff[t])
673 {
674 non_junk_transcripts.push_back(transcripts[t]);
675 non_junk_abundances.push_back(abundances[t]);
676 }
677 else
678 {
679 verbose_msg( "Filtering isoform %d-%d\n", transcripts[t]->transfrag()->left(), transcripts[t]->transfrag()->right());
680 }
681 }
682
683 transcripts = non_junk_transcripts;
684 abundances = non_junk_abundances;
685 }
686
687 // Designed to strip out remaining pre-mrna genes, assembled repeats, and
688 // fragments from isoforms too short to be reliably quantitated.
filter_junk_genes(vector<Gene> & genes)689 void filter_junk_genes(vector<Gene>& genes)
690 {
691 vector<Gene> good_genes;
692 vector<Isoform> all_isoforms;
693 for (size_t i = 0; i < genes.size(); ++i)
694 {
695 all_isoforms.insert(all_isoforms.end(),
696 genes[i].isoforms().begin(),
697 genes[i].isoforms().end());
698 }
699
700 for (size_t i = 0; i < genes.size(); ++i)
701 {
702 const Gene& g = genes[i];
703
704 if(g.has_ref_trans())
705 {
706 good_genes.push_back(g);
707 continue;
708 }
709
710 bool good_gene = true;
711 for (size_t j = 0; j < all_isoforms.size(); ++j)
712 {
713 vector<pair<int, int> > introns = all_isoforms[j].scaffold().gaps();
714
715 //assert (!allow_junk_filtering || all_isoforms[j].scaffold().mate_hits().size() >= min_frags_per_transfrag);
716 for (size_t k = 0; k < introns.size(); ++k)
717 {
718 if (g.left() > introns[k].first && g.right() < introns[k].second &&
719 g.FPKM() / all_isoforms[j].FPKM() < pre_mrna_fraction)
720 {
721 good_gene = false;
722 }
723 }
724 }
725 if (allow_junk_filtering)
726 {
727 if (g.FPKM() == 0)
728 {
729 good_gene = false;
730 }
731 }
732 if (good_gene)
733 {
734 good_genes.push_back(g);
735 }
736 else
737 {
738 verbose_msg("Filtering transfrags from gene %d-%d\n", g.left(), g.right());
739 }
740 }
741
742 genes = good_genes;
743
744 }
745
// Harmonizes and trims transcript 3' ends within a locus, in two phases:
//
// Phase 1 (skipped for "transfrags" libraries): for each stranded scaffold,
// scan the 3'-most exon for a change-point where coverage drops off, and clip
// there when doing so reduces the mean squared error of the coverage profile
// (and the scaffold's average coverage clears trim_3_avgcov_thresh).  Each
// scaffold's resulting 3'-exon coverage MSE is recorded as its score; in
// "transfrags" mode the scaffold's FPKM is used as the score instead.
//
// Phase 2: connect scaffolds whose 3' terminal exons overlap compatibly on
// the same strand, take connected components, pick a "leader" 3' end per
// component, and trim or extend every member to the leader's end coordinate.
void clip_by_3_prime_dropoff(vector<Scaffold>& scaffolds)
{
    // (score, scaffold) pairs; sorted within each component later so that
    // lower-score (better) candidates are considered as leaders first.
    vector<pair<double, Scaffold*> > three_prime_ends;

    if (library_type != "transfrags")
    {
        BOOST_FOREACH (Scaffold& scaff, scaffolds)
        {
            // Only stranded scaffolds have a well-defined 3' end.
            if (!(scaff.strand() == CUFF_FWD || scaff.strand() == CUFF_REV))
                continue;

            int scaff_len = scaff.length();
            vector<double> coverage(scaff_len, 0.0);

            // Accumulate per-base mass over the scaffold's transcript
            // coordinates.
            double total = 0;
            BOOST_FOREACH(const MateHit* hit, scaff.mate_hits())
            {
                int start, end, frag_len;
                if (!scaff.map_frag(*hit, start, end, frag_len)) continue;

                // Flip coordinates on the reverse strand so that the profile
                // is consistently oriented.
                if (scaff.strand() == CUFF_REV)
                {
                    start = scaff_len - 1 - start;
                    end = scaff_len - 1 - end;
                    swap(start, end);
                }

                for(int i = start; i <= end; ++i)
                {
                    coverage[i] += hit->mass();
                    total += hit->mass();
                }
            }
            double avg_cov = total/scaff_len;
//			if (avg_cov < trim_3_avgcov_thresh)
//				continue;

            // Address the 3'-most exon so that (offset + mult*i) walks it
            // starting from the 3' terminus: REV reads forward from index 0,
            // FWD reads backward from the last index.
            const AugmentedCuffOp* exon_3 = NULL;
            int mult;
            int offset;

            if (scaff.strand() == CUFF_REV)
            {
                mult = 1;
                offset = 0;
                exon_3 = &scaff.augmented_ops().front(); //last exon at 3' end
            }
            else if (scaff.strand() == CUFF_FWD)
            {
                mult = -1;
                offset = scaff_len - 1;
                exon_3 = &scaff.augmented_ops().back();//last exon at 3' end
            }
            else
            {
                continue;
            }

            // Change-point search: try every split of the 3' exon into a
            // "trim" prefix and a "keep" suffix and pick the split with the
            // lowest combined within-segment MSE.
            int to_remove;
            double min_cost = numeric_limits<double>::max();
            double mean_to_keep = 0.0;
            double mean_to_trim = 0.0;
            double tmp_mean_to_trim = 0.0;
            double tmp_mean_to_keep = 0.0;
            double tmp_mean_3prime = 0.0;
            for (int i = 0; i < exon_3->genomic_length; i++)
            {
                tmp_mean_3prime += coverage[offset + mult*i];
            }
            tmp_mean_3prime /= exon_3->genomic_length;

            // Baseline: MSE of the whole exon around its single mean.
            double base_cost = 0.0;
            for (int i = 0; i < exon_3->genomic_length; i++)
            {
                double d = (coverage[offset + mult*i] - tmp_mean_3prime);
                d *= d;
                base_cost += d;
            }
            base_cost /= exon_3->genomic_length;

            // Sentinel: -1 wraps to SIZE_MAX, so the "min_cost_x <
            // genomic_length" test below fails when no split was accepted.
            size_t min_cost_x = -1;
            for (to_remove = 1; to_remove < exon_3->genomic_length - 1; to_remove++)
            {
                // NOTE(review): the trim side covers i <= to_remove
                // (to_remove+1 positions) but is divided by to_remove --
                // confirm whether '<=' or the divisor is the intended one.
                tmp_mean_to_trim = 0.0;
                tmp_mean_to_keep = 0.0;
                for (int i = 0; i < exon_3->genomic_length; i++)
                {
                    if (i <= to_remove)
                    {
                        tmp_mean_to_trim += coverage[offset + mult*i];
                    }
                    else
                    {
                        tmp_mean_to_keep += coverage[offset + mult*i];
                    }
                }

                tmp_mean_to_trim /= to_remove;
                tmp_mean_to_keep /= (exon_3->genomic_length - to_remove);

                // Within-segment squared deviations for the candidate split.
                double tmp_mean_trim_cost = 0.0;
                double tmp_mean_keep_cost = 0.0;
                for (int i = 0; i < exon_3->genomic_length; i++)
                {
                    if (i <= to_remove)
                    {
                        double d = (coverage[offset + mult*i] - tmp_mean_to_trim);
                        d *= d;
                        tmp_mean_trim_cost += d;
                    }
                    else
                    {
                        double d = (coverage[offset + mult*i] - tmp_mean_to_keep);
                        d *= d;
                        tmp_mean_keep_cost += d;
                    }
                }

                tmp_mean_trim_cost /= to_remove;
                tmp_mean_keep_cost /= (exon_3->genomic_length - to_remove);

                double new_cost = tmp_mean_trim_cost + tmp_mean_keep_cost;

                // Accept a split only if it beats the current best AND the
                // trimmed side is a genuine drop-off (well below the kept
                // mean), improves on the unsplit baseline, and removes a
                // non-trivial share (>5%) of the scaffold.
                if (new_cost < min_cost && trim_3_dropoff_frac * tmp_mean_to_keep > tmp_mean_to_trim && new_cost < base_cost && to_remove > scaff_len * 0.05)
                {
                    min_cost = tmp_mean_trim_cost + tmp_mean_keep_cost;
                    min_cost_x = to_remove;
                    mean_to_keep = tmp_mean_to_keep;
                    mean_to_trim = tmp_mean_to_trim;
                }
            }

            // If trimming reduces the overall mean squared error of the coverage
            // do it
            if (avg_cov >= trim_3_avgcov_thresh && min_cost_x < exon_3->genomic_length)
            {
                scaff.trim_3(min_cost_x);
            }

            // store the mean squared error for this exon
            // (recomputed: trim_3 may have changed the coverage window used)
            tmp_mean_3prime = 0.0;
            for (int i = 0; i < exon_3->genomic_length; i++)
            {
                tmp_mean_3prime += coverage[offset + mult*i];
            }
            tmp_mean_3prime /= exon_3->genomic_length;

            base_cost = 0.0;
            for (int i = 0; i < exon_3->genomic_length; i++)
            {
                double d = (coverage[offset + mult*i] - tmp_mean_3prime);
                d *= d;
                base_cost += d;
            }
            base_cost /= exon_3->genomic_length;
            three_prime_ends.push_back(make_pair(base_cost, &scaff));
        }
    }
    else
    {
        // "transfrags" mode: no coverage-based clipping; rank candidates by
        // FPKM instead of coverage MSE.
        BOOST_FOREACH (Scaffold& scaff, scaffolds)
        {
            if (!(scaff.strand() == CUFF_FWD || scaff.strand() == CUFF_REV))
                continue;

            int scaff_len = scaff.length();
            vector<double> coverage(scaff_len, 0.0);

            double total = 0;
            BOOST_FOREACH(const MateHit* hit, scaff.mate_hits())
            {
                int start, end, frag_len;
                if (!scaff.map_frag(*hit, start, end, frag_len)) continue;

                if (scaff.strand() == CUFF_REV)
                {
                    start = scaff_len - 1 - start;
                    end = scaff_len - 1 - end;
                    swap(start, end);
                }

                for(int i = start; i <= end; ++i)
                {
                    coverage[i] += hit->mass();
                    total += hit->mass();
                }
            }

            const AugmentedCuffOp* exon_3 = NULL;
            int mult;
            int offset;

            if (scaff.strand() == CUFF_REV)
            {
                mult = 1;
                offset = 0;
                exon_3 = &scaff.augmented_ops().front();
            }
            else if (scaff.strand() == CUFF_FWD)
            {
                mult = -1;
                offset = scaff_len - 1;
                exon_3 = &scaff.augmented_ops().back();
            }
            else
            {
                continue;
            }

            three_prime_ends.push_back(make_pair(scaff.fpkm(), &scaff));
        }

    }

    // Phase 2: connect scaffolds whose 3' terminal exons overlap compatibly
    // on the same strand; each connected component shares a common 3' end.
    adjacency_list <vecS, vecS, undirectedS> G;

    for (size_t i = 0; i < three_prime_ends.size(); ++i)
    {
        add_vertex(G);
    }

    for (size_t i = 0; i < three_prime_ends.size(); ++i)
    {
        Scaffold* scaff_i = three_prime_ends[i].second;

        const AugmentedCuffOp* scaff_i_exon_3 = NULL;

        // Every entry in three_prime_ends is FWD or REV, so these pointers
        // are always set before use.
        if (scaff_i->strand() == CUFF_REV)
        {
            scaff_i_exon_3 = &(scaff_i->augmented_ops().front());
        }
        else if (scaff_i->strand() == CUFF_FWD)
        {
            scaff_i_exon_3 = &(scaff_i->augmented_ops().back());
        }

        for (size_t j = i + 1; j < three_prime_ends.size(); ++j)
        {
            Scaffold* scaff_j = three_prime_ends[j].second;

            if (scaff_i->strand() != scaff_j->strand())
                continue;

            const AugmentedCuffOp* scaff_j_exon_3 = NULL;

            if (scaff_j->strand() == CUFF_REV)
            {
                scaff_j_exon_3 = &(scaff_j->augmented_ops().front());
            }
            else if (scaff_j->strand() == CUFF_FWD)
            {
                scaff_j_exon_3 = &(scaff_j->augmented_ops().back());
            }

            if (AugmentedCuffOp::overlap_in_genome(*scaff_j_exon_3, *scaff_i_exon_3) &&
                AugmentedCuffOp::compatible(*scaff_j_exon_3, *scaff_i_exon_3, 0))
                add_edge(i, j, G);
        }
    }

    std::vector<int> component(num_vertices(G));
    connected_components(G, &component[0]);

    vector<vector<bool> > clusters(three_prime_ends.size(),
                                   vector<bool>(three_prime_ends.size(), false));

    // Bucket the (score, scaffold) pairs by connected component.
    vector<vector<pair<double, Scaffold*> > > grouped_scaffolds(three_prime_ends.size());
    for (size_t i = 0; i < three_prime_ends.size(); ++i)
    {
        clusters[component[i]][i] = true;
        grouped_scaffolds[component[i]].push_back(three_prime_ends[i]);
    }

    for (size_t i = 0; i < grouped_scaffolds.size(); ++i)
    {
        // Best-scoring (lowest MSE / FPKM) candidates come first.
        vector<pair<double, Scaffold*> >& group = grouped_scaffolds[i];
        sort(group.begin(), group.end());
        if (group.empty())
            continue;

        Scaffold* group_leader = NULL;
        int trim_point = -1;

        // Walk candidates in score order; the first one whose 3' end no
        // group member would be trimmed past becomes the leader.
        const AugmentedCuffOp* group_exon_3 = NULL;
        vector<pair<double, Scaffold*> >::iterator l_itr = group.begin();
        while (l_itr != group.end())
        {
            Scaffold* possible_leader = l_itr->second;
            bool ok_clip_leader = true;
            vector<pair<double, Scaffold*> >::iterator g_itr = group.begin();
            const AugmentedCuffOp* l_exon_3 = NULL;
            CuffStrand s = possible_leader->strand();

            if (s != CUFF_STRAND_UNKNOWN)
            {
                if (s == CUFF_REV)
                    l_exon_3 = &(possible_leader->augmented_ops().front());
                else
                    l_exon_3 = &(possible_leader->augmented_ops().back());
                for (; g_itr != group.end(); ++g_itr)
                {
                    const AugmentedCuffOp* g_exon_3 = NULL;
                    if (s == CUFF_REV)
                    {
                        // Reject a leader whose 3' end lies entirely past a
                        // follower's 3' exon.
                        // bad:
                        //          leader
                        // follower
                        g_exon_3 = &(g_itr->second->augmented_ops().front());
                        if (g_exon_3->g_right() <= l_exon_3->g_left())
                            ok_clip_leader = false;

                        // for meta-assembly libraries, don't ever allow clipping, just extension
                        // bad:
                        //     leader
                        // follower
                        if (library_type == "transfrags" &&
                            g_exon_3->g_left() < l_exon_3->g_left())
                            ok_clip_leader = false;
                    }
                    else
                    {
                        // bad:
                        //          follower
                        // leader
                        g_exon_3 = &(g_itr->second->augmented_ops().back());
                        if (g_exon_3->g_left() >= l_exon_3->g_right())
                            ok_clip_leader = false;

                        // for meta-assembly libraries, don't ever allow clipping, just extension
                        // bad:
                        // leader
                        //     follower
                        if (library_type == "transfrags" &&
                            g_exon_3->g_right() > l_exon_3->g_right())
                            ok_clip_leader = false;
                    }
                }
            }
            else
            {
                ok_clip_leader = false;
            }

            // The candidate's end must not regress past a previously
            // considered candidate's end.
            if (ok_clip_leader)
            {
                if (s == CUFF_REV)
                {
                    if (trim_point == -1)
                        trim_point = l_exon_3->g_left();
                    else if (l_exon_3->g_left() < trim_point)
                        ok_clip_leader = false;
                }
                else
                {
                    if (trim_point == -1)
                        trim_point = l_exon_3->g_right();
                    else if (l_exon_3->g_right() > trim_point)
                        ok_clip_leader = false;
                }
            }

            if (ok_clip_leader)
            {
                group_leader = possible_leader;
                group_exon_3 = l_exon_3;
                break;
            }
            ++l_itr;
        }

        // No acceptable leader: leave this group's ends alone.
        if (!group_leader || !group_exon_3)
            continue;

        // Trim or extend each member to the leader's 3' coordinate.
        for (size_t j = 0; j < group.size(); ++j)
        {
            const AugmentedCuffOp* exon_3 = NULL;
            int end_diff = 0;
            if (group_leader->strand() == CUFF_REV)
            {
                exon_3 = &(group[j].second->augmented_ops().front());
                end_diff = group_exon_3->g_left() - exon_3->g_left();
            }
            else
            {
                exon_3 = &(group[j].second->augmented_ops().back());
                end_diff = exon_3->g_right() - group_exon_3->g_right();
            }

            if (end_diff > 0)
            {
                // member overshoots the leader's end: clip it back
                // leader
                //    follower
                group[j].second->trim_3(end_diff);
            }
            else if (end_diff < 0)
            {
                // member falls short of the leader's end: extend it
                //    leader
                // follower
                group[j].second->extend_3(-end_diff);
            }
        }
    }


    return;

}
1156