1 /*
2  *  filters.cpp
3  *  cufflinks
4  *
5  *  Created by Cole Trapnell on 10/27/09.
6  *  Copyright 2009 Cole Trapnell. All rights reserved.
7  *
8  */
9 
10 #include "filters.h"
11 #include <algorithm>
12 #include <numeric>
13 #include <boost/graph/adjacency_list.hpp>
14 #include <boost/graph/depth_first_search.hpp>
15 #include <boost/graph/visitors.hpp>
16 #include <boost/graph/graph_traits.hpp>
17 #include <boost/graph/connected_components.hpp>
18 
19 using namespace std;
20 using namespace boost;
21 
filter_introns(int bundle_length,int bundle_left,vector<Scaffold> & hits,double fraction,bool filter_on_intron_overlap,bool filter_with_intron_doc)22 void filter_introns(int bundle_length,
23 					int bundle_left,
24 					vector<Scaffold>& hits,
25 					double fraction,
26 					bool filter_on_intron_overlap,
27 					bool filter_with_intron_doc)
28 {
29 	vector<float> depth_of_coverage(bundle_length,0);
30 	vector<double> scaff_doc;
31 	map<pair<int,int>, float> intron_doc;
32 	vector<Scaffold> filtered_hits;
33 	vector<bool> toss(hits.size(), false);
34 
35 	double bundle_avg_doc = compute_doc(bundle_left,
36 										hits,
37 										depth_of_coverage,
38 										intron_doc,
39 										false);
40 
41 	double bundle_avg_thresh = bundle_avg_doc * fraction;
42 
43 	if (filter_with_intron_doc && !intron_doc.empty())
44 	{
45 		bundle_avg_doc = major_isoform_intron_doc(intron_doc);
46 		bundle_avg_thresh = fraction * bundle_avg_doc;
47 		verbose_msg("\tFiltering bundle introns, avg (intron) doc = %lf, thresh = %f\n", bundle_avg_doc, bundle_avg_thresh);
48 	}
49 	else
50 	{
51 		verbose_msg("\tFiltering bundle introns, avg bundle doc = %lf, thresh = %f\n", bundle_avg_doc, bundle_avg_thresh);
52 	}
53 
54 	for(map<pair<int, int>, float>::const_iterator itr = intron_doc.begin();
55 		itr != intron_doc.end();
56 		++itr)
57 	{
58 		for (size_t j = 0; j < hits.size(); ++j)
59 		{
60 			//fprintf(stderr, "considering read [%d-%d] with min doc = %lf contained in intron with doc = %lf\n", hits[j].left(), hits[j].right(), doc, idoc);
61 
62 			if(hits[j].is_ref())
63 				continue;
64 
65 			const vector<AugmentedCuffOp>& ops = hits[j].augmented_ops();
66 
67 			for (size_t i = 0; i < ops.size(); ++i)
68 			{
69 				if (ops[i].opcode == CUFF_INTRON)
70 				{
71 					map<pair<int, int>, float>::const_iterator itr;
72 					itr = intron_doc.find(make_pair(ops[i].g_left(), ops[i].g_right()));
73 
74 					double doc = itr->second;
75 					if (doc < bundle_avg_thresh)
76 					{
77 						toss[j] = true;
78 						verbose_msg("\t Filtering intron %d - %d: %f thresh %f\n", itr->first.first, itr->first.second, doc, bundle_avg_thresh);
79 						continue;
80 					}
81 
82 					if (!filter_on_intron_overlap)
83 						continue;
84 
85 					for (map<pair<int,int>, float>::const_iterator itr2 = intron_doc.begin();
86 						 itr2 != intron_doc.end();
87 						 ++itr2)
88 					{
89 						if (itr == itr2 ||
90 							!overlap_in_genome(itr->first.first,
91 											   itr->first.second,
92 											   itr2->first.first,
93 											   itr2->first.second))
94 							continue;
95 
96 						double thresh = itr2->second * fraction;
97 						if (doc < thresh)
98 						{
99 							verbose_msg("\t Filtering intron (due to overlap) %d - %d: %f thresh %f\n", itr->first.first, itr->first.second, doc, bundle_avg_thresh);
100 							toss[j] = true;
101 						}
102 					}
103 				}
104 			}
105 		}
106 	}
107 
108 	for (size_t j = 0; j < hits.size(); ++j)
109 	{
110 		if (!toss[j])
111 		{
112 			filtered_hits.push_back(hits[j]);
113 //#if verbose_msg
114 //			if (hits[j].has_intron())
115 //			{
116 //				fprintf(stderr, "KEEPING intron scaff [%d-%d]\n", hits[j].left(), hits[j].right());
117 //			}
118 //#endif
119 		}
120 		else
121 		{
122 			if (hits[j].has_intron())
123 			{
124 
125 				verbose_msg("\tFiltering intron scaff [%d-%d]\n", hits[j].left(), hits[j].right());
126 			}
127 		}
128 	}
129 
130 
131 	verbose_msg("\tIntron filtering pass finished: excluded %d fragments\n", (int)hits.size() - (int)filtered_hits.size());
132 	hits = filtered_hits;
133 }
134 
background_rate(const vector<float> depth_of_coverage,int left,int right)135 double background_rate(const vector<float> depth_of_coverage,
136                        int left,
137                        int right)
138 {
139     vector<float> tmp;
140 
141     size_t r_bound = (size_t)min(right, (int) depth_of_coverage.size());
142     size_t l_bound = (size_t)max(left, 0);
143 
144     tmp.insert(tmp.end(),
145                depth_of_coverage.begin() + l_bound,
146                depth_of_coverage.begin() + r_bound);
147 
148     if (tmp.empty())
149         return 0;
150 
151     vector<float>::iterator new_end =  remove(tmp.begin(), tmp.end(), 0);
152     tmp.erase(new_end, tmp.end());
153     sort(tmp.begin(), tmp.end());
154 
155     size_t median = (size_t)floor(tmp.size() / 2);
156     double median_doc = tmp[median];
157     return median_doc;
158 }
159 
pre_mrna_filter(int bundle_length,int bundle_left,vector<Scaffold> & hits)160 void pre_mrna_filter(int bundle_length,
161 					 int bundle_left,
162 					 vector<Scaffold>& hits)
163 {
164 	vector<float> depth_of_coverage(bundle_length,0);
165 	vector<double> scaff_doc;
166 	map<pair<int,int>, float> intron_doc;
167 	vector<Scaffold> filtered_hits;
168 	vector<bool> toss(hits.size(), false);
169 	vector<float> through_introns; //for each location, how many introns pass through
170 
171 	vector<int> scaff_intron_status;
172 	// Make sure the avg only uses stuff we're sure isn't pre-mrna fragments
173     double bundle_avg_doc = compute_doc(bundle_left,
174 										hits,
175 										depth_of_coverage,
176 										intron_doc,
177 										true,
178 										&through_introns,
179 										&scaff_intron_status);
180     verbose_msg("Pre-mRNA flt: bundle average doc = %lf\n", bundle_avg_doc);
181     /*
182      //2nd call not needed, the vectors won't change, only the return value
183      compute_doc(bundle_left,
184      hits,
185      depth_of_coverage,
186      intron_doc,
187      false);
188      */
189 	record_doc_for_scaffolds(bundle_left,
190 							 hits,
191 							 depth_of_coverage,
192 							 intron_doc,
193 							 scaff_doc);
194 
195 	for(map<pair<int, int>, float >::const_iterator itr = intron_doc.begin();
196 		itr != intron_doc.end();
197 		++itr)
198 	{
199 		int i_left = itr->first.first;
200 		int i_right = itr->first.second;
201 		int i_doc   = itr->second;
202         double intron_background = background_rate(depth_of_coverage,
203                                                    i_left - bundle_left,
204                                                    i_right - bundle_left);
205         double cumul_cov = 0;
206         for (int i = 0; i < i_right - i_left; ++i)
207         {
208             size_t pos = (i_left - bundle_left) + i;
209             cumul_cov += depth_of_coverage[pos];
210         }
211         cumul_cov /= i_right - i_left;
212         verbose_msg("Pre-mRNA flt: intron %d-%d : background: %lf, inner coverage: %lf, junction coverage: %f\n",
213                     i_left, i_right, intron_background, cumul_cov, i_doc);
214         if (cumul_cov / bundle_avg_doc >= pre_mrna_fraction)
215         {
216             //fprintf(stderr, "\tskipping\n");
217             continue;
218         }
219 
220         ////double thresh = (1.0/pre_mrna_fraction) * intron_background;
221         double thresh = pre_mrna_fraction * intron_background;
222 
223 
224         for (size_t j = 0; j < hits.size(); ++j)
225         {
226             if (hits[j].left()>i_right) break;
227             if (hits[j].is_ref())
228                 continue;
229             if (toss[j])
230                 continue;
231             //find maximum intron support in the hit region
232 
233             int len = 0;
234             double doc = 0.0;
235             size_t curr_op = 0;
236             const vector<AugmentedCuffOp>& ops = hits[j].augmented_ops();
237             while (curr_op != ops.size())
238             {
239                 const AugmentedCuffOp&  op = ops[curr_op];
240                 if (op.opcode == CUFF_MATCH)
241                 {
242                     int op_len = 0;
243                     double op_doc = 0.0;
244                     int left_off = op.g_left();
245                     if (left_off + op.genomic_length > i_left && left_off < i_right)
246                     {
247                         if (left_off > i_left)
248                         {
249                             if (left_off + op.genomic_length <= i_right + 1)
250                             {
251                                 op_len += op.genomic_length;
252                                 int L = left_off - bundle_left;
253                                 int R = L + op.genomic_length;
254                                 op_doc += accumulate(depth_of_coverage.begin() + L, depth_of_coverage.begin() + R, 0.0);
255                             }
256                             else
257                             {
258                                 op_len += i_right - left_off;
259                                 int L = left_off - bundle_left;
260                                 int R = L + (i_right - left_off);
261                                 op_doc += accumulate(depth_of_coverage.begin() + L, depth_of_coverage.begin() + R, 0.0);
262                             }
263                         }
264                         else
265                         {
266                             if (left_off + op.genomic_length <= i_right + 1)
267                             {
268                                 op_len += (left_off + op.genomic_length - i_left);
269                                 int L = left_off - bundle_left;
270                                 int R = L + (left_off + op.genomic_length - i_left);
271                                 op_doc += accumulate(depth_of_coverage.begin() + L, depth_of_coverage.begin() + R, 0.0);
272                             }
273                             else
274                             {
275                                 op_len = i_right - i_left;
276                                 int L = left_off - bundle_left;
277                                 int R = L + (i_right - i_left);
278                                 op_doc = accumulate(depth_of_coverage.begin() + L, depth_of_coverage.begin() + R, 0.0);
279                             }
280                         }
281                     }
282 
283                     len += op_len;
284                     doc += op_doc;
285                 }
286 
287                 if (op.g_left() >= i_right)
288                     break;
289                 ++curr_op;
290             }
291 
292             if (len)
293             {
294                 double hit_doc_in_region = doc / len;
295                 if (hit_doc_in_region < thresh)
296                 {
297                     toss[j] = true;
298                     if (hits[j].has_intron())
299                     {
300                         //    fprintf(stderr, "\t$$$ Filtering intron scaff [%d-%d]\n", hits[j].left(), hits[j].right());
301 
302                         verbose_msg("\t@@@ Filtering intron scaff [%d-%d] (scaff_doc=%lf, doc_in_region=%lf)\n",
303                                     hits[j].left(), hits[j].right(), scaff_doc[j], hit_doc_in_region);
304                     }
305                 }
306             }
307 		} //for each scaffold
308 	} //for each intron
309 	for (size_t j = 0; j < hits.size(); ++j)
310 	{
311 		if (!toss[j])
312 		{
313 			filtered_hits.push_back(hits[j]);
314 		}
315 		/*else
316          {
317          if (hits[j].has_intron())
318          {
319 
320          verbose_msg( "\t@@@ Filtering intron scaff [%d-%d]\n", hits[j].left(), hits[j].right());
321          }
322          }
323          */
324 	}
325 
326 	if (cuff_verbose && hits.size()>filtered_hits.size())
327         verbose_msg("\tPre-mRNA flt tossed %lu fragments\n", hits.size() - filtered_hits.size());
328 
329 	hits = filtered_hits;
330 }
331 
filter_hits(int bundle_length,int bundle_left,vector<Scaffold> & hits)332 void filter_hits(int bundle_length,
333 				 int bundle_left,
334 				 vector<Scaffold>& hits)
335 {
336 
337 	pre_mrna_filter(bundle_length, bundle_left, hits);
338 
339 	vector<float> depth_of_coverage(bundle_length+1,0);
340 	vector<double> scaff_doc;
341 	map<pair<int,int>, float> intron_doc;
342 	vector<Scaffold> filtered_hits;
343 	vector<bool> toss(hits.size(), false);
344 
345 	// Make sure the avg only uses stuff we're sure isn't pre-mrna fragments
346 	double bundle_avg_doc = compute_doc(bundle_left,
347 										hits,
348 										depth_of_coverage,
349 										intron_doc,
350 										true);
351 
352 	// recompute the real DoCs
353 	/* not needed, vectors are not changed
354 	compute_doc(bundle_left,
355 				hits,
356 				depth_of_coverage,
357 				intron_doc,
358 				false);
359 	*/
360 
361 	record_min_doc_for_scaffolds(bundle_left,
362 								 hits,
363 								 depth_of_coverage,
364 								 intron_doc,
365 								 scaff_doc);
366 
367 	//double bundle_avg_thresh = min_isoform_fraction * bundle_avg_doc;
368 
369 	if (!intron_doc.empty())
370 	{
371 		double intron_avg_doc = major_isoform_intron_doc(intron_doc);
372 		double intron_multiplier = intron_avg_doc / bundle_avg_doc;
373 
374 		// we don't want this to be more than 1.0 ...
375 		intron_multiplier = min(intron_avg_doc, 1.0);
376 		//bundle_avg_thresh = min_isoform_fraction * bundle_avg_doc;
377 
378 		set<pair<int, int> > tossed_introns;
379 		for(map<pair<int, int>, float>::const_iterator itr = intron_doc.begin();
380 			itr != intron_doc.end();
381 			++itr)
382 		{
383 			for (size_t j = 0; j < hits.size(); ++j)
384 			{
385 				if (hits[j].is_ref())
386                 {
387 					continue;
388                 }
389 				int i_left = itr->first.first;
390 				int i_right = itr->first.second;
391 				int j_match_len = hits[j].match_length(i_left, i_right);
392 				if (j_match_len > 0)
393 				{
394 					double idoc = itr->second;
395 					double doc = scaff_doc[j];
396 
397 					if (!hits[j].has_intron() &&
398 						doc < pre_mrna_fraction * (idoc * intron_multiplier))
399                     {
400 						toss[j] = true;
401                     }
402 
403 					const vector<AugmentedCuffOp>& ops = hits[j].augmented_ops();
404 
405 					unsigned int num_mismatches = 0;
406 					assert (hits[j].mate_hits().size() == 1);
407 					const MateHit& hit = **(hits[j].mate_hits().begin());
408 					num_mismatches = hit.edit_dist();
409 
410 					double percent_mismatches = num_mismatches / (double)hits[j].length();
411 
412 					bool intron_pokin_read = false;
413 
414 					const AugmentedCuffOp& first = ops.front();
415 					// intron                    =========================
416 					// hit                     ******************
417 					if (first.g_left() < i_left && first.g_right() > i_left  && first.g_right() < i_right)
418 					{
419 						intron_pokin_read = true;
420 					}
421 
422 					// intron          =========================
423 					// hit                      ******************
424 					if (first.g_left() < i_right && first.g_right() > i_right  && first.g_left() > i_left)
425 					{
426 						intron_pokin_read = true;
427 					}
428 
429 					const AugmentedCuffOp& last = ops.back();
430 					// intron          =========================
431 					// hit                     ******************
432 					if (last.g_left() < i_left && last.g_right() > i_left  && last.g_right() < i_right)
433 					{
434 						intron_pokin_read = true;
435 					}
436 
437 					// intron          =========================
438 					// hit                            ******************
439 					if (last.g_left() < i_right && last.g_right() > i_right  && last.g_left() > i_left)
440 					{
441 						intron_pokin_read = true;
442 					}
443 
444 					if (intron_pokin_read)
445 					{
446 						double fraction;
447 //						if (!hits[j].has_intron())
448 //						{
449 //							fraction = (3 * pre_mrna_fraction) + percent_mismatches;
450 //						}
451 //						else
452 						{
453 							fraction = pre_mrna_fraction + percent_mismatches;
454 						}
455 						double thresh = fraction * (intron_avg_doc * intron_multiplier);
456 						if (doc < thresh)
457 						{
458 							toss[j] = true;
459 //							if (hits[j].has_intron())
460 //							{
461 //								fprintf(stderr, "\t^^^Filtering intron scaff [%d-%d]\n", hits[j].left(), hits[j].right());
462 //							}
463 						}
464 					}
465 				}
466 			}
467 		}
468 	}
469 
470 	for (size_t j = 0; j < hits.size(); ++j)
471 	{
472 		if (!toss[j])
473 		{
474 			filtered_hits.push_back(hits[j]);
475 //#if verbose_msg
476 //			if (hits[j].has_intron())
477 //			{
478 //
479 //				fprintf(stderr, "KEEPING intron scaff [%d-%d]\n", hits[j].left(), hits[j].right());
480 //			}
481 //#endif
482 		}
483 		else
484 		{
485 			if (hits[j].has_intron())
486 			{
487 
488 				verbose_msg("\t!!!Filtering intron scaff [%d-%d]\n", hits[j].left(), hits[j].right());
489 			}
490 		}
491 	}
492 
493 //#if verbose_msg
494 //	fprintf(stderr, "\tInitial filter pass complete\n");
495 //#endif
496 
497 	hits = filtered_hits;
498 
499 	scaff_doc.clear();
500 	filtered_hits.clear();
501 
502 	toss = vector<bool>(hits.size(), false);
503 
504 	map<pair<int, int>, float> dummy;
505 	bundle_avg_doc = compute_doc(bundle_left,
506 								 hits,
507 								 depth_of_coverage,
508 								 dummy,
509 								 false);
510 
511 //#if verbose_msg
512 //	fprintf(stderr, "\tUpdated avg bundle doc = %lf\n", bundle_avg_doc);
513 //#endif
514 
515 	record_doc_for_scaffolds(bundle_left,
516 							 hits,
517 							 depth_of_coverage,
518 							 intron_doc,
519 							 scaff_doc);
520 
521 
522 
523 //#if verbose_msg
524 //    double bundle_thresh = pre_mrna_fraction * bundle_avg_doc;
525 //	fprintf(stderr, "\tthreshold is = %lf\n", bundle_thresh);
526 //#endif
527 
528 	if (!intron_doc.empty())
529 	{
530 //		filter_introns(bundle_length,
531 //					   bundle_left,
532 //					   hits,
533 //					   min_isoform_fraction,
534 //					   true,
535 //					   true);
536 		if (bundle_avg_doc > 3000)
537 		{
538 			filter_introns(bundle_length,
539 						   bundle_left,
540 						   hits,
541 						   min_isoform_fraction,
542 						   true,
543 						   false);
544 		}
545 	}
546 
547 	for (size_t j = 0; j < hits.size(); ++j)
548 	{
549 		if (!toss[j])
550 		{
551 			filtered_hits.push_back(hits[j]);
552 //#if verbose_msg
553 //			if (hits[j].has_intron())
554 //			{
555 //
556 //				fprintf(stderr, "KEEPING intron scaff [%d-%d]\n", hits[j].left(), hits[j].right());
557 //			}
558 //#endif
559 		}
560 		else
561 		{
562 			if (hits[j].has_intron())
563 			{
564 				verbose_msg("\t***Filtering intron scaff [%d-%d]\n", hits[j].left(), hits[j].right());
565 			}
566 		}
567 	}
568 
569 	//fprintf(stderr, "\tTossed %d hits as noise\n", (int)hits.size() - filtered_hits.size());
570 
571 	hits = filtered_hits;
572 }
573 
574 
filter_junk_isoforms(vector<boost::shared_ptr<Abundance>> & transcripts,vector<double> & abundances,const vector<boost::shared_ptr<Abundance>> & mapped_transcripts,double locus_mass)575 void filter_junk_isoforms(vector<boost::shared_ptr<Abundance> >& transcripts,
576 						  vector<double>& abundances,
577                           const vector<boost::shared_ptr<Abundance> >& mapped_transcripts,
578                           double locus_mass)
579 {
580 	//	vector<double>::iterator max_ab = std::max_element(abundances.begin(),
581 	//													   abundances.end());
582 	double max_fwd_ab = -1.0;
583 	double max_rev_ab = -1.0;
584 
585 	for (size_t t = 0; t < transcripts.size(); ++t)
586 	{
587 		boost::shared_ptr<Scaffold> scaff = transcripts[t]->transfrag();
588 		if (scaff->strand() == CUFF_FWD || scaff->strand() == CUFF_STRAND_UNKNOWN)
589 		{
590 			if (abundances[t] > max_fwd_ab)
591 				max_fwd_ab = abundances[t];
592 		}
593 		if (scaff->strand() == CUFF_REV || scaff->strand() == CUFF_STRAND_UNKNOWN)
594 		{
595 			if (abundances[t] > max_rev_ab)
596 				max_rev_ab = abundances[t];
597 		}
598 	}
599 
600 	// Try to categorize the crap transcripts for suppression
601 	vector<bool> pre_mrna_junk(transcripts.size(), false); //intra-intron, much lower abundance than container
602 	vector<bool> chaff(transcripts.size(), false); // only a single MateHit, impossible to reliably quantitate
603 	vector<bool> repeats(transcripts.size(), false); // too many low-quality hits
604 	vector<bool> too_rare(transcripts.size(), false); // too rare to be reliably quantitated, could be error
605 
606 	//cerr << "Chucked : ";
607 	for (size_t t = 0; t < transcripts.size(); ++t)
608 	{
609 		boost::shared_ptr<Scaffold> scaff = transcripts[t]->transfrag();
610 
611 		if (!(scaff->is_ref()) && allow_junk_filtering)
612 		{
613 			const vector<const MateHit*> hits = scaff->mate_hits();
614 
615 			const vector<AugmentedCuffOp>& ops = scaff->augmented_ops();
616 
617 			if (ops.size() == 1 && ops[0].opcode == CUFF_MATCH)
618 			{
619 				for (size_t j = 0; j < transcripts.size(); ++j)
620 				{
621 					const vector<AugmentedCuffOp>& j_ops = scaff->augmented_ops();
622 					for (size_t L = 0; L < j_ops.size(); L++)
623 					{
624 						if (AugmentedCuffOp::overlap_in_genome(ops[0], j_ops[L]) &&
625 							j_ops[L].opcode == CUFF_INTRON)
626 						{
627 							pre_mrna_junk[t] = true;
628 						}
629 					}
630 				}
631 			}
632 
633             if (library_type != "transfrags")
634             {
635                 double multi_hits = 0.0;
636                 //static const double low_qual_err_prob = high_phred_err_prob; // hits with error_prob() above this are low quality;
637                 //static const double low_qual_thresh = 0.75; // hits with more than this fraction of low qual hits are repeats
638                 for (vector<const MateHit*>::const_iterator itr = hits.begin();
639                      itr != hits.end();
640                      ++itr)
641                 {
642                     if ((*itr)->is_multi())
643                         multi_hits += 1.0;
644                 }
645 
646                 double low_qual_frac = multi_hits / (double)hits.size();
647                 if (low_qual_frac > max_multiread_fraction)
648                     repeats[t] = true;
649             }
650             if (scaff->strand() == CUFF_FWD &&
651                 (abundances[t] / max_fwd_ab) < min_isoform_fraction)
652                 too_rare[t] = true;
653             if ((scaff->strand() == CUFF_REV ||  scaff->strand() == CUFF_STRAND_UNKNOWN) &&
654                 (abundances[t] / max_rev_ab) < min_isoform_fraction)
655                 too_rare[t] = true;
656 
657             const vector<double>* cond_probs = (mapped_transcripts[t]->cond_probs());
658             if (cond_probs)
659             {
660                 assert (library_type != "transfrags");
661                 double supporting_hits = abundances[t] * locus_mass;
662                 if (supporting_hits < min_frags_per_transfrag)
663                     chaff[t] = true;
664             }
665 		}
666 	}
667 
668 	vector<boost::shared_ptr<Abundance> > non_junk_transcripts;
669 	vector<double> non_junk_abundances;
670 	for (size_t t = 0; t < transcripts.size(); ++t)
671 	{
672 		if (!repeats[t] && !pre_mrna_junk[t] && !too_rare[t] && !chaff[t])
673 		{
674 			non_junk_transcripts.push_back(transcripts[t]);
675 			non_junk_abundances.push_back(abundances[t]);
676 		}
677         else
678         {
679             verbose_msg( "Filtering isoform %d-%d\n", transcripts[t]->transfrag()->left(), transcripts[t]->transfrag()->right());
680         }
681 	}
682 
683 	transcripts = non_junk_transcripts;
684 	abundances = non_junk_abundances;
685 }
686 
687 // Designed to strip out remaining pre-mrna genes, assembled repeats, and
688 // fragments from isoforms too short to be reliably quantitated.
filter_junk_genes(vector<Gene> & genes)689 void filter_junk_genes(vector<Gene>& genes)
690 {
691 	vector<Gene> good_genes;
692 	vector<Isoform> all_isoforms;
693 	for (size_t i = 0; i < genes.size(); ++i)
694 	{
695 		all_isoforms.insert(all_isoforms.end(),
696 							genes[i].isoforms().begin(),
697 							genes[i].isoforms().end());
698 	}
699 
700 	for (size_t i = 0; i < genes.size(); ++i)
701 	{
702 		const Gene& g = genes[i];
703 
704 		if(g.has_ref_trans())
705 		{
706 			good_genes.push_back(g);
707 			continue;
708 		}
709 
710 		bool good_gene = true;
711 		for (size_t j = 0; j < all_isoforms.size(); ++j)
712 		{
713 			vector<pair<int, int> > introns = all_isoforms[j].scaffold().gaps();
714 
715             //assert (!allow_junk_filtering || all_isoforms[j].scaffold().mate_hits().size() >= min_frags_per_transfrag);
716 			for (size_t k = 0; k < introns.size(); ++k)
717 			{
718 				if (g.left() > introns[k].first && g.right() < introns[k].second &&
719 					g.FPKM() / all_isoforms[j].FPKM() < pre_mrna_fraction)
720 				{
721 					good_gene = false;
722 				}
723 			}
724 		}
725         if (allow_junk_filtering)
726         {
727             if (g.FPKM() == 0)
728             {
729                 good_gene = false;
730             }
731         }
732 		if (good_gene)
733         {
734 			good_genes.push_back(g);
735         }
736         else
737         {
738             verbose_msg("Filtering transfrags from gene %d-%d\n", g.left(), g.right());
739         }
740 	}
741 
742 	genes = good_genes;
743 
744 }
745 
clip_by_3_prime_dropoff(vector<Scaffold> & scaffolds)746 void clip_by_3_prime_dropoff(vector<Scaffold>& scaffolds)
747 {
748     vector<pair<double, Scaffold*> > three_prime_ends;
749 
750     if (library_type != "transfrags")
751     {
752         BOOST_FOREACH (Scaffold& scaff, scaffolds)
753         {
754             if (!(scaff.strand() == CUFF_FWD || scaff.strand() == CUFF_REV))
755                 continue;
756 
757             int scaff_len = scaff.length();
758             vector<double> coverage(scaff_len, 0.0);
759 
760             double total = 0;
761             BOOST_FOREACH(const MateHit* hit, scaff.mate_hits())
762             {
763                 int start, end, frag_len;
764                 if (!scaff.map_frag(*hit, start, end, frag_len)) continue;
765 
766                 if (scaff.strand() == CUFF_REV)
767                 {
768                     start = scaff_len - 1 - start;
769                     end = scaff_len - 1 - end;
770                     swap(start, end);
771                 }
772 
773                 for(int i = start; i <= end; ++i)
774                 {
775                     coverage[i] += hit->mass();
776                     total += hit->mass();
777                 }
778             }
779             double avg_cov = total/scaff_len;
780     //        if (avg_cov < trim_3_avgcov_thresh)
781     //            continue;
782 
783             const AugmentedCuffOp* exon_3 = NULL;
784             int mult;
785             int offset;
786 
787             if (scaff.strand() == CUFF_REV)
788             {
789                 mult = 1;
790                 offset = 0;
791                 exon_3 = &scaff.augmented_ops().front(); //last exon at 3' end
792             }
793             else if (scaff.strand() == CUFF_FWD)
794             {
795                 mult = -1;
796                 offset = scaff_len - 1;
797                 exon_3 = &scaff.augmented_ops().back();//last exon at 3' end
798             }
799             else
800             {
801                 continue;
802             }
803 
804             int to_remove;
805             double min_cost = numeric_limits<double>::max();
806             double mean_to_keep = 0.0;
807             double mean_to_trim = 0.0;
808             double tmp_mean_to_trim = 0.0;
809             double tmp_mean_to_keep = 0.0;
810             double tmp_mean_3prime = 0.0;
811             for (int i = 0; i < exon_3->genomic_length; i++)
812             {
813                 tmp_mean_3prime += coverage[offset + mult*i];
814             }
815             tmp_mean_3prime /= exon_3->genomic_length;
816 
817             double base_cost = 0.0;
818             for (int i = 0; i < exon_3->genomic_length; i++)
819             {
820                 double d = (coverage[offset + mult*i] - tmp_mean_3prime);
821                 d *= d;
822                 base_cost += d;
823             }
824             base_cost /= exon_3->genomic_length;
825 
826             size_t min_cost_x = -1;
827             for (to_remove = 1; to_remove < exon_3->genomic_length - 1; to_remove++)
828             {
829                 tmp_mean_to_trim = 0.0;
830                 tmp_mean_to_keep = 0.0;
831                 for (int i = 0; i < exon_3->genomic_length; i++)
832                 {
833                     if (i <= to_remove)
834                     {
835                         tmp_mean_to_trim += coverage[offset + mult*i];
836                     }
837                     else
838                     {
839                         tmp_mean_to_keep += coverage[offset + mult*i];
840                     }
841                 }
842 
843                 tmp_mean_to_trim /= to_remove;
844                 tmp_mean_to_keep /= (exon_3->genomic_length - to_remove);
845 
846                 double tmp_mean_trim_cost = 0.0;
847                 double tmp_mean_keep_cost = 0.0;
848                 for (int i = 0; i < exon_3->genomic_length; i++)
849                 {
850                     if (i <= to_remove)
851                     {
852                         double d = (coverage[offset + mult*i] - tmp_mean_to_trim);
853                         d *= d;
854                         tmp_mean_trim_cost += d;
855                     }
856                     else
857                     {
858                         double d = (coverage[offset + mult*i] - tmp_mean_to_keep);
859                         d *= d;
860                         tmp_mean_keep_cost += d;
861                     }
862                 }
863 
864                 tmp_mean_trim_cost /= to_remove;
865                 tmp_mean_keep_cost /= (exon_3->genomic_length - to_remove);
866 
867                 double new_cost = tmp_mean_trim_cost + tmp_mean_keep_cost;
868 
869                 if (new_cost < min_cost && trim_3_dropoff_frac * tmp_mean_to_keep > tmp_mean_to_trim && new_cost < base_cost && to_remove > scaff_len * 0.05)
870                 {
871                     min_cost = tmp_mean_trim_cost + tmp_mean_keep_cost;
872                     min_cost_x = to_remove;
873                     mean_to_keep = tmp_mean_to_keep;
874                     mean_to_trim = tmp_mean_to_trim;
875                 }
876             }
877 
878             // If trimming reduces the overall mean squared error of the coverage
879             // do it
880             if (avg_cov >= trim_3_avgcov_thresh && min_cost_x < exon_3->genomic_length)
881             {
882                 scaff.trim_3(min_cost_x);
883             }
884 
885             // store the mean squared error for this exon
886             tmp_mean_3prime = 0.0;
887             for (int i = 0; i < exon_3->genomic_length; i++)
888             {
889                 tmp_mean_3prime += coverage[offset + mult*i];
890             }
891             tmp_mean_3prime /= exon_3->genomic_length;
892 
893             base_cost = 0.0;
894             for (int i = 0; i < exon_3->genomic_length; i++)
895             {
896                 double d = (coverage[offset + mult*i] - tmp_mean_3prime);
897                 d *= d;
898                 base_cost += d;
899             }
900             base_cost /= exon_3->genomic_length;
901             three_prime_ends.push_back(make_pair(base_cost, &scaff));
902         }
903     }
904     else
905     {
906         BOOST_FOREACH (Scaffold& scaff, scaffolds)
907         {
908             if (!(scaff.strand() == CUFF_FWD || scaff.strand() == CUFF_REV))
909                 continue;
910 
911             int scaff_len = scaff.length();
912             vector<double> coverage(scaff_len, 0.0);
913 
914             double total = 0;
915             BOOST_FOREACH(const MateHit* hit, scaff.mate_hits())
916             {
917                 int start, end, frag_len;
918                 if (!scaff.map_frag(*hit, start, end, frag_len)) continue;
919 
920                 if (scaff.strand() == CUFF_REV)
921                 {
922                     start = scaff_len - 1 - start;
923                     end = scaff_len - 1 - end;
924                     swap(start, end);
925                 }
926 
927                 for(int i = start; i <= end; ++i)
928                 {
929                     coverage[i] += hit->mass();
930                     total += hit->mass();
931                 }
932             }
933 
934             const AugmentedCuffOp* exon_3 = NULL;
935             int mult;
936             int offset;
937 
938             if (scaff.strand() == CUFF_REV)
939             {
940                 mult = 1;
941                 offset = 0;
942                 exon_3 = &scaff.augmented_ops().front();
943             }
944             else if (scaff.strand() == CUFF_FWD)
945             {
946                 mult = -1;
947                 offset = scaff_len - 1;
948                 exon_3 = &scaff.augmented_ops().back();
949             }
950             else
951             {
952                 continue;
953             }
954 
955             three_prime_ends.push_back(make_pair(scaff.fpkm(), &scaff));
956         }
957 
958     }
959 
960     adjacency_list <vecS, vecS, undirectedS> G;
961 
962 	for (size_t i = 0; i < three_prime_ends.size(); ++i)
963 	{
964 		add_vertex(G);
965 	}
966 
967 	for (size_t i = 0; i < three_prime_ends.size(); ++i)
968 	{
969 		Scaffold* scaff_i = three_prime_ends[i].second;
970 		//assert (scaff_i);
971 
972         const AugmentedCuffOp* scaff_i_exon_3 = NULL;
973 
974         if (scaff_i->strand() == CUFF_REV)
975         {
976             scaff_i_exon_3 = &(scaff_i->augmented_ops().front());
977         }
978         else if (scaff_i->strand() == CUFF_FWD)
979         {
980             scaff_i_exon_3 = &(scaff_i->augmented_ops().back());
981         }
982 
983 		for (size_t j = i + 1; j < three_prime_ends.size(); ++j)
984 		{
985 			Scaffold* scaff_j = three_prime_ends[j].second;
986 
987             if (scaff_i->strand() != scaff_j->strand())
988                 continue;
989 
990             const AugmentedCuffOp* scaff_j_exon_3 = NULL;
991 
992             if (scaff_j->strand() == CUFF_REV)
993             {
994                 scaff_j_exon_3 = &(scaff_j->augmented_ops().front());
995             }
996             else if (scaff_j->strand() == CUFF_FWD)
997             {
998                 scaff_j_exon_3 = &(scaff_j->augmented_ops().back());
999             }
1000 
1001 			if (AugmentedCuffOp::overlap_in_genome(*scaff_j_exon_3, *scaff_i_exon_3) &&
1002                 AugmentedCuffOp::compatible(*scaff_j_exon_3, *scaff_i_exon_3, 0))
1003 				add_edge(i, j, G);
1004 		}
1005 	}
1006 
1007 	std::vector<int> component(num_vertices(G));
1008 	connected_components(G, &component[0]);
1009 
1010 	vector<vector<bool> > clusters(three_prime_ends.size(),
1011 								   vector<bool>(three_prime_ends.size(), false));
1012 
1013 	//vector<vector<size_t> > cluster_indices(three_prime_ends.size());
1014 
1015     vector<vector<pair<double, Scaffold*> > > grouped_scaffolds(three_prime_ends.size());
1016 	for (size_t i = 0; i < three_prime_ends.size(); ++i)
1017 	{
1018 		clusters[component[i]][i] = true;
1019 		grouped_scaffolds[component[i]].push_back(three_prime_ends[i]);
1020 	}
1021 
1022     for (size_t i = 0; i < grouped_scaffolds.size(); ++i)
1023     {
1024         vector<pair<double, Scaffold*> >& group = grouped_scaffolds[i];
1025         sort(group.begin(), group.end());
1026         if (group.empty())
1027             continue;
1028 
1029         Scaffold* group_leader = NULL;
1030         int trim_point = -1;
1031 
1032         const AugmentedCuffOp* group_exon_3 = NULL;
1033         vector<pair<double, Scaffold*> >::iterator l_itr = group.begin();
1034         while (l_itr != group.end())
1035         {
1036             Scaffold* possible_leader = l_itr->second;
1037             bool ok_clip_leader = true;
1038             vector<pair<double, Scaffold*> >::iterator g_itr = group.begin();
1039             const AugmentedCuffOp* l_exon_3 = NULL;
1040             CuffStrand s = possible_leader->strand();
1041 
1042             if (s != CUFF_STRAND_UNKNOWN)
1043             {
1044                 if (s == CUFF_REV)
1045                     l_exon_3 = &(possible_leader->augmented_ops().front());
1046                 else
1047                     l_exon_3 = &(possible_leader->augmented_ops().back());
1048                 for (; g_itr != group.end(); ++g_itr)
1049                 {
1050                     const AugmentedCuffOp* g_exon_3 = NULL;
1051                     if (s == CUFF_REV)
1052                     {
1053                         //  bad:
1054                         //              leader
1055                         //    follower
1056                         g_exon_3 = &(g_itr->second->augmented_ops().front());
1057                         if (g_exon_3->g_right() <= l_exon_3->g_left())
1058                             ok_clip_leader = false;
1059 
1060                         // for meta-assembly libraries, don't ever allow clipping, just extension
1061                         // bad:
1062                         //             leader
1063                         //         follower
1064                         if (library_type == "transfrags" &&
1065                             g_exon_3->g_left() < l_exon_3->g_left())
1066                             ok_clip_leader = false;
1067                     }
1068                     else
1069                     {
1070                         //  bad:
1071                         //          follower
1072                         //  leader
1073                         g_exon_3 = &(g_itr->second->augmented_ops().back());
1074                         if (g_exon_3->g_left() >= l_exon_3->g_right())
1075                             ok_clip_leader = false;
1076 
1077                         // for meta-assembly libraries, don't ever allow clipping, just extension
1078                         // bad:
1079                         //             leader
1080                         //                follower
1081                         if (library_type == "transfrags" &&
1082                             g_exon_3->g_right() > l_exon_3->g_right())
1083                             ok_clip_leader = false;
1084                     }
1085                 }
1086             }
1087             else
1088             {
1089                 ok_clip_leader = false;
1090             }
1091 
1092             if (ok_clip_leader)
1093             {
1094                 if (s == CUFF_REV)
1095                 {
1096                     if (trim_point == -1)
1097                         trim_point = l_exon_3->g_left();
1098                     else if (l_exon_3->g_left() < trim_point)
1099                         ok_clip_leader = false;
1100                 }
1101                 else
1102                 {
1103                     if (trim_point == -1)
1104                         trim_point = l_exon_3->g_right();
1105                     else if (l_exon_3->g_right() > trim_point)
1106                         ok_clip_leader = false;
1107                 }
1108             }
1109 
1110             if (ok_clip_leader)
1111             {
1112                 group_leader = possible_leader;
1113                 group_exon_3 = l_exon_3;
1114                 break;
1115             }
1116             ++l_itr;
1117         }
1118 
1119         if (!group_leader || !group_exon_3)
1120             continue;
1121 
1122         for (size_t j = 0; j < group.size(); ++j)
1123         {
1124             const AugmentedCuffOp* exon_3 = NULL;
1125             int end_diff = 0;
1126             if (group_leader->strand() == CUFF_REV)
1127             {
1128                 exon_3 = &(group[j].second->augmented_ops().front());
1129                 end_diff = group_exon_3->g_left() - exon_3->g_left();
1130             }
1131             else
1132             {
1133                 exon_3 = &(group[j].second->augmented_ops().back());
1134                 end_diff = exon_3->g_right() - group_exon_3->g_right();
1135             }
1136 
1137             if (end_diff > 0)
1138             {
1139                 // leader
1140                 //     follower
1141                 group[j].second->trim_3(end_diff);
1142             }
1143             else if (end_diff < 0)
1144             {
1145                 //        leader
1146                 //   follower
1147                 group[j].second->extend_3(-end_diff);
1148             }
1149         }
1150     }
1151 
1152 
1153 	return;
1154 
1155 }
1156