1 #ifndef CLUSTERING_H 2 #define CLUSTERING_H 3 /* 4 * abundances.h 5 * cufflinks 6 * 7 * Created by Cole Trapnell on 4/27/09. 8 * Copyright 2009 Cole Trapnell. All rights reserved. 9 * 10 */ 11 12 #ifdef HAVE_CONFIG_H 13 #include <config.h> 14 #endif 15 16 // DON'T move this, or mystery compiler errors will result. Affects gcc >= 4.1 17 #include <boost/graph/vector_as_graph.hpp> 18 19 #include <boost/graph/adjacency_list.hpp> 20 #include <boost/graph/depth_first_search.hpp> 21 #include <boost/graph/visitors.hpp> 22 #include <boost/graph/graph_traits.hpp> 23 #include <boost/graph/connected_components.hpp> 24 25 #ifdef DEBUG 26 #include <boost/numeric/ublas/io.hpp> 27 #endif 28 29 #include <boost/version.hpp> 30 31 #if (BOOST_VERSION < 103800) 32 #include <boost/vector_property_map.hpp> 33 #else 34 #include <boost/property_map/vector_property_map.hpp> 35 #endif 36 37 38 #include "abundances.h" 39 40 typedef boost::adjacency_list <boost::vecS, boost::vecS, boost::undirectedS> AbundanceGraph; 41 42 struct ConnectByExonOverlap 43 { 44 void operator()(const AbundanceGroup& cluster, 45 AbundanceGraph& G); 46 }; 47 48 struct ConnectByAnnotatedGeneId 49 { 50 void operator()(const AbundanceGroup& cluster, 51 AbundanceGraph& G); 52 }; 53 54 struct ConnectByAnnotatedTssId 55 { 56 void operator()(const AbundanceGroup& cluster, 57 AbundanceGraph& G); 58 }; 59 60 struct ConnectByAnnotatedProteinId 61 { 62 void operator()(const AbundanceGroup& cluster, 63 AbundanceGraph& G); 64 }; 65 66 struct ConnectByStrand 67 { 68 void operator()(const AbundanceGroup& cluster, 69 AbundanceGraph& G); 70 }; 71 72 // A "transcript cluster is a set of transcripts whose projections into the 73 // genome overlap on the same strand. They may thus share fragment alignments, 74 // and so they need to be quantitated together. After quantitation, they 75 // can be picked apart. 76 template<class cluster_policy> 77 void cluster_transcripts(const AbundanceGroup& transfrags, 78 vector<AbundanceGroup>& transfrags_by_cluster, 79 ublas::matrix<double>* new_gamma = NULL, 80 ublas::matrix<double>* new_iterated_count = NULL, 81 ublas::matrix<double>* new_count = NULL, 82 ublas::matrix<double>* new_fpkm = NULL) 83 { 84 boost::adjacency_list <boost::vecS, boost::vecS, boost::undirectedS> G; 85 86 transfrags_by_cluster.clear(); 87 88 cluster_policy cp; 89 90 cp(transfrags, G); 91 92 std::vector<int> component(num_vertices(G)); 93 connected_components(G, &component[0]); 94 95 vector<vector<bool> > clusters(transfrags.abundances().size(), 96 vector<bool>(transfrags.abundances().size(), false)); 97 98 vector<vector<size_t> > cluster_indices(transfrags.abundances().size()); 99 for (size_t i = 0; i < transfrags.abundances().size(); ++i) 100 { 101 clusters[component[i]][i] = true; 102 cluster_indices[component[i]].push_back(i); 103 } 104 for (size_t i = 0; i < cluster_indices.size(); ++i) 105 { 106 if (cluster_indices[i].empty()) 107 { 108 cluster_indices.resize(i); 109 break; 110 } 111 } 112 113 for (size_t i = 0; i < clusters.size(); ++i) 114 { 115 AbundanceGroup cluster; 116 transfrags.filter_group(clusters[i], cluster); 117 if (!cluster.abundances().empty()) 118 transfrags_by_cluster.push_back(cluster); 119 } 120 121 if (new_gamma != NULL) 122 { 123 const ublas::matrix<double>& trans_gamma_cov = transfrags.gamma_cov(); 124 const ublas::matrix<double>& trans_iterated_count_cov = transfrags.iterated_count_cov(); 125 const ublas::matrix<double>& trans_count_cov = transfrags.count_cov(); 126 const ublas::matrix<double>& trans_fpkm_cov = transfrags.fpkm_cov(); 127 128 ublas::matrix<double>& cov = *new_gamma; 129 ublas::matrix<double>& iterated_count_cov = *new_iterated_count; 130 ublas::matrix<double>& count_cov = *new_count; 131 ublas::matrix<double>& fpkm_cov = *new_fpkm; 132 133 // number of primary transcripts for this gene 134 size_t num_pt = cluster_indices.size(); 135 cov = ublas::zero_matrix<double>(num_pt, num_pt); 136 137 count_cov = ublas::zero_matrix<double>(num_pt, num_pt); 138 iterated_count_cov = ublas::zero_matrix<double>(num_pt, num_pt); 139 fpkm_cov = ublas::zero_matrix<double>(num_pt, num_pt); 140 141 //cerr << "combined " << combined << endl; 142 143 //cerr << "locus isoform gamma cov" << gamma_cov << endl; 144 for (size_t L = 0; L < cluster_indices.size(); ++L) 145 { 146 const vector<size_t>& L_isos = cluster_indices[L]; 147 for (size_t K = 0; K < cluster_indices.size(); ++K) 148 { 149 const vector<size_t>& K_isos = cluster_indices[K]; 150 for (size_t l = 0; l < L_isos.size(); ++l) 151 { 152 for (size_t k = 0; k < K_isos.size(); ++k) 153 { 154 cov(L,K) += trans_gamma_cov(L_isos[l],K_isos[k]); 155 count_cov(L,K) += trans_count_cov(L_isos[l],K_isos[k]); 156 iterated_count_cov(L,K) += trans_iterated_count_cov(L_isos[l],K_isos[k]); 157 fpkm_cov(L,K) += trans_fpkm_cov(L_isos[l],K_isos[k]); 158 } 159 } 160 } 161 } 162 } 163 } 164 165 #endif 166 167