1 /*
2 * gtf_to_sam.cpp
3 * Cufflinks
4 *
5 * Created by Cole Trapnell on 8/1/10.
6 * Copyright 2009 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #include <stdlib.h>
11 #include <getopt.h>
12 #include <string>
13 #include <algorithm>
14
15 #include <boost/version.hpp>
16 #include <boost/graph/adjacency_list.hpp>
17 #include <boost/graph/depth_first_search.hpp>
18 #include <boost/graph/visitors.hpp>
19 #include <boost/graph/graph_traits.hpp>
20 #include <boost/graph/connected_components.hpp>
21
22 #include "common.h"
23 #include "hits.h"
24 #include "bundles.h"
25
26 #include "gtf_tracking.h"
27 #include "scaffolds.h"
28 #include "tokenize.h"
29 #include "genes.h"
30
31 using namespace boost;
32 using namespace std;
33
34 #if ENABLE_THREADS
35 const char *short_options = "r:F";
36 #else
37 const char *short_options = "r:F";
38 #endif
39
40 bool raw_fpkm = false;
41 bool proj_union = false;
42 bool proj_intersection = false;
43
44 static struct option long_options[] = {
45 {"reference-seq", required_argument, 0, 'r'},
46 {"raw-fpkm", no_argument, 0, 'F'},
47 {"union", no_argument, 0, 'U'},
48 {"intersection", no_argument, 0, 'I'},
49
50
51 {0, 0, 0, 0} // terminator
52 };
53
print_usage()54 void print_usage()
55 {
56 //NOTE: SPACES ONLY, bozo
57 fprintf(stderr, "compress_gtf v%s\n", PACKAGE_VERSION);
58 fprintf(stderr, "linked against Boost version %d\n", BOOST_VERSION);
59 fprintf(stderr, "-----------------------------\n");
60 fprintf(stderr, "Usage: compress_gtf [options] <reference.gtf> <compressed_reference.gtf>\n");
61 fprintf(stderr, "Options:\n\n");
62 fprintf(stderr, "-r/--reference-seq reference fasta file [ default: NULL ]\n");
63 fprintf(stderr, "-F/--raw-fpkm use FPKM instead of isoform fraction \n");
64 fprintf(stderr, "-U/--union report projective union [ default: OFF ]\n");
65 fprintf(stderr, "-I/--intersection report projective intersection [ default: ON ]\n");
66 }
67
parse_options(int argc,char ** argv)68 int parse_options(int argc, char** argv)
69 {
70 int option_index = 0;
71 int next_option;
72 do {
73 next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
74 switch (next_option) {
75 case -1: /* Done with options. */
76 break;
77 case 'r':
78 {
79 fasta_dir = optarg;
80 break;
81 }
82 case 'F':
83 {
84 raw_fpkm = true;
85 break;
86 }
87 case 'U':
88 {
89 proj_union = true;
90 break;
91 }
92 case 'I':
93 {
94 proj_intersection = true;
95 break;
96 }
97 default:
98 print_usage();
99 return 1;
100 }
101 } while(next_option != -1);
102
103 if (proj_union && proj_intersection)
104 {
105 fprintf (stderr, "Error: please specify only one of --union and --intersection");
106 exit(1);
107 }
108
109 // if (!proj_union && !proj_intersection)
110 // proj_intersection = true;
111 return 0;
112 }
113
compress_genes(FILE * ftranscripts,RefSequenceTable & rt,vector<boost::shared_ptr<Scaffold>> & ref_mRNAs)114 void compress_genes(FILE* ftranscripts,
115 RefSequenceTable& rt,
116 vector<boost::shared_ptr<Scaffold> >& ref_mRNAs)
117 {
118 adjacency_list <vecS, vecS, undirectedS> G;
119
120 for (size_t i = 0; i < ref_mRNAs.size(); ++i)
121 {
122 add_vertex(G);
123 }
124
125 for (size_t i = 0; i < ref_mRNAs.size(); ++i)
126 {
127 boost::shared_ptr<Scaffold> scaff_i = ref_mRNAs[i];
128 for (size_t j = 0; j < ref_mRNAs.size(); ++j)
129 {
130 boost::shared_ptr<Scaffold> scaff_j = ref_mRNAs[j];
131 if (scaff_i->annotated_gene_id() == scaff_j->annotated_gene_id())
132 add_edge(i, j, G);
133 }
134 }
135
136 std::vector<int> component(num_vertices(G));
137 connected_components(G, &component[0]);
138
139 vector<vector<bool> > clusters(ref_mRNAs.size(),
140 vector<bool>(ref_mRNAs.size(), false));
141
142 //vector<vector<size_t> > cluster_indices(three_prime_ends.size());
143
144 vector<vector<boost::shared_ptr<Scaffold> > > grouped_scaffolds(ref_mRNAs.size());
145 for (size_t i = 0; i < ref_mRNAs.size(); ++i)
146 {
147 clusters[component[i]][i] = true;
148 grouped_scaffolds[component[i]].push_back(ref_mRNAs[i]);
149 }
150
151 for (size_t i = 0; i < grouped_scaffolds.size(); ++i)
152 {
153 vector<boost::shared_ptr<Scaffold> >& gene = grouped_scaffolds[i];
154 vector<Scaffold> gene_scaffs;
155 string gene_id;
156 BOOST_FOREACH (boost::shared_ptr<Scaffold> s, gene)
157 {
158 if (gene_id == "")
159 gene_id = s->annotated_gene_id();
160
161 gene_scaffs.push_back(*s);
162 }
163
164 if (gene_scaffs.empty())
165 continue;
166
167 next_gene_id++;
168
169 Scaffold smashed_gene;
170 if (!proj_intersection && !proj_union)
171 {
172 BOOST_FOREACH (boost::shared_ptr<Scaffold> s, gene)
173 {
174 /*
175 *transfrag,
176 gene_id,
177 (int)isoforms.size() + 1,
178 FPKM,
179 iso_ab->effective_length(),
180 iso_ab->gamma(),
181 iso_ab->FPKM_conf(),
182 density_per_bp,
183 estimated_count,
184 density_score,
185 iso_ab->status(),
186 ref_gene_id)*/
187
188 Isoform iso(*s,
189 -1,
190 1,
191 0.0,
192 s->length(),
193 0.0,
194 ConfidenceInterval(0.0,0.0),
195 0,
196 0,
197 0,
198 NUMERIC_OK,
199 gene_id);
200 vector<string> isoform_exon_recs;
201
202 iso.get_gtf(isoform_exon_recs, rt);
203
204 for (size_t g = 0; g < isoform_exon_recs.size(); ++g)
205 {
206 fprintf(ftranscripts, "%s", isoform_exon_recs[g].c_str());
207 }
208 }
209 }
210 else
211 {
212 if (proj_union)
213 Scaffold::merge(gene_scaffs, smashed_gene, false);
214 else if (proj_intersection)
215 {
216 vector<AugmentedCuffOp> iso_ops;
217
218 int gmax = -1;
219 int gmin = numeric_limits<int>::max();
220
221 BOOST_FOREACH (boost::shared_ptr<Scaffold> s, gene)
222 {
223 //iso_ops.push_back(s->augmented_ops());
224 //sort (iso_ops.back().begin(), iso_ops.back().end());
225 if (s->left() < gmin)
226 gmin = s->left();
227 if (s->right() > gmax)
228 gmax = s->right();
229 }
230
231 BOOST_FOREACH (boost::shared_ptr<Scaffold> s, gene)
232 {
233 if (s->left() > gmin)
234 {
235 iso_ops.push_back(AugmentedCuffOp(CUFF_INTRON, gmin, s->left() - gmin));
236 }
237 if (s->right() < gmax)
238 {
239 iso_ops.push_back(AugmentedCuffOp(CUFF_INTRON, s->right(), gmax - s->right()));
240 }
241 iso_ops.insert(iso_ops.end(), s->augmented_ops().begin(), s->augmented_ops().end());
242 }
243 // vector<AugmentedCuffOp> intersect = iso_ops.front();
244 // for (size_t j = 1; j < iso_ops.size(); ++j)
245 // {
246 // vector<AugmentedCuffOp> tmp;
247 // const vector<AugmentedCuffOp>& iso_ops_j = iso_ops[j];
248 // //set_intersection(intersect.begin(), intersect.end(), iso_ops_j.begin(), iso_ops_j.end(), back_inserter(tmp));
249 // intersect.insert(intersect.end(), iso_ops_j.begin(), iso_ops_j.end());
250 //
251 // intersect.push_back(
252 // assert (tmp.size() <= intersect.size());
253 // //intersect = tmp;
254 // //sort(intersect.begin(), intersect.end());
255 // }
256 //
257 sort(iso_ops.begin(), iso_ops.end(), AugmentedCuffOp::g_left_lt);
258 //
259 // while (!intersect.empty() && intersect.front().opcode != CUFF_MATCH)
260 // {
261 // intersect.erase(intersect.begin());
262 // }
263 //
264 // while (!intersect.empty() && intersect.back().opcode != CUFF_MATCH)
265 // {
266 // intersect.pop_back();
267 // }
268 //
269 // if (intersect.empty())
270 // continue;
271
272 vector<AugmentedCuffOp> merged_ops;
273 AugmentedCuffOp::merge_ops(iso_ops, merged_ops, true, true);
274 vector<AugmentedCuffOp>::iterator first_match = merged_ops.begin();
275 vector<AugmentedCuffOp>::iterator last_match = merged_ops.end();
276 last_match--;
277 while(first_match < merged_ops.end())
278 {
279 if (first_match->opcode == CUFF_MATCH)
280 break;
281 first_match++;
282 }
283 while(last_match >= merged_ops.begin() && last_match< merged_ops.end())
284 {
285 if (last_match->opcode == CUFF_MATCH)
286 break;
287 last_match--;
288 }
289
290 vector<AugmentedCuffOp> internal_matches;
291 if (last_match >= first_match && last_match < merged_ops.end())
292 {
293 last_match++;
294
295 internal_matches.insert(internal_matches.end(), first_match, last_match);
296 smashed_gene = Scaffold(gene.front()->ref_id(), gene.front()->strand(), internal_matches);
297 }
298 else
299 {
300
301 fprintf(stderr, "Could not find consitutive region for %s\n", gene_id.c_str());
302 continue;
303 }
304
305 }
306 else
307 assert(false);
308 assert (smashed_gene.ref_id());
309
310 Isoform iso(smashed_gene,
311 -1,
312 1,
313 0.0,
314 smashed_gene.length(),
315 0.0,
316 ConfidenceInterval(0.0,0.0),
317 0,
318 0,
319 0,
320 NUMERIC_OK,
321 gene_id);
322 vector<string> isoform_exon_recs;
323
324 iso.get_gtf(isoform_exon_recs, rt);
325
326 for (size_t g = 0; g < isoform_exon_recs.size(); ++g)
327 {
328 fprintf(ftranscripts, "%s", isoform_exon_recs[g].c_str());
329 }
330 }
331
332 fflush(ftranscripts);
333 }
334 }
335
driver(vector<FILE * > ref_gtf_files,FILE * gtf_out)336 void driver(vector<FILE*> ref_gtf_files, FILE* gtf_out)
337 {
338 ReadTable it;
339 RefSequenceTable rt(true, false);
340
341 vector<vector<boost::shared_ptr<Scaffold> > > ref_mRNA_table;
342 vector<pair<string, vector<double> > > sample_count_table;
343
344 BOOST_FOREACH (FILE* ref_gtf, ref_gtf_files)
345 {
346 vector<boost::shared_ptr<Scaffold> > ref_mRNAs;
347 boost::crc_32_type gtf_crc_result;
348 ::load_ref_rnas(ref_gtf, rt, ref_mRNAs, gtf_crc_result, false, true);
349 ref_mRNA_table.push_back(ref_mRNAs);
350 }
351
352 for (size_t j = 0; j < ref_mRNA_table.size(); ++j)
353 {
354 vector<boost::shared_ptr<Scaffold> > ref_mRNAs = ref_mRNA_table[j];
355
356 if (!raw_fpkm)
357 compress_genes(gtf_out, rt, ref_mRNAs);
358 }
359 }
360
main(int argc,char ** argv)361 int main(int argc, char** argv)
362 {
363 init_library_table();
364
365 int parse_ret = parse_options(argc,argv);
366 if (parse_ret)
367 return parse_ret;
368
369
370 if(optind >= argc)
371 {
372 print_usage();
373 return 1;
374 }
375
376 string ref_gtf_in_filenames = argv[optind++];
377
378 if(optind >= argc)
379 {
380 print_usage();
381 return 1;
382 }
383
384 string gtf_out_filename = argv[optind++];
385
386 vector<string> ref_gtf_filenames;
387 tokenize(ref_gtf_in_filenames, ",", ref_gtf_filenames);
388
389 vector<FILE*> ref_gtf_files;
390
391 BOOST_FOREACH (const string& ref_gtf_in_filename, ref_gtf_filenames)
392 {
393 FILE* ref_gtf = NULL;
394 if (ref_gtf_in_filename != "")
395 {
396 ref_gtf = fopen(ref_gtf_in_filename.c_str(), "r");
397 if (!ref_gtf)
398 {
399 fprintf(stderr, "Error: cannot open GTF file %s for reading\n",
400 ref_gtf_in_filename.c_str());
401 exit(1);
402 }
403 ref_gtf_files.push_back(ref_gtf);
404 }
405 }
406
407 FILE* gtf_out = NULL;
408 if (gtf_out_filename != "")
409 {
410 gtf_out = fopen(gtf_out_filename.c_str(), "w");
411 if (!gtf_out)
412 {
413 fprintf(stderr, "Error: cannot open GTF file %s for writing\n",
414 gtf_out_filename.c_str());
415 exit(1);
416 }
417 }
418
419 driver(ref_gtf_files, gtf_out);
420
421 return 0;
422 }
423