1 /*
2  *  gtf_to_sam.cpp
3  *  Cufflinks
4  *
5  *  Created by Cole Trapnell on 8/1/10.
6  *  Copyright 2009 Cole Trapnell. All rights reserved.
7  *
8  */
9 
10 #include <stdlib.h>
11 #include <getopt.h>
12 #include <string>
13 #include <algorithm>
14 
15 #include <boost/version.hpp>
16 #include <boost/graph/adjacency_list.hpp>
17 #include <boost/graph/depth_first_search.hpp>
18 #include <boost/graph/visitors.hpp>
19 #include <boost/graph/graph_traits.hpp>
20 #include <boost/graph/connected_components.hpp>
21 
22 #include "common.h"
23 #include "hits.h"
24 #include "bundles.h"
25 
26 #include "gtf_tracking.h"
27 #include "scaffolds.h"
28 #include "tokenize.h"
29 #include "genes.h"
30 
31 using namespace boost;
32 using namespace std;
33 
34 #if ENABLE_THREADS
35 const char *short_options = "r:F";
36 #else
37 const char *short_options = "r:F";
38 #endif
39 
40 bool raw_fpkm = false;
41 bool proj_union = false;
42 bool proj_intersection = false;
43 
44 static struct option long_options[] = {
45 {"reference-seq",		required_argument,		 0,			 'r'},
46 {"raw-fpkm",            no_argument,             0,			 'F'},
47 {"union",               no_argument,             0,			 'U'},
48 {"intersection",        no_argument,             0,			 'I'},
49 
50 
51 {0, 0, 0, 0} // terminator
52 };
53 
print_usage()54 void print_usage()
55 {
56 	//NOTE: SPACES ONLY, bozo
57 	fprintf(stderr, "compress_gtf v%s\n", PACKAGE_VERSION);
58 	fprintf(stderr, "linked against Boost version %d\n", BOOST_VERSION);
59 	fprintf(stderr, "-----------------------------\n");
60 	fprintf(stderr, "Usage:   compress_gtf [options] <reference.gtf> <compressed_reference.gtf>\n");
61 	fprintf(stderr, "Options:\n\n");
62 	fprintf(stderr, "-r/--reference-seq			  reference fasta file                     [ default:   NULL ]\n");
63     fprintf(stderr, "-F/--raw-fpkm			      use FPKM instead of isoform fraction                        \n");
64     fprintf(stderr, "-U/--union                   report projective union                  [ default:   OFF  ]\n");
65     fprintf(stderr, "-I/--intersection            report projective intersection           [ default:   ON   ]\n");
66 }
67 
parse_options(int argc,char ** argv)68 int parse_options(int argc, char** argv)
69 {
70     int option_index = 0;
71     int next_option;
72     do {
73         next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
74         switch (next_option) {
75 			case -1:     /* Done with options. */
76 				break;
77             case 'r':
78 			{
79 				fasta_dir = optarg;
80 				break;
81             }
82             case 'F':
83 			{
84 				raw_fpkm = true;
85 				break;
86             }
87             case 'U':
88 			{
89 				proj_union = true;
90 				break;
91             }
92             case 'I':
93 			{
94 				proj_intersection = true;
95 				break;
96             }
97 			default:
98 				print_usage();
99 				return 1;
100         }
101     } while(next_option != -1);
102 
103     if (proj_union && proj_intersection)
104     {
105         fprintf (stderr, "Error: please specify only one of --union and --intersection");
106         exit(1);
107     }
108 
109 //    if (!proj_union && !proj_intersection)
110 //        proj_intersection =  true;
111 	return 0;
112 }
113 
compress_genes(FILE * ftranscripts,RefSequenceTable & rt,vector<boost::shared_ptr<Scaffold>> & ref_mRNAs)114 void compress_genes(FILE* ftranscripts,
115                     RefSequenceTable& rt,
116                     vector<boost::shared_ptr<Scaffold> >& ref_mRNAs)
117 {
118     adjacency_list <vecS, vecS, undirectedS> G;
119 
120 	for (size_t i = 0; i < ref_mRNAs.size(); ++i)
121 	{
122 		add_vertex(G);
123 	}
124 
125     for (size_t i = 0; i < ref_mRNAs.size(); ++i)
126 	{
127         boost::shared_ptr<Scaffold> scaff_i = ref_mRNAs[i];
128         for (size_t j = 0; j < ref_mRNAs.size(); ++j)
129         {
130             boost::shared_ptr<Scaffold> scaff_j = ref_mRNAs[j];
131 			if (scaff_i->annotated_gene_id() == scaff_j->annotated_gene_id())
132 				add_edge(i, j, G);
133 		}
134 	}
135 
136     std::vector<int> component(num_vertices(G));
137 	connected_components(G, &component[0]);
138 
139 	vector<vector<bool> > clusters(ref_mRNAs.size(),
140 								   vector<bool>(ref_mRNAs.size(), false));
141 
142 	//vector<vector<size_t> > cluster_indices(three_prime_ends.size());
143 
144     vector<vector<boost::shared_ptr<Scaffold> > > grouped_scaffolds(ref_mRNAs.size());
145 	for (size_t i = 0; i < ref_mRNAs.size(); ++i)
146 	{
147 		clusters[component[i]][i] = true;
148 		grouped_scaffolds[component[i]].push_back(ref_mRNAs[i]);
149 	}
150 
151     for (size_t i = 0; i < grouped_scaffolds.size(); ++i)
152     {
153         vector<boost::shared_ptr<Scaffold> >& gene = grouped_scaffolds[i];
154         vector<Scaffold> gene_scaffs;
155         string gene_id;
156         BOOST_FOREACH (boost::shared_ptr<Scaffold> s, gene)
157         {
158             if (gene_id == "")
159                 gene_id = s->annotated_gene_id();
160 
161             gene_scaffs.push_back(*s);
162         }
163 
164         if (gene_scaffs.empty())
165             continue;
166 
167         next_gene_id++;
168 
169         Scaffold smashed_gene;
170         if (!proj_intersection && !proj_union)
171         {
172             BOOST_FOREACH (boost::shared_ptr<Scaffold> s, gene)
173             {
174                 /*
175                  *transfrag,
176                  gene_id,
177                  (int)isoforms.size() + 1,
178                  FPKM,
179                  iso_ab->effective_length(),
180                  iso_ab->gamma(),
181                  iso_ab->FPKM_conf(),
182                  density_per_bp,
183                  estimated_count,
184                  density_score,
185                  iso_ab->status(),
186                  ref_gene_id)*/
187 
188                 Isoform iso(*s,
189                             -1,
190                             1,
191                             0.0,
192                             s->length(),
193                             0.0,
194                             ConfidenceInterval(0.0,0.0),
195                             0,
196                             0,
197                             0,
198                             NUMERIC_OK,
199                             gene_id);
200                 vector<string> isoform_exon_recs;
201 
202                 iso.get_gtf(isoform_exon_recs, rt);
203 
204                 for (size_t g = 0; g < isoform_exon_recs.size(); ++g)
205                 {
206                     fprintf(ftranscripts, "%s", isoform_exon_recs[g].c_str());
207                 }
208             }
209         }
210         else
211         {
212             if (proj_union)
213                 Scaffold::merge(gene_scaffs, smashed_gene, false);
214             else if (proj_intersection)
215             {
216                 vector<AugmentedCuffOp> iso_ops;
217 
218                 int gmax = -1;
219                 int gmin = numeric_limits<int>::max();
220 
221                 BOOST_FOREACH (boost::shared_ptr<Scaffold> s, gene)
222                 {
223                     //iso_ops.push_back(s->augmented_ops());
224                     //sort (iso_ops.back().begin(), iso_ops.back().end());
225                     if (s->left() < gmin)
226                         gmin = s->left();
227                     if (s->right() > gmax)
228                         gmax = s->right();
229                 }
230 
231                 BOOST_FOREACH (boost::shared_ptr<Scaffold> s, gene)
232                 {
233                     if (s->left() > gmin)
234                     {
235                         iso_ops.push_back(AugmentedCuffOp(CUFF_INTRON, gmin, s->left() - gmin));
236                     }
237                     if (s->right() < gmax)
238                     {
239                         iso_ops.push_back(AugmentedCuffOp(CUFF_INTRON, s->right(), gmax - s->right()));
240                     }
241                     iso_ops.insert(iso_ops.end(), s->augmented_ops().begin(), s->augmented_ops().end());
242                 }
243 //                vector<AugmentedCuffOp> intersect = iso_ops.front();
244 //                for (size_t j = 1; j < iso_ops.size(); ++j)
245 //                {
246 //                    vector<AugmentedCuffOp> tmp;
247 //                    const vector<AugmentedCuffOp>& iso_ops_j = iso_ops[j];
248 //                    //set_intersection(intersect.begin(), intersect.end(), iso_ops_j.begin(), iso_ops_j.end(), back_inserter(tmp));
249 //                    intersect.insert(intersect.end(), iso_ops_j.begin(), iso_ops_j.end());
250 //
251 //                    intersect.push_back(
252 //                    assert (tmp.size() <= intersect.size());
253 //                    //intersect = tmp;
254 //                    //sort(intersect.begin(), intersect.end());
255 //                }
256 //
257                 sort(iso_ops.begin(), iso_ops.end(), AugmentedCuffOp::g_left_lt);
258 //
259 //                while (!intersect.empty() && intersect.front().opcode != CUFF_MATCH)
260 //                {
261 //                    intersect.erase(intersect.begin());
262 //                }
263 //
264 //                while (!intersect.empty() && intersect.back().opcode != CUFF_MATCH)
265 //                {
266 //                    intersect.pop_back();
267 //                }
268 //
269 //                if (intersect.empty())
270 //                    continue;
271 
272                 vector<AugmentedCuffOp> merged_ops;
273                 AugmentedCuffOp::merge_ops(iso_ops, merged_ops, true, true);
274                 vector<AugmentedCuffOp>::iterator first_match = merged_ops.begin();
275                 vector<AugmentedCuffOp>::iterator last_match = merged_ops.end();
276                 last_match--;
277                 while(first_match < merged_ops.end())
278                 {
279                     if (first_match->opcode == CUFF_MATCH)
280                         break;
281                     first_match++;
282                 }
283                 while(last_match >= merged_ops.begin() && last_match< merged_ops.end())
284                 {
285                     if (last_match->opcode == CUFF_MATCH)
286                         break;
287                     last_match--;
288                 }
289 
290                 vector<AugmentedCuffOp> internal_matches;
291                 if (last_match >= first_match && last_match < merged_ops.end())
292                 {
293                     last_match++;
294 
295                     internal_matches.insert(internal_matches.end(), first_match, last_match);
296                     smashed_gene = Scaffold(gene.front()->ref_id(), gene.front()->strand(), internal_matches);
297                 }
298                 else
299                 {
300 
301                     fprintf(stderr, "Could not find consitutive region for %s\n", gene_id.c_str());
302                     continue;
303                 }
304 
305             }
306             else
307                 assert(false);
308             assert (smashed_gene.ref_id());
309 
310             Isoform iso(smashed_gene,
311                         -1,
312                         1,
313                         0.0,
314                         smashed_gene.length(),
315                         0.0,
316                         ConfidenceInterval(0.0,0.0),
317                         0,
318                         0,
319                         0,
320                         NUMERIC_OK,
321                         gene_id);
322             vector<string> isoform_exon_recs;
323 
324             iso.get_gtf(isoform_exon_recs, rt);
325 
326             for (size_t g = 0; g < isoform_exon_recs.size(); ++g)
327             {
328                 fprintf(ftranscripts, "%s", isoform_exon_recs[g].c_str());
329             }
330         }
331 
332         fflush(ftranscripts);
333     }
334 }
335 
driver(vector<FILE * > ref_gtf_files,FILE * gtf_out)336 void driver(vector<FILE*> ref_gtf_files, FILE* gtf_out)
337 {
338 	ReadTable it;
339 	RefSequenceTable rt(true, false);
340 
341 	vector<vector<boost::shared_ptr<Scaffold> > > ref_mRNA_table;
342 	vector<pair<string, vector<double> > > sample_count_table;
343 
344     BOOST_FOREACH (FILE* ref_gtf, ref_gtf_files)
345     {
346         vector<boost::shared_ptr<Scaffold> > ref_mRNAs;
347         boost::crc_32_type gtf_crc_result;
348         ::load_ref_rnas(ref_gtf, rt, ref_mRNAs, gtf_crc_result, false, true);
349         ref_mRNA_table.push_back(ref_mRNAs);
350     }
351 
352     for (size_t j = 0; j < ref_mRNA_table.size(); ++j)
353     {
354         vector<boost::shared_ptr<Scaffold> > ref_mRNAs = ref_mRNA_table[j];
355 
356         if (!raw_fpkm)
357             compress_genes(gtf_out, rt, ref_mRNAs);
358     }
359 }
360 
main(int argc,char ** argv)361 int main(int argc, char** argv)
362 {
363     init_library_table();
364 
365 	int parse_ret = parse_options(argc,argv);
366     if (parse_ret)
367         return parse_ret;
368 
369 
370     if(optind >= argc)
371     {
372         print_usage();
373         return 1;
374     }
375 
376     string ref_gtf_in_filenames = argv[optind++];
377 
378     if(optind >= argc)
379     {
380         print_usage();
381         return 1;
382     }
383 
384     string gtf_out_filename = argv[optind++];
385 
386     vector<string> ref_gtf_filenames;
387     tokenize(ref_gtf_in_filenames, ",", ref_gtf_filenames);
388 
389     vector<FILE*> ref_gtf_files;
390 
391     BOOST_FOREACH (const string& ref_gtf_in_filename, ref_gtf_filenames)
392     {
393         FILE* ref_gtf = NULL;
394         if (ref_gtf_in_filename != "")
395         {
396             ref_gtf = fopen(ref_gtf_in_filename.c_str(), "r");
397             if (!ref_gtf)
398             {
399                 fprintf(stderr, "Error: cannot open GTF file %s for reading\n",
400                         ref_gtf_in_filename.c_str());
401                 exit(1);
402             }
403             ref_gtf_files.push_back(ref_gtf);
404         }
405     }
406 
407     FILE* gtf_out = NULL;
408 	if (gtf_out_filename != "")
409 	{
410 		gtf_out = fopen(gtf_out_filename.c_str(), "w");
411 		if (!gtf_out)
412 		{
413 			fprintf(stderr, "Error: cannot open GTF file %s for writing\n",
414 					gtf_out_filename.c_str());
415 			exit(1);
416 		}
417 	}
418 
419     driver(ref_gtf_files, gtf_out);
420 
421 	return 0;
422 }
423