1 /*
2  *  gtf_to_sam.cpp
3  *  Cufflinks
4  *
5  *  Created by Cole Trapnell on 8/1/10.
6  *  Copyright 2009 Cole Trapnell. All rights reserved.
7  *
8  */
9 
10 #include <stdlib.h>
11 #include <getopt.h>
12 #include <string>
13 
14 #include <boost/version.hpp>
15 #include <boost/graph/adjacency_list.hpp>
16 #include <boost/graph/depth_first_search.hpp>
17 #include <boost/graph/visitors.hpp>
18 #include <boost/graph/graph_traits.hpp>
19 #include <boost/graph/connected_components.hpp>
20 
21 #include "common.h"
22 #include "hits.h"
23 #include "bundles.h"
24 
25 #include "scaffolds.h"
26 #include "tokenize.h"
27 
28 using namespace boost;
29 using namespace std;
30 
31 #if ENABLE_THREADS
32 const char *short_options = "r:F";
33 #else
34 const char *short_options = "r:F";
35 #endif
36 
37 bool raw_fpkm = false;
38 
39 static struct option long_options[] = {
40 {"reference-seq",		required_argument,		 0,			 'r'},
41 {"raw-fpkm",            no_argument,             0,			 'F'},
42 {0, 0, 0, 0} // terminator
43 };
44 
print_usage()45 void print_usage()
46 {
47 	//NOTE: SPACES ONLY, bozo
48 	fprintf(stderr, "gtf_to_sam v%s\n", PACKAGE_VERSION);
49 	fprintf(stderr, "linked against Boost version %d\n", BOOST_VERSION);
50 	fprintf(stderr, "-----------------------------\n");
51 	fprintf(stderr, "Usage:   cufflinks [options] <transcripts1.gtf,...,transcriptsN.gtf> <out.sam>\n");
52 	fprintf(stderr, "Options:\n\n");
53 	fprintf(stderr, "-r/--reference-seq			  reference fasta file                     [ default:   NULL ]\n");
54     fprintf(stderr, "-F/--raw-fpkm			      use FPKM instead of isoform fraction                        \n");
55 }
56 
parse_options(int argc,char ** argv)57 int parse_options(int argc, char** argv)
58 {
59     int option_index = 0;
60     int next_option;
61     do {
62         next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
63         switch (next_option) {
64 			case -1:     /* Done with options. */
65 				break;
66             case 'r':
67 			{
68 				fasta_dir = optarg;
69 				break;
70             }
71             case 'F':
72 			{
73 				raw_fpkm = true;
74 				break;
75             }
76 			default:
77 				print_usage();
78 				return 1;
79         }
80     } while(next_option != -1);
81 
82 	return 0;
83 }
84 
print_scaff_as_sam(FILE * sam_out,const RefSequenceTable & rt,const Scaffold & scaff)85 void print_scaff_as_sam(FILE* sam_out,
86                         const RefSequenceTable& rt,
87                         const Scaffold& scaff)
88 {
89 	string seq;
90 	string quals;
91 
92     seq = "*";
93     quals = "*";
94 
95 	uint32_t sam_flag = 0;
96 	if (scaff.strand() == CUFF_REV)
97 	{
98 		sam_flag |= 0x0010; // BAM_FREVERSE
99 //		if (sequence)
100 //		{
101 //			reverse_complement(seq);
102 //			reverse(quals.begin(), quals.end());
103 //		}
104 	}
105 
106 	uint32_t sam_pos = scaff.left() + 1;
107 	uint32_t map_quality = 255;
108 	char cigar[8192];
109 	cigar[0] = 0;
110 	string mate_ref_name = "*";
111 	uint32_t mate_pos = 0;
112 	uint32_t insert_size = 0;
113 
114 	const vector<AugmentedCuffOp>& ops = scaff.augmented_ops();
115 	for (size_t c = 0; c < ops.size(); ++c)
116 	{
117 		char ibuf[64];
118 		sprintf(ibuf, "%d", ops[c].genomic_length);
119 		switch(ops[c].opcode)
120 		{
121 			case CUFF_MATCH:
122 				strcat(cigar, ibuf);
123 				strcat(cigar, "M");
124 				break;
125 			case CUFF_INTRON:
126 				strcat(cigar, ibuf);
127 				strcat(cigar, "N");
128 				break;
129 			default:
130                 fprintf(stderr, "Warning: Transcript %s contains an unconvertible alignment operator, skipping\n", scaff.annotated_trans_id().c_str());
131                 return;
132                 break;
133 		}
134 	}
135 
136 	//string q = string(bh.read_len(), '!');
137 	//string s = string(bh.read_len(), 'N');
138 
139     const char* ref_name = rt.get_name(scaff.ref_id());
140     if (!ref_name)
141     {
142         fprintf(stderr, "Warning: Could not find contig name for ID %d, skipping\n", scaff.ref_id());
143         return;
144     }
145 
146     if (scaff.annotated_trans_id() == "")
147     {
148         fprintf(stderr, "Warning: transcript_id attribute is empty, skipping\n");
149         return;
150     }
151 
152 	fprintf(sam_out,
153 			"%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
154 			scaff.annotated_trans_id().c_str(),
155 			sam_flag,
156 			ref_name,
157 			sam_pos,
158 			map_quality,
159 			cigar,
160 			mate_ref_name.c_str(),
161 			mate_pos,
162 			insert_size,
163 			seq.c_str(),
164 			quals.c_str());
165 
166 	if (scaff.strand() != CUFF_STRAND_UNKNOWN)
167 	{
168 		fprintf(sam_out,
169 				"\tXS:A:%c",
170 				scaff.strand() == CUFF_REV ? '-' : '+');
171 	}
172 
173     if (scaff.fpkm() != 0)
174 	{
175 		fprintf(sam_out,
176 				"\tZF:f:%f",
177 				scaff.fpkm());
178 	}
179 
180 	fprintf(sam_out, "\n");
181 
182 }
183 
set_relative_fpkms(vector<boost::shared_ptr<Scaffold>> & ref_mRNAs)184 void set_relative_fpkms(vector<boost::shared_ptr<Scaffold> >& ref_mRNAs)
185 {
186     adjacency_list <vecS, vecS, undirectedS> G;
187 
188 	for (size_t i = 0; i < ref_mRNAs.size(); ++i)
189 	{
190 		add_vertex(G);
191 	}
192 
193     map<string, vector<int> > gene_id_idxs;
194 
195     for (size_t i = 0; i < ref_mRNAs.size(); ++i)
196     {
197         pair<map<string, vector<int> >::iterator, bool> inserted;
198         inserted = gene_id_idxs.insert(make_pair(ref_mRNAs[i]->annotated_gene_id(), vector<int>()));
199         inserted.first->second.push_back(i);
200     }
201 
202     for (map<string, vector<int> >::iterator itr = gene_id_idxs.begin();
203          itr != gene_id_idxs.end();
204          ++itr)
205     {
206         vector<int>& gene = itr->second;
207         for (size_t i = 0; i < gene.size(); ++i)
208         {
209             for (size_t j = 0; j < gene.size(); ++j)
210             {
211                 {
212                     add_edge(gene[i], gene[j], G);
213                 }
214             }
215         }
216     }
217 
218     std::vector<int> component(num_vertices(G));
219 	connected_components(G, &component[0]);
220 
221 	vector<vector<bool> > clusters(ref_mRNAs.size(),
222 								   vector<bool>(ref_mRNAs.size(), false));
223 
224 	//vector<vector<size_t> > cluster_indices(three_prime_ends.size());
225 
226     vector<vector<boost::shared_ptr<Scaffold> > > grouped_scaffolds(ref_mRNAs.size());
227 	for (size_t i = 0; i < ref_mRNAs.size(); ++i)
228 	{
229 		clusters[component[i]][i] = true;
230 		grouped_scaffolds[component[i]].push_back(ref_mRNAs[i]);
231 	}
232 
233     for (size_t i = 0; i < grouped_scaffolds.size(); ++i)
234     {
235         vector<boost::shared_ptr<Scaffold> >& gene = grouped_scaffolds[i];
236 
237         double total_fpkm = 0.0;
238         BOOST_FOREACH(boost::shared_ptr<Scaffold> scaff, gene)
239         {
240             total_fpkm += scaff->fpkm();
241         }
242         if (total_fpkm > 0)
243         {
244             BOOST_FOREACH (boost::shared_ptr<Scaffold> scaff, gene)
245             {
246                 scaff->fpkm(scaff->fpkm() / total_fpkm);
247             }
248         }
249     }
250 }
251 
driver(vector<FILE * > ref_gtf_files,FILE * sam_out)252 void driver(vector<FILE*> ref_gtf_files, FILE* sam_out)
253 {
254 	ReadTable it;
255 	RefSequenceTable rt(true, false);
256 
257 	vector<vector<boost::shared_ptr<Scaffold> > > ref_mRNA_table;
258 	vector<pair<string, vector<double> > > sample_count_table;
259 
260     BOOST_FOREACH (FILE* ref_gtf, ref_gtf_files)
261     {
262         vector<boost::shared_ptr<Scaffold> > ref_mRNAs;
263         boost::crc_32_type ref_gtf_crc_result;
264         ::load_ref_rnas(ref_gtf, rt, ref_mRNAs, ref_gtf_crc_result, false, true);
265         ref_mRNA_table.push_back(ref_mRNAs);
266     }
267 
268     for (size_t j = 0; j < ref_mRNA_table.size(); ++j)
269     {
270         vector<boost::shared_ptr<Scaffold> > ref_mRNAs = ref_mRNA_table[j];
271 
272         if (!raw_fpkm)
273             set_relative_fpkms(ref_mRNAs);
274 
275         for (size_t i = 0; i < ref_mRNAs.size(); ++i)
276         {
277             print_scaff_as_sam(sam_out, rt, *ref_mRNA_table[j][i]);
278         }
279     }
280 }
281 
main(int argc,char ** argv)282 int main(int argc, char** argv)
283 {
284     init_library_table();
285 
286 	int parse_ret = parse_options(argc,argv);
287     if (parse_ret)
288         return parse_ret;
289 
290 
291     if(optind >= argc)
292     {
293         print_usage();
294         return 1;
295     }
296 
297     string ref_gtf_in_filenames = argv[optind++];
298 
299     if(optind >= argc)
300     {
301         print_usage();
302         return 1;
303     }
304 
305     string sam_out_filename = argv[optind++];
306 
307     vector<string> ref_gtf_filenames;
308     tokenize(ref_gtf_in_filenames, ",", ref_gtf_filenames);
309 
310     vector<FILE*> ref_gtf_files;
311 
312     BOOST_FOREACH (const string& ref_gtf_in_filename, ref_gtf_filenames)
313     {
314         FILE* ref_gtf = NULL;
315         if (ref_gtf_in_filename != "")
316         {
317             ref_gtf = fopen(ref_gtf_in_filename.c_str(), "r");
318             if (!ref_gtf)
319             {
320                 fprintf(stderr, "Error: cannot open GTF file %s for reading\n",
321                         ref_gtf_in_filename.c_str());
322                 exit(1);
323             }
324             ref_gtf_files.push_back(ref_gtf);
325         }
326     }
327 
328     FILE* sam_out = NULL;
329 	if (sam_out_filename != "")
330 	{
331 		sam_out = fopen(sam_out_filename.c_str(), "w");
332 		if (!sam_out)
333 		{
334 			fprintf(stderr, "Error: cannot open SAM file %s for writing\n",
335 					sam_out_filename.c_str());
336 			exit(1);
337 		}
338 	}
339 
340     driver(ref_gtf_files, sam_out);
341 
342 	return 0;
343 }
344