1 /*
2 * gtf_to_sam.cpp
3 * Cufflinks
4 *
5 * Created by Cole Trapnell on 8/1/10.
6 * Copyright 2009 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #include <stdlib.h>
11 #include <getopt.h>
12 #include <string>
13
14 #include <boost/version.hpp>
15 #include <boost/graph/adjacency_list.hpp>
16 #include <boost/graph/depth_first_search.hpp>
17 #include <boost/graph/visitors.hpp>
18 #include <boost/graph/graph_traits.hpp>
19 #include <boost/graph/connected_components.hpp>
20
21 #include "common.h"
22 #include "hits.h"
23 #include "bundles.h"
24
25 #include "scaffolds.h"
26 #include "tokenize.h"
27
28 using namespace boost;
29 using namespace std;
30
31 #if ENABLE_THREADS
32 const char *short_options = "r:F";
33 #else
34 const char *short_options = "r:F";
35 #endif
36
37 bool raw_fpkm = false;
38
39 static struct option long_options[] = {
40 {"reference-seq", required_argument, 0, 'r'},
41 {"raw-fpkm", no_argument, 0, 'F'},
42 {0, 0, 0, 0} // terminator
43 };
44
print_usage()45 void print_usage()
46 {
47 //NOTE: SPACES ONLY, bozo
48 fprintf(stderr, "gtf_to_sam v%s\n", PACKAGE_VERSION);
49 fprintf(stderr, "linked against Boost version %d\n", BOOST_VERSION);
50 fprintf(stderr, "-----------------------------\n");
51 fprintf(stderr, "Usage: cufflinks [options] <transcripts1.gtf,...,transcriptsN.gtf> <out.sam>\n");
52 fprintf(stderr, "Options:\n\n");
53 fprintf(stderr, "-r/--reference-seq reference fasta file [ default: NULL ]\n");
54 fprintf(stderr, "-F/--raw-fpkm use FPKM instead of isoform fraction \n");
55 }
56
parse_options(int argc,char ** argv)57 int parse_options(int argc, char** argv)
58 {
59 int option_index = 0;
60 int next_option;
61 do {
62 next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
63 switch (next_option) {
64 case -1: /* Done with options. */
65 break;
66 case 'r':
67 {
68 fasta_dir = optarg;
69 break;
70 }
71 case 'F':
72 {
73 raw_fpkm = true;
74 break;
75 }
76 default:
77 print_usage();
78 return 1;
79 }
80 } while(next_option != -1);
81
82 return 0;
83 }
84
print_scaff_as_sam(FILE * sam_out,const RefSequenceTable & rt,const Scaffold & scaff)85 void print_scaff_as_sam(FILE* sam_out,
86 const RefSequenceTable& rt,
87 const Scaffold& scaff)
88 {
89 string seq;
90 string quals;
91
92 seq = "*";
93 quals = "*";
94
95 uint32_t sam_flag = 0;
96 if (scaff.strand() == CUFF_REV)
97 {
98 sam_flag |= 0x0010; // BAM_FREVERSE
99 // if (sequence)
100 // {
101 // reverse_complement(seq);
102 // reverse(quals.begin(), quals.end());
103 // }
104 }
105
106 uint32_t sam_pos = scaff.left() + 1;
107 uint32_t map_quality = 255;
108 char cigar[8192];
109 cigar[0] = 0;
110 string mate_ref_name = "*";
111 uint32_t mate_pos = 0;
112 uint32_t insert_size = 0;
113
114 const vector<AugmentedCuffOp>& ops = scaff.augmented_ops();
115 for (size_t c = 0; c < ops.size(); ++c)
116 {
117 char ibuf[64];
118 sprintf(ibuf, "%d", ops[c].genomic_length);
119 switch(ops[c].opcode)
120 {
121 case CUFF_MATCH:
122 strcat(cigar, ibuf);
123 strcat(cigar, "M");
124 break;
125 case CUFF_INTRON:
126 strcat(cigar, ibuf);
127 strcat(cigar, "N");
128 break;
129 default:
130 fprintf(stderr, "Warning: Transcript %s contains an unconvertible alignment operator, skipping\n", scaff.annotated_trans_id().c_str());
131 return;
132 break;
133 }
134 }
135
136 //string q = string(bh.read_len(), '!');
137 //string s = string(bh.read_len(), 'N');
138
139 const char* ref_name = rt.get_name(scaff.ref_id());
140 if (!ref_name)
141 {
142 fprintf(stderr, "Warning: Could not find contig name for ID %d, skipping\n", scaff.ref_id());
143 return;
144 }
145
146 if (scaff.annotated_trans_id() == "")
147 {
148 fprintf(stderr, "Warning: transcript_id attribute is empty, skipping\n");
149 return;
150 }
151
152 fprintf(sam_out,
153 "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
154 scaff.annotated_trans_id().c_str(),
155 sam_flag,
156 ref_name,
157 sam_pos,
158 map_quality,
159 cigar,
160 mate_ref_name.c_str(),
161 mate_pos,
162 insert_size,
163 seq.c_str(),
164 quals.c_str());
165
166 if (scaff.strand() != CUFF_STRAND_UNKNOWN)
167 {
168 fprintf(sam_out,
169 "\tXS:A:%c",
170 scaff.strand() == CUFF_REV ? '-' : '+');
171 }
172
173 if (scaff.fpkm() != 0)
174 {
175 fprintf(sam_out,
176 "\tZF:f:%f",
177 scaff.fpkm());
178 }
179
180 fprintf(sam_out, "\n");
181
182 }
183
set_relative_fpkms(vector<boost::shared_ptr<Scaffold>> & ref_mRNAs)184 void set_relative_fpkms(vector<boost::shared_ptr<Scaffold> >& ref_mRNAs)
185 {
186 adjacency_list <vecS, vecS, undirectedS> G;
187
188 for (size_t i = 0; i < ref_mRNAs.size(); ++i)
189 {
190 add_vertex(G);
191 }
192
193 map<string, vector<int> > gene_id_idxs;
194
195 for (size_t i = 0; i < ref_mRNAs.size(); ++i)
196 {
197 pair<map<string, vector<int> >::iterator, bool> inserted;
198 inserted = gene_id_idxs.insert(make_pair(ref_mRNAs[i]->annotated_gene_id(), vector<int>()));
199 inserted.first->second.push_back(i);
200 }
201
202 for (map<string, vector<int> >::iterator itr = gene_id_idxs.begin();
203 itr != gene_id_idxs.end();
204 ++itr)
205 {
206 vector<int>& gene = itr->second;
207 for (size_t i = 0; i < gene.size(); ++i)
208 {
209 for (size_t j = 0; j < gene.size(); ++j)
210 {
211 {
212 add_edge(gene[i], gene[j], G);
213 }
214 }
215 }
216 }
217
218 std::vector<int> component(num_vertices(G));
219 connected_components(G, &component[0]);
220
221 vector<vector<bool> > clusters(ref_mRNAs.size(),
222 vector<bool>(ref_mRNAs.size(), false));
223
224 //vector<vector<size_t> > cluster_indices(three_prime_ends.size());
225
226 vector<vector<boost::shared_ptr<Scaffold> > > grouped_scaffolds(ref_mRNAs.size());
227 for (size_t i = 0; i < ref_mRNAs.size(); ++i)
228 {
229 clusters[component[i]][i] = true;
230 grouped_scaffolds[component[i]].push_back(ref_mRNAs[i]);
231 }
232
233 for (size_t i = 0; i < grouped_scaffolds.size(); ++i)
234 {
235 vector<boost::shared_ptr<Scaffold> >& gene = grouped_scaffolds[i];
236
237 double total_fpkm = 0.0;
238 BOOST_FOREACH(boost::shared_ptr<Scaffold> scaff, gene)
239 {
240 total_fpkm += scaff->fpkm();
241 }
242 if (total_fpkm > 0)
243 {
244 BOOST_FOREACH (boost::shared_ptr<Scaffold> scaff, gene)
245 {
246 scaff->fpkm(scaff->fpkm() / total_fpkm);
247 }
248 }
249 }
250 }
251
driver(vector<FILE * > ref_gtf_files,FILE * sam_out)252 void driver(vector<FILE*> ref_gtf_files, FILE* sam_out)
253 {
254 ReadTable it;
255 RefSequenceTable rt(true, false);
256
257 vector<vector<boost::shared_ptr<Scaffold> > > ref_mRNA_table;
258 vector<pair<string, vector<double> > > sample_count_table;
259
260 BOOST_FOREACH (FILE* ref_gtf, ref_gtf_files)
261 {
262 vector<boost::shared_ptr<Scaffold> > ref_mRNAs;
263 boost::crc_32_type ref_gtf_crc_result;
264 ::load_ref_rnas(ref_gtf, rt, ref_mRNAs, ref_gtf_crc_result, false, true);
265 ref_mRNA_table.push_back(ref_mRNAs);
266 }
267
268 for (size_t j = 0; j < ref_mRNA_table.size(); ++j)
269 {
270 vector<boost::shared_ptr<Scaffold> > ref_mRNAs = ref_mRNA_table[j];
271
272 if (!raw_fpkm)
273 set_relative_fpkms(ref_mRNAs);
274
275 for (size_t i = 0; i < ref_mRNAs.size(); ++i)
276 {
277 print_scaff_as_sam(sam_out, rt, *ref_mRNA_table[j][i]);
278 }
279 }
280 }
281
main(int argc,char ** argv)282 int main(int argc, char** argv)
283 {
284 init_library_table();
285
286 int parse_ret = parse_options(argc,argv);
287 if (parse_ret)
288 return parse_ret;
289
290
291 if(optind >= argc)
292 {
293 print_usage();
294 return 1;
295 }
296
297 string ref_gtf_in_filenames = argv[optind++];
298
299 if(optind >= argc)
300 {
301 print_usage();
302 return 1;
303 }
304
305 string sam_out_filename = argv[optind++];
306
307 vector<string> ref_gtf_filenames;
308 tokenize(ref_gtf_in_filenames, ",", ref_gtf_filenames);
309
310 vector<FILE*> ref_gtf_files;
311
312 BOOST_FOREACH (const string& ref_gtf_in_filename, ref_gtf_filenames)
313 {
314 FILE* ref_gtf = NULL;
315 if (ref_gtf_in_filename != "")
316 {
317 ref_gtf = fopen(ref_gtf_in_filename.c_str(), "r");
318 if (!ref_gtf)
319 {
320 fprintf(stderr, "Error: cannot open GTF file %s for reading\n",
321 ref_gtf_in_filename.c_str());
322 exit(1);
323 }
324 ref_gtf_files.push_back(ref_gtf);
325 }
326 }
327
328 FILE* sam_out = NULL;
329 if (sam_out_filename != "")
330 {
331 sam_out = fopen(sam_out_filename.c_str(), "w");
332 if (!sam_out)
333 {
334 fprintf(stderr, "Error: cannot open SAM file %s for writing\n",
335 sam_out_filename.c_str());
336 exit(1);
337 }
338 }
339
340 driver(ref_gtf_files, sam_out);
341
342 return 0;
343 }
344