1 /*
2 * sorting_hat.cpp
3 * cufflinks
4 *
5 * Created by Cole Trapnell on 8/30/10.
6 * Copyright 2010 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #include <stdlib.h>
11 #include <getopt.h>
12 #include <string>
13
14 #include <boost/numeric/ublas/matrix.hpp>
15 #include <boost/numeric/ublas/matrix_proxy.hpp>
16 #include <boost/numeric/ublas/vector.hpp>
17 #include <boost/numeric/ublas/vector_proxy.hpp>
18 #include <boost/numeric/ublas/io.hpp>
19 #include <boost/random/linear_congruential.hpp>
20 #include <boost/random/uniform_real.hpp>
21 #include <boost/random/variate_generator.hpp>
22
23
24 #include "common.h"
25 #include "tokenize.h"
26
27 #include "differential.h"
28
29 using namespace std;
30 using namespace boost;
31
32 bool compute_row_matrix = false;
33 bool log_transform_fpkm = false;
34 bool output_row_density = false;
35
36 int k_clusters = 0;
37 int max_iterations = 1000;
38
39 string row_matrix_out_filename = "";
40 string row_density_out_filename = "";
41
42 #if ENABLE_THREADS
43 const char *short_options = "o:p:d:P:k:I:l";
44 #else
45 const char *short_options = "o:d:P:k:I:l";
46 #endif
47
48 static struct option long_options[] = {
49 {"output-dir", required_argument, 0, 'o'},
50 {"row-matrix", required_argument, 0, 'd'},
51 {"row-densities", required_argument, 0, 'P'},
52 {"log-fpkm", no_argument, 0, 'l'},
53 {"k-means", required_argument, 0, 'k'},
54 {"max-iterations", required_argument, 0, 'I'},
55 #if ENABLE_THREADS
56 {"num-threads", required_argument, 0, 'p'},
57 #endif
58 {0, 0, 0, 0} // terminator
59 };
60
print_usage()61 void print_usage()
62 {
63 fprintf(stderr, "sorting_hat v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
64 fprintf(stderr, "-----------------------------\n");
65
66 //NOTE: SPACES ONLY, bozo
67 fprintf(stderr, "Usage: cuffdiff [options] <input.fpkm_tracking> <output.shout>\n");
68 fprintf(stderr, "Options:\n\n");
69 fprintf(stderr, "-o/--output-dir write all output files to this directory [ default: ./ ]\n");
70 fprintf(stderr, "-d/--row-matrix compute row distance matrix [ default: off ]\n");
71 fprintf(stderr, "-l/--log-fpkm JS on log(fpkm+1) instead of [ default: off ]\n");
72 #if ENABLE_THREADS
73 fprintf(stderr, "-p/--num-threads number of threads used during assembly [ default: 1 ]\n");
74 #endif
75 }
76
parse_options(int argc,char ** argv)77 int parse_options(int argc, char** argv)
78 {
79 int option_index = 0;
80 int next_option;
81 do {
82 next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
83 switch (next_option) {
84 case -1: /* Done with options. */
85 break;
86
87 case 'd':
88 {
89 compute_row_matrix = true;
90 row_matrix_out_filename = optarg;
91 break;
92 }
93
94 case 'P':
95 {
96 output_row_density = true;
97 row_density_out_filename = optarg;
98 break;
99 }
100
101 case 'l':
102 {
103 log_transform_fpkm = true;
104 break;
105 }
106
107 case 'k':
108 {
109 k_clusters = (int)parseInt(1, "-k/--k-means arg must be at least 1", print_usage);
110 break;
111 }
112
113 case 'I':
114 {
115 max_iterations = (int)parseInt(1, "-I/--max-iterations arg must be at least 1", print_usage);
116 break;
117 }
118
119 case 'o':
120 {
121 output_dir = optarg;
122 break;
123 }
124
125 default:
126 print_usage();
127 return 1;
128 }
129 } while(next_option != -1);
130
131 allow_junk_filtering = false;
132
133 return 0;
134 }
135
136
137 struct ExprRecord
138 {
ExprRecordExprRecord139 ExprRecord() :
140 total_FPKM(0.0),
141 total_FPKM_conf_hi(0.0),
142 total_FPKM_conf_lo(0.0),
143 max_FPKM(0.0),
144 total_log_FPKM(std::numeric_limits<double>::infinity()),
145 total_log_FPKM_conf_hi(std::numeric_limits<double>::infinity()),
146 total_log_FPKM_conf_lo(std::numeric_limits<double>::infinity()),
147 max_log_FPKM(std::numeric_limits<double>::infinity()),
148 cluster_id(-1) {}
149
150 string tracking_id;
151 string class_code;
152 string nearest_ref_id;
153 string gene_id;
154 string gene_short_name;
155 string tss_id;
156 string locus;
157
158 string length;
159 string coverage;
160 string status;
161
162 vector<double> FPKMs;
163 vector<double> FPKM_conf_los;
164 vector<double> FPKM_conf_his;
165
166 vector<double> log_FPKMs;
167 vector<double> log_FPKM_conf_los;
168 vector<double> log_FPKM_conf_his;
169
170 double total_FPKM;
171 double total_FPKM_conf_hi;
172 double total_FPKM_conf_lo;
173 double max_FPKM;
174
175 double total_log_FPKM;
176 double total_log_FPKM_conf_hi;
177 double total_log_FPKM_conf_lo;
178 double max_log_FPKM;
179
180 ublas::vector<double> cond_density;
181 vector<double> cond_specificities;
182
183 int cluster_id;
184 };
185
186 struct ClusterStats
187 {
ClusterStatsClusterStats188 ClusterStats(int dim)
189 {
190 mean = ublas::zero_vector<double>(dim);
191 variance = 0.0;
192 }
193
194 ublas::vector<double> mean;
195 double variance;
196 vector<size_t> members;
197 };
198
199 struct SortByVariance
200 {
operator ()SortByVariance201 bool operator()(const ClusterStats& lhs, const ClusterStats& rhs)
202 {
203 if (lhs.variance != rhs.variance)
204 return lhs.variance < rhs.variance;
205 else if (lhs.members.size() != rhs.members.size())
206 return lhs.members.size() < rhs.members.size();
207 return false;
208 }
209 };
210
211
assign_to_nearest_cluster(vector<ExprRecord> & expr_records,vector<ClusterStats> & clusters)212 void assign_to_nearest_cluster(vector<ExprRecord>& expr_records, vector<ClusterStats>& clusters)
213 {
214
215 int num_clusters = clusters.size();
216 int num_conditions = expr_records.front().cond_density.size();
217
218 BOOST_FOREACH(ClusterStats& c, clusters)
219 {
220 c.members.clear();
221 }
222
223 // E step - assign each record to nearest cluster
224 for (size_t r = 0; r < expr_records.size(); ++r)
225 {
226 ExprRecord& rec = expr_records[r];
227
228 // Skip unexpressed records;
229 if (log_transform_fpkm)
230 {
231 if (rec.total_log_FPKM == 0 ||
232 rec.total_log_FPKM == std::numeric_limits<double>::infinity())
233 {
234 continue;
235 }
236 }
237 else
238 {
239 if (rec.total_FPKM == 0)
240 {
241 continue;
242 }
243 }
244
245 double min_dist = std::numeric_limits<double>::max();
246 size_t closest_mean = 0;
247 vector<ublas::vector<double> > kappas;
248
249 kappas.push_back(rec.cond_density);
250 //cerr << rec.cond_density << endl;
251 kappas.push_back(ublas::zero_vector<double>());
252
253 for (size_t m = 0; m < clusters.size(); ++m)
254 {
255 kappas[1] = clusters[m].mean;
256 double js = jensen_shannon_distance(kappas);
257 if (js < min_dist)
258 {
259 closest_mean = m;
260 min_dist = js;
261 }
262 }
263
264 clusters[closest_mean].members.push_back(r);
265 }
266
267 // M step - calculate new means
268
269 vector<ublas::vector<double> > tmp_means(num_clusters,
270 ublas::zero_vector<double>(num_conditions));
271 // Calculate mean
272
273 for (size_t c = 0; c < clusters.size(); ++c)
274 {
275 ClusterStats& cluster = clusters[c];
276 BOOST_FOREACH(int m, cluster.members)
277 {
278 tmp_means[c] += expr_records[m].cond_density;
279 }
280 }
281
282 for (size_t c = 0; c < clusters.size(); ++c)
283 {
284 if (clusters[c].members.size() > 0)
285 {
286 clusters[c].mean = tmp_means[c] / clusters[c].members.size();
287 }
288 }
289
290 // And now the variance
291 for (size_t c = 0; c < clusters.size(); ++c)
292 {
293 ClusterStats& cluster = clusters[c];
294 if (cluster.members.empty())
295 continue;
296
297 vector<ublas::vector<double> > kappas;
298 kappas.push_back(cluster.mean);
299 kappas.push_back(ublas::zero_vector<double>());
300 BOOST_FOREACH (int m, cluster.members)
301 {
302 kappas[1] = expr_records[m].cond_density;
303 double js = jensen_shannon_distance(kappas);
304 cluster.variance += (js * js);
305 }
306
307 cluster.variance /= cluster.members.size();
308 }
309 }
310
get_assignments(const vector<ClusterStats> & clusters,vector<int> & assignments)311 void get_assignments(const vector<ClusterStats>& clusters,
312 vector<int>& assignments)
313 {
314 BOOST_FOREACH (int& i, assignments)
315 {
316 i = -1;
317 }
318
319 for (size_t c = 0; c < clusters.size(); ++c)
320 {
321 for (size_t r = 0; r < clusters[c].members.size(); ++r)
322 {
323 assignments[clusters[c].members[r]] = c;
324 }
325 }
326 }
327
328
split_cluster(const vector<ExprRecord> & expr_records,ClusterStats & to_split,vector<ClusterStats> & new_clusters)329 void split_cluster(const vector<ExprRecord>& expr_records,
330 ClusterStats& to_split,
331 vector<ClusterStats>& new_clusters)
332 {
333
334 fprintf (stderr, "Splitting cluster...\n");
335
336 int num_conditions = expr_records.front().cond_density.size();
337
338 vector<ublas::vector<double> > kappas;
339 kappas.push_back(to_split.mean);
340
341 vector<double> dist_from_mean;
342
343 for (size_t m = 0; m < to_split.members.size(); ++m)
344 {
345 kappas.push_back(expr_records[to_split.members[m]].cond_density);
346 double js = jensen_shannon_distance(kappas);
347 dist_from_mean.push_back(js);
348 }
349
350 // pick the point farthest from the mean and the point closest.
351
352 double largest_dist = -1.0;
353 size_t farthest_point = 0;
354
355 double smallest_dist = std::numeric_limits<double>::max();
356 size_t closest_point = 0;
357
358 for (size_t i = 0; i < dist_from_mean.size(); ++i)
359 {
360 if (largest_dist > dist_from_mean[i])
361 {
362 largest_dist = dist_from_mean[i];
363 farthest_point = i;
364 }
365
366 if (smallest_dist < dist_from_mean[i])
367 {
368 smallest_dist = dist_from_mean[i];
369 closest_point = i;
370 }
371 }
372
373 vector<ublas::vector<double> > a_kappas;
374 a_kappas.push_back(expr_records[to_split.members[closest_point]].cond_density);
375 a_kappas.push_back(ublas::zero_vector<double>());
376
377 vector<ublas::vector<double> > b_kappas;
378 b_kappas.push_back(expr_records[to_split.members[farthest_point]].cond_density);
379 b_kappas.push_back(ublas::zero_vector<double>());
380
381 ClusterStats a_cluster(num_conditions);
382 ClusterStats b_cluster(num_conditions);
383
384 a_cluster.mean = ublas::zero_vector<double>(num_conditions);
385 b_cluster.mean = ublas::zero_vector<double>(num_conditions);
386
387 for (size_t m = 0; m < to_split.members.size(); ++m)
388 {
389 a_kappas[1] = expr_records[to_split.members[m]].cond_density;
390 double a_js = jensen_shannon_distance(a_kappas);
391
392 b_kappas[1] = expr_records[to_split.members[m]].cond_density;
393 double b_js = jensen_shannon_distance(b_kappas);
394
395 if (a_js < b_js)
396 {
397 a_cluster.members.push_back(to_split.members[m]);
398 a_cluster.mean += expr_records[to_split.members[m]].cond_density;
399 }
400 else
401 {
402 b_cluster.members.push_back(to_split.members[m]);
403 b_cluster.mean += expr_records[to_split.members[m]].cond_density;
404 }
405 }
406
407 a_cluster.mean /= a_cluster.members.size();
408 b_cluster.mean /= b_cluster.members.size();
409
410 // Calculate the variances of the split clusters.
411 a_kappas[0] = a_cluster.mean;
412 b_kappas[0] = b_cluster.mean;
413
414 for (size_t m = 0; m < a_cluster.members.size(); ++m)
415 {
416 a_kappas[1] = expr_records[a_cluster.members[m]].cond_density;
417 double js = jensen_shannon_distance(a_kappas);
418 a_cluster.variance += (js * js);
419 }
420
421 a_cluster.variance /= a_cluster.members.size();
422
423 for (size_t m = 0; m < b_cluster.members.size(); ++m)
424 {
425 b_kappas[1] = expr_records[b_cluster.members[m]].cond_density;
426 double js = jensen_shannon_distance(b_kappas);
427 b_cluster.variance += (js * js);
428 }
429
430 b_cluster.variance /= b_cluster.members.size();
431
432 new_clusters.push_back(a_cluster);
433 new_clusters.push_back(b_cluster);
434 }
435
kmeans(vector<ExprRecord> & expr_records,int num_clusters,int num_iterations)436 void kmeans(vector<ExprRecord>& expr_records, int num_clusters, int num_iterations)
437 {
438 // generate k random means
439 size_t num_conditions = 0;
440 for (size_t i = 1; i < expr_records.size(); ++i)
441 {
442 if (expr_records[i - 1].FPKMs.size() != expr_records[i].FPKMs.size())
443 {
444 fprintf(stderr, "Error: not all records have the same number of conditions!\n");
445 exit(1);
446 }
447 num_conditions = expr_records[i].FPKMs.size();
448 }
449
450 // This is a typedef for a random number generator.
451 typedef boost::minstd_rand base_generator_type;
452 base_generator_type generator(time(NULL));
453
454 boost::uniform_real<> uni_dist(0,1);
455 boost::variate_generator<base_generator_type&, boost::uniform_real<> > uni(generator, uni_dist);
456
457 // each mean gets an accumulation vector and a counter. For now,
458 // let's just keep ClusterStats thin and do all this outside the class.
459 vector<ClusterStats> clusters(num_clusters, ClusterStats(num_conditions));
460
461 for (size_t i = 0; i < clusters.size(); ++i)
462 {
463 ClusterStats& c = clusters[i];
464 ublas::vector<double>& mean_i = c.mean;
465
466 for (size_t j = 0; j < mean_i.size(); ++j)
467 {
468 mean_i(j) = uni();
469 }
470 double total = accumulate(mean_i.begin(), mean_i.end(), 0.0);
471 mean_i /= total;
472 //cerr << mean_i << endl;
473 }
474
475 vector<int> assignments(expr_records.size());
476 vector<int> prev_assignments;
477
478 size_t iter = 0;
479 for (;iter < num_iterations; ++iter)
480 {
481 if (iter % 5 == 0)
482 {
483 fprintf(stderr, "Iteration # %d\n", iter);
484 }
485
486 assign_to_nearest_cluster(expr_records, clusters);
487 get_assignments(clusters, assignments);
488
489 sort(clusters.begin(), clusters.end(), SortByVariance());
490
491 vector<ClusterStats> empty_clusters;
492 vector<ClusterStats> full_clusters;
493
494 // Look for empty clusters, and if found split others (in order of
495 // largest variance)
496 for(size_t p = 0; p < clusters.size(); ++p)
497 {
498 const ClusterStats& P = clusters[p];
499 if (P.members.size() == 0) // found empty cluster
500 {
501 empty_clusters.push_back(P);
502 }
503 else
504 {
505 full_clusters.push_back(P);
506 }
507 }
508
509 if (!empty_clusters.empty())
510 {
511 vector<ClusterStats> new_clusters;
512 for (size_t c = 0; c < empty_clusters.size(); ++c)
513 {
514 //size_t full = 0;
515 // split this cluster.
516 ClusterStats to_split = full_clusters.back();
517 full_clusters.pop_back();
518 split_cluster(expr_records, to_split, new_clusters);
519 //get_assignments(clusters, assignments);
520 //prev_assignments = assignments;
521 }
522
523 full_clusters.insert(full_clusters.end(), new_clusters.begin(), new_clusters.end());
524 clusters = full_clusters;
525 get_assignments(clusters, assignments);
526 prev_assignments = assignments;
527 }
528
529 else
530 {
531 if (assignments == prev_assignments)
532 {
533 break;
534 }
535 else
536 {
537 prev_assignments = assignments;
538 }
539
540 }
541 }
542
543 for (size_t a = 0; a < assignments.size(); ++a)
544 {
545 expr_records[a].cluster_id = assignments[a];
546 }
547
548 if (iter == num_iterations)
549 {
550 fprintf(stderr, "Warning: clustering did not converge after %d iterations\n", iter);
551 }
552 }
553
driver(FILE * fpkm_file,FILE * spec_out,FILE * row_matrix_out,FILE * row_density_out)554 void driver(FILE* fpkm_file, FILE* spec_out, FILE* row_matrix_out, FILE* row_density_out)
555 {
556 char buf[10 * 1024];
557
558 vector<string> sample_names;
559
560 int line_num = 1;
561
562 while(fgets(buf, sizeof(buf), fpkm_file))
563 {
564 if (buf[0])
565 {
566 // Chomp the newline
567 char* nl = strrchr(buf, '\n');
568 if (nl) *nl = 0;
569
570 vector<string> tokens;
571 tokenize(buf, "\t", tokens);
572
573 if (tokens.size() < 16)
574 {
575 fprintf(stderr, "Error: FPKM tracking files must have at least 12 columns\n");
576 exit(1);
577 }
578
579 if ((tokens.size() - 10) % 3 != 0)
580 {
581 fprintf(stderr, "Error: FPKM tracking files must have FPKM, FPKM_lo, and FPKM_hi columns for each sample\n");
582 exit(1);
583 }
584
585 static const size_t first_sample_idx = 10;
586
587 for (size_t i = first_sample_idx; i < tokens.size(); i += 3)
588 {
589 string FPKM_label = tokens[i];
590 string::size_type name_end = FPKM_label.rfind("_FPKM");
591 if (name_end == string::npos)
592 {
593 fprintf(stderr, "Error: Malformed FPKM column header %s. Should end in \"_FPKM\".\n", FPKM_label.c_str());
594 exit(1);
595 }
596 string name = FPKM_label.substr(0, name_end);
597 sample_names.push_back(name);
598 }
599
600 break;
601 }
602
603 line_num++;
604 }
605
606 fprintf(spec_out, "tracking_id\tclass_code\tnearest_ref\tgene_id\tgene_short_name\ttss_id\tlocus\tlength\tcoverage\tstatus\ttotal_FPKM\ttotal_FPKM_lo\ttotal_FPKM_hi\tmax_FPKM\tcluster_id");
607
608 for (size_t i = 0; i < sample_names.size(); ++i)
609 {
610 fprintf(spec_out, "\t%s", sample_names[i].c_str());
611 }
612 fprintf(spec_out, "\n");
613
614 vector<ExprRecord> expr_records;
615
616 while(fgets(buf, sizeof(buf), fpkm_file))
617 {
618 // Chomp the newline
619 char* nl = strrchr(buf, '\n');
620 if (nl) *nl = 0;
621
622 vector<string> tokens;
623 tokenize(buf, "\t", tokens);
624
625 if (((tokens.size() - 10) / 3) != sample_names.size())
626 {
627 fprintf(stderr, "Error: Line %d has %lu columns, should have %lu\n", line_num, tokens.size(), (sample_names.size() * 3) + 6);
628 exit(1);
629 }
630
631 ExprRecord rec;
632
633 rec.tracking_id = tokens[0];
634 rec.class_code = tokens[1];
635 rec.nearest_ref_id = tokens[2];
636 rec.gene_id = tokens[3];
637 rec.gene_short_name = tokens[4];
638 rec.tss_id = tokens[5];
639 rec.locus = tokens[6];
640
641 rec.length = tokens[7];
642 rec.coverage = tokens[8];
643 rec.status = tokens[9];
644
645 static const size_t first_sample_idx = 10;
646
647 size_t num_samples = sample_names.size();
648 ublas::vector<double> u1 = ublas::unit_vector<double>(sample_names.size(), 0);
649 ublas::vector<double> u2 = ublas::unit_vector<double>(sample_names.size(), 1);
650
651 vector<ublas::vector<double> > norm_kappas;
652 norm_kappas.push_back(u1);
653 norm_kappas.push_back(u2);
654
655 double norm_js = jensen_shannon_distance(norm_kappas);
656 double max_FPKM = -1;
657 double max_log_FPKM = -1;
658
659 for (size_t i = first_sample_idx; i < tokens.size() - 2; i += 3)
660 {
661 string FPKM_string = tokens[i];
662 double fpkm = atof(FPKM_string.c_str());
663
664 string FPKM_conf_lo_string = tokens[i+1];
665 double fpkm_conf_lo = atof(FPKM_conf_lo_string.c_str());
666
667 string FPKM_conf_hi_string = tokens[i+2];
668 double fpkm_conf_hi = atof(FPKM_conf_hi_string.c_str());
669
670 double log_fpkm = log10(fpkm + 1.0);
671 double log_fpkm_conf_lo = log10(fpkm_conf_lo + 1.0);
672 double log_fpkm_conf_hi = log10(fpkm_conf_hi + 1.0);
673
674 if (isnan(fpkm) || isnan(fpkm_conf_lo) || isnan(fpkm_conf_hi))
675 {
676 fprintf (stderr, "Warning: gene %s (%s) on line %d has FPKM = NaN\n",
677 rec.tracking_id.c_str(), rec.gene_short_name.c_str(), line_num);
678 fpkm = 0.0;
679 fpkm_conf_lo = 0;
680 fpkm_conf_hi = 0;
681
682 log_fpkm = std::numeric_limits<double>::infinity();
683 log_fpkm_conf_lo = std::numeric_limits<double>::infinity();
684 log_fpkm_conf_hi = std::numeric_limits<double>::infinity();
685 }
686 else
687 {
688 if (log_fpkm > max_log_FPKM)
689 {
690 max_log_FPKM = log_fpkm;
691 }
692
693 if (fpkm > max_FPKM)
694 {
695 max_FPKM = fpkm;
696 }
697 }
698
699 rec.FPKMs.push_back(fpkm);
700 rec.FPKM_conf_los.push_back(fpkm_conf_lo);
701 rec.FPKM_conf_his.push_back(fpkm_conf_hi);
702
703 rec.log_FPKMs.push_back(log_fpkm);
704 rec.log_FPKM_conf_los.push_back(log_fpkm_conf_lo);
705 rec.log_FPKM_conf_his.push_back(log_fpkm_conf_hi);
706 }
707
708 rec.total_FPKM = accumulate(rec.FPKMs.begin(), rec.FPKMs.end(), 0.0);
709 rec.total_FPKM_conf_lo = accumulate(rec.FPKM_conf_los.begin(), rec.FPKM_conf_los.end(), 0.0);
710 rec.total_FPKM_conf_hi = accumulate(rec.FPKM_conf_his.begin(), rec.FPKM_conf_his.end(), 0.0);
711 rec.max_FPKM = max_FPKM;
712
713 rec.total_log_FPKM = accumulate(rec.log_FPKMs.begin(), rec.log_FPKMs.end(), 0.0);
714 rec.total_log_FPKM_conf_lo = accumulate(rec.log_FPKM_conf_los.begin(), rec.log_FPKM_conf_los.end(), 0.0);
715 rec.total_log_FPKM_conf_hi = accumulate(rec.log_FPKM_conf_his.begin(), rec.log_FPKM_conf_his.end(), 0.0);
716 rec.max_log_FPKM = max_log_FPKM;
717
718 assert (!isnan(rec.total_FPKM) && !isinf(rec.total_FPKM));
719
720 rec.cond_density = ublas::vector<double>(sample_names.size());
721 rec.cond_specificities = vector<double>(sample_names.size(), std::numeric_limits<double>::max());
722
723
724 if (rec.total_FPKM == 0.0 || (log_transform_fpkm && (rec.total_log_FPKM == 0.0 || rec.total_log_FPKM == std::numeric_limits<double>::infinity())))
725 {
726 rec.cond_density = ublas::zero_vector<double>(sample_names.size());
727 }
728 else
729 {
730 for (size_t i = 0; i < rec.cond_density.size(); ++i)
731 {
732 if (log_transform_fpkm)
733 {
734
735 rec.cond_density(i) = rec.log_FPKMs[i] / rec.total_log_FPKM;
736 }
737 else
738 {
739 rec.cond_density(i) = rec.FPKMs[i] / rec.total_FPKM;
740 }
741
742
743 }
744
745 //fpkm_dists.push_back(FPKM_dist);
746
747 //cerr << tracking_id << FPKM_dist<< endl;
748
749 const size_t N = sample_names.size();
750
751 assert (N >= 2);
752
753 vector<ublas::vector<double> > kappas;
754 kappas.push_back(rec.cond_density);
755 kappas.push_back(ublas::zero_vector<double>(sample_names.size()));
756
757 for (size_t i = 0; i < sample_names.size(); ++i)
758 {
759 ublas::vector<double> specific_vec = ublas::unit_vector<double>(N, i);
760 kappas[1] = specific_vec;
761
762 double js = jensen_shannon_distance(kappas);
763 js /= norm_js;
764
765 rec.cond_specificities[i] = 1.0 - js;
766 }
767 }
768
769 expr_records.push_back(rec);
770
771 line_num++;
772 }
773
774 if (k_clusters > 0)
775 {
776 kmeans(expr_records, k_clusters, max_iterations);
777 }
778
779
780 for (size_t i = 0; i < expr_records.size(); ++i)
781 {
782 const ExprRecord& rec = expr_records[i];
783
784 char cluster_str[256];
785
786 if (k_clusters == 0)
787 {
788 sprintf(cluster_str, "-");
789 }
790 else
791 {
792 sprintf(cluster_str, "%d", rec.cluster_id);
793 }
794
795 fprintf(spec_out,
796 "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%g\t%g\t%g\t%lg\t%s",
797 rec.tracking_id.c_str(),
798 rec.class_code.c_str(),
799 rec.nearest_ref_id.c_str(),
800 rec.gene_id.c_str(),
801 rec.gene_short_name.c_str(),
802 rec.tss_id.c_str(),
803 rec.locus.c_str(),
804 rec.length.c_str(),
805 rec.coverage.c_str(),
806 rec.status.c_str(),
807 rec.total_FPKM,
808 rec.total_FPKM_conf_lo,
809 rec.total_FPKM_conf_hi,
810 rec.max_FPKM,
811 cluster_str);
812
813 for (size_t i = 0; i < rec.cond_density.size(); ++i)
814 {
815 fprintf(spec_out, "\t%g", rec.cond_specificities[i]);
816 }
817
818 fprintf(spec_out, "\n");
819 }
820
821 // if (row_matrix_out)
822 // {
823 // for (size_t i = 0; i < fpkm_dists.size(); ++i)
824 // {
825 // const ublas::vector<double>& i_vec = fpkm_dists[i];
826 // vector<ublas::vector<double> > kappas;
827 // kappas.push_back(i_vec);
828 // kappas.push_back(ublas::zero_vector<double>(i_vec.size()));
829 // if (i % 100 == 0)
830 // {
831 // fprintf(stderr, "%lu of %lu (%g percent)\n", i, fpkm_dists.size(), (double)i/(double)fpkm_dists.size());
832 // }
833 //
834 // for(size_t j = 0; j < fpkm_dists.size(); ++j)
835 // {
836 // const ublas::vector<double>& j_vec = fpkm_dists[j];
837 // kappas[1] = j_vec;
838 // double i_j_js = jensen_shannon_distance(kappas);
839 // if (j == fpkm_dists.size() - 1)
840 // {
841 // fprintf(row_matrix_out, "%g\n", i_j_js);
842 // }
843 // else
844 // {
845 // fprintf(row_matrix_out, "%g\t", i_j_js);
846 // }
847 //
848 // }
849 // }
850 // }
851 //
852 if (row_density_out)
853 {
854 for (size_t i = 0; i < expr_records.size(); ++i)
855 {
856 const ExprRecord& rec = expr_records[i];
857 const ublas::vector<double>& i_vec = rec.cond_density;
858 fprintf(row_density_out, "%s\t", rec.tracking_id.c_str());
859 for (size_t j = 0; j < i_vec.size(); ++j)
860 {
861 if (j == i_vec.size() - 1)
862 {
863 fprintf(row_density_out, "%g\n", i_vec(j));
864 }
865 else
866 {
867 fprintf(row_density_out, "%g\t", i_vec(j));
868 }
869 }
870 }
871 }
872 }
873
main(int argc,char ** argv)874 int main(int argc, char** argv)
875 {
876 int parse_ret = parse_options(argc,argv);
877 if (parse_ret)
878 return parse_ret;
879
880 if(optind >= argc)
881 {
882 print_usage();
883 return 1;
884 }
885
886 string fpkm_filename = argv[optind++];
887
888 if(optind >= argc)
889 {
890 print_usage();
891 return 1;
892 }
893
894 string spec_out_filename = argv[optind++];
895
896
897 FILE* fpkm_file = fopen(fpkm_filename.c_str(), "r");
898 if (!fpkm_file)
899 {
900 fprintf(stderr, "Error: cannot open FPKM tracking file %s for reading\n",
901 fpkm_filename.c_str());
902 exit(1);
903 }
904
905 FILE* spec_out_file = fopen(spec_out_filename.c_str(), "w");
906 if (!spec_out_file)
907 {
908 fprintf(stderr, "Error: cannot open output file %s for writing\n",
909 spec_out_filename.c_str());
910 exit(1);
911 }
912
913 FILE* row_matrix_out = NULL;
914 if (compute_row_matrix)
915 {
916 row_matrix_out = fopen(row_matrix_out_filename.c_str(), "w");
917 if (!row_matrix_out)
918 {
919 fprintf(stderr, "Error: cannot open output file %s for writing\n",
920 row_matrix_out_filename.c_str());
921 exit(1);
922 }
923 }
924
925 FILE* row_density_out = NULL;
926 if (output_row_density)
927 {
928 row_density_out = fopen(row_density_out_filename.c_str(), "w");
929 if (!row_density_out)
930 {
931 fprintf(stderr, "Error: cannot open output file %s for writing\n",
932 row_density_out_filename.c_str());
933 exit(1);
934 }
935 }
936
937 driver(fpkm_file, spec_out_file, row_matrix_out, row_density_out);
938
939 return 0;
940 }
941
942
943