1 /*
2  *  sorting_hat.cpp
3  *  cufflinks
4  *
5  *  Created by Cole Trapnell on 8/30/10.
6  *  Copyright 2010 Cole Trapnell. All rights reserved.
7  *
8  */
9 
10 #include <stdlib.h>
11 #include <getopt.h>
12 #include <string>
13 
14 #include <boost/numeric/ublas/matrix.hpp>
15 #include <boost/numeric/ublas/matrix_proxy.hpp>
16 #include <boost/numeric/ublas/vector.hpp>
17 #include <boost/numeric/ublas/vector_proxy.hpp>
18 #include <boost/numeric/ublas/io.hpp>
19 #include <boost/random/linear_congruential.hpp>
20 #include <boost/random/uniform_real.hpp>
21 #include <boost/random/variate_generator.hpp>
22 
23 
24 #include "common.h"
25 #include "tokenize.h"
26 
27 #include "differential.h"
28 
29 using namespace std;
30 using namespace boost;
31 
32 bool compute_row_matrix = false;
33 bool log_transform_fpkm = false;
34 bool output_row_density = false;
35 
36 int k_clusters = 0;
37 int max_iterations = 1000;
38 
39 string row_matrix_out_filename = "";
40 string row_density_out_filename = "";
41 
42 #if ENABLE_THREADS
43 const char *short_options = "o:p:d:P:k:I:l";
44 #else
45 const char *short_options = "o:d:P:k:I:l";
46 #endif
47 
48 static struct option long_options[] = {
49     {"output-dir",			    required_argument,		 0,			 'o'},
50     {"row-matrix",			    required_argument,       0,			 'd'},
51     {"row-densities",			required_argument,       0,			 'P'},
52     {"log-fpkm",			    no_argument,             0,			 'l'},
53     {"k-means",                 required_argument,       0,			 'k'},
54     {"max-iterations",          required_argument,       0,			 'I'},
55 #if ENABLE_THREADS
56     {"num-threads",				required_argument,       0,          'p'},
57 #endif
58     {0, 0, 0, 0} // terminator
59 };
60 
print_usage()61 void print_usage()
62 {
63 	fprintf(stderr, "sorting_hat v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
64 	fprintf(stderr, "-----------------------------\n");
65 
66 	//NOTE: SPACES ONLY, bozo
67     fprintf(stderr, "Usage:   cuffdiff [options] <input.fpkm_tracking> <output.shout>\n");
68 	fprintf(stderr, "Options:\n\n");
69 	fprintf(stderr, "-o/--output-dir              write all output files to this directory              [ default:     ./ ]\n");
70     fprintf(stderr, "-d/--row-matrix              compute row distance matrix                           [ default:     off ]\n");
71     fprintf(stderr, "-l/--log-fpkm                JS on log(fpkm+1) instead of                          [ default:     off ]\n");
72 #if ENABLE_THREADS
73 	fprintf(stderr, "-p/--num-threads             number of threads used during assembly                [ default:      1 ]\n");
74 #endif
75 }
76 
parse_options(int argc,char ** argv)77 int parse_options(int argc, char** argv)
78 {
79     int option_index = 0;
80     int next_option;
81     do {
82         next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
83         switch (next_option) {
84 			case -1:     /* Done with options. */
85 				break;
86 
87             case 'd':
88 			{
89 				compute_row_matrix = true;
90                 row_matrix_out_filename = optarg;
91 				break;
92 			}
93 
94             case 'P':
95 			{
96 				output_row_density = true;
97                 row_density_out_filename = optarg;
98 				break;
99 			}
100 
101             case 'l':
102 			{
103 				log_transform_fpkm = true;
104 				break;
105 			}
106 
107             case 'k':
108 			{
109 				k_clusters = (int)parseInt(1, "-k/--k-means arg must be at least 1", print_usage);
110 				break;
111 			}
112 
113             case 'I':
114 			{
115 				max_iterations = (int)parseInt(1, "-I/--max-iterations arg must be at least 1", print_usage);
116 				break;
117 			}
118 
119             case 'o':
120 			{
121 				output_dir = optarg;
122 				break;
123 			}
124 
125 			default:
126 				print_usage();
127 				return 1;
128         }
129     } while(next_option != -1);
130 
131 	allow_junk_filtering = false;
132 
133 	return 0;
134 }
135 
136 
137 struct ExprRecord
138 {
ExprRecordExprRecord139     ExprRecord() :
140         total_FPKM(0.0),
141         total_FPKM_conf_hi(0.0),
142         total_FPKM_conf_lo(0.0),
143         max_FPKM(0.0),
144         total_log_FPKM(std::numeric_limits<double>::infinity()),
145         total_log_FPKM_conf_hi(std::numeric_limits<double>::infinity()),
146         total_log_FPKM_conf_lo(std::numeric_limits<double>::infinity()),
147         max_log_FPKM(std::numeric_limits<double>::infinity()),
148         cluster_id(-1) {}
149 
150     string tracking_id;
151     string class_code;
152     string nearest_ref_id;
153     string gene_id;
154     string gene_short_name;
155     string tss_id;
156     string locus;
157 
158     string length;
159     string coverage;
160     string status;
161 
162     vector<double> FPKMs;
163     vector<double> FPKM_conf_los;
164     vector<double> FPKM_conf_his;
165 
166     vector<double> log_FPKMs;
167     vector<double> log_FPKM_conf_los;
168     vector<double> log_FPKM_conf_his;
169 
170     double total_FPKM;
171     double total_FPKM_conf_hi;
172     double total_FPKM_conf_lo;
173     double max_FPKM;
174 
175     double total_log_FPKM;
176     double total_log_FPKM_conf_hi;
177     double total_log_FPKM_conf_lo;
178     double max_log_FPKM;
179 
180     ublas::vector<double> cond_density;
181     vector<double> cond_specificities;
182 
183     int cluster_id;
184 };
185 
186 struct ClusterStats
187 {
ClusterStatsClusterStats188     ClusterStats(int dim)
189     {
190         mean = ublas::zero_vector<double>(dim);
191         variance = 0.0;
192     }
193 
194     ublas::vector<double> mean;
195     double variance;
196     vector<size_t> members;
197 };
198 
199 struct SortByVariance
200 {
operator ()SortByVariance201     bool operator()(const ClusterStats& lhs, const ClusterStats& rhs)
202     {
203         if (lhs.variance != rhs.variance)
204             return lhs.variance < rhs.variance;
205         else if (lhs.members.size() != rhs.members.size())
206             return lhs.members.size() < rhs.members.size();
207         return false;
208     }
209 };
210 
211 
assign_to_nearest_cluster(vector<ExprRecord> & expr_records,vector<ClusterStats> & clusters)212 void assign_to_nearest_cluster(vector<ExprRecord>& expr_records, vector<ClusterStats>& clusters)
213 {
214 
215     int num_clusters = clusters.size();
216     int num_conditions = expr_records.front().cond_density.size();
217 
218     BOOST_FOREACH(ClusterStats& c, clusters)
219     {
220         c.members.clear();
221     }
222 
223     // E step - assign each record to nearest cluster
224     for (size_t r = 0; r < expr_records.size(); ++r)
225     {
226         ExprRecord& rec = expr_records[r];
227 
228         // Skip unexpressed records;
229         if (log_transform_fpkm)
230         {
231             if (rec.total_log_FPKM == 0 ||
232                 rec.total_log_FPKM == std::numeric_limits<double>::infinity())
233             {
234                 continue;
235             }
236         }
237         else
238         {
239             if (rec.total_FPKM == 0)
240             {
241                 continue;
242             }
243         }
244 
245         double min_dist = std::numeric_limits<double>::max();
246         size_t closest_mean = 0;
247         vector<ublas::vector<double> >  kappas;
248 
249         kappas.push_back(rec.cond_density);
250         //cerr << rec.cond_density << endl;
251         kappas.push_back(ublas::zero_vector<double>());
252 
253         for (size_t m = 0; m < clusters.size(); ++m)
254         {
255             kappas[1] = clusters[m].mean;
256             double js = jensen_shannon_distance(kappas);
257             if (js < min_dist)
258             {
259                 closest_mean = m;
260                 min_dist = js;
261             }
262         }
263 
264         clusters[closest_mean].members.push_back(r);
265     }
266 
267     // M step - calculate new means
268 
269     vector<ublas::vector<double> > tmp_means(num_clusters,
270                                              ublas::zero_vector<double>(num_conditions));
271     // Calculate mean
272 
273     for (size_t c = 0; c < clusters.size(); ++c)
274     {
275         ClusterStats& cluster = clusters[c];
276         BOOST_FOREACH(int m, cluster.members)
277         {
278             tmp_means[c] += expr_records[m].cond_density;
279         }
280     }
281 
282     for (size_t c = 0; c < clusters.size(); ++c)
283     {
284         if (clusters[c].members.size() > 0)
285         {
286             clusters[c].mean = tmp_means[c] / clusters[c].members.size();
287         }
288     }
289 
290     // And now the variance
291     for (size_t c = 0; c < clusters.size(); ++c)
292     {
293         ClusterStats& cluster = clusters[c];
294         if (cluster.members.empty())
295             continue;
296 
297         vector<ublas::vector<double> >  kappas;
298         kappas.push_back(cluster.mean);
299         kappas.push_back(ublas::zero_vector<double>());
300         BOOST_FOREACH (int m, cluster.members)
301         {
302             kappas[1] = expr_records[m].cond_density;
303             double js = jensen_shannon_distance(kappas);
304             cluster.variance += (js * js);
305         }
306 
307         cluster.variance /= cluster.members.size();
308     }
309 }
310 
get_assignments(const vector<ClusterStats> & clusters,vector<int> & assignments)311 void get_assignments(const vector<ClusterStats>& clusters,
312                      vector<int>& assignments)
313 {
314     BOOST_FOREACH (int& i, assignments)
315     {
316         i = -1;
317     }
318 
319     for (size_t c = 0; c < clusters.size(); ++c)
320     {
321         for (size_t r = 0; r < clusters[c].members.size(); ++r)
322         {
323             assignments[clusters[c].members[r]] = c;
324         }
325     }
326 }
327 
328 
split_cluster(const vector<ExprRecord> & expr_records,ClusterStats & to_split,vector<ClusterStats> & new_clusters)329 void split_cluster(const vector<ExprRecord>& expr_records,
330                    ClusterStats& to_split,
331                    vector<ClusterStats>& new_clusters)
332 {
333 
334     fprintf (stderr, "Splitting cluster...\n");
335 
336     int num_conditions = expr_records.front().cond_density.size();
337 
338     vector<ublas::vector<double> >  kappas;
339     kappas.push_back(to_split.mean);
340 
341     vector<double> dist_from_mean;
342 
343     for (size_t m = 0; m < to_split.members.size(); ++m)
344     {
345         kappas.push_back(expr_records[to_split.members[m]].cond_density);
346         double js = jensen_shannon_distance(kappas);
347         dist_from_mean.push_back(js);
348     }
349 
350     // pick the point farthest from the mean and the point closest.
351 
352     double largest_dist = -1.0;
353     size_t farthest_point = 0;
354 
355     double smallest_dist = std::numeric_limits<double>::max();
356     size_t closest_point = 0;
357 
358     for (size_t i = 0; i < dist_from_mean.size(); ++i)
359     {
360         if (largest_dist > dist_from_mean[i])
361         {
362             largest_dist = dist_from_mean[i];
363             farthest_point = i;
364         }
365 
366         if (smallest_dist < dist_from_mean[i])
367         {
368             smallest_dist = dist_from_mean[i];
369             closest_point = i;
370         }
371     }
372 
373     vector<ublas::vector<double> > a_kappas;
374     a_kappas.push_back(expr_records[to_split.members[closest_point]].cond_density);
375     a_kappas.push_back(ublas::zero_vector<double>());
376 
377     vector<ublas::vector<double> > b_kappas;
378     b_kappas.push_back(expr_records[to_split.members[farthest_point]].cond_density);
379     b_kappas.push_back(ublas::zero_vector<double>());
380 
381     ClusterStats a_cluster(num_conditions);
382     ClusterStats b_cluster(num_conditions);
383 
384     a_cluster.mean = ublas::zero_vector<double>(num_conditions);
385     b_cluster.mean = ublas::zero_vector<double>(num_conditions);
386 
387     for (size_t m = 0; m < to_split.members.size(); ++m)
388     {
389         a_kappas[1] = expr_records[to_split.members[m]].cond_density;
390         double a_js = jensen_shannon_distance(a_kappas);
391 
392         b_kappas[1] = expr_records[to_split.members[m]].cond_density;
393         double b_js = jensen_shannon_distance(b_kappas);
394 
395         if (a_js < b_js)
396         {
397             a_cluster.members.push_back(to_split.members[m]);
398             a_cluster.mean += expr_records[to_split.members[m]].cond_density;
399         }
400         else
401         {
402             b_cluster.members.push_back(to_split.members[m]);
403             b_cluster.mean += expr_records[to_split.members[m]].cond_density;
404         }
405     }
406 
407     a_cluster.mean /= a_cluster.members.size();
408     b_cluster.mean /= b_cluster.members.size();
409 
410     // Calculate the variances of the split clusters.
411     a_kappas[0] = a_cluster.mean;
412     b_kappas[0] = b_cluster.mean;
413 
414     for (size_t m = 0; m < a_cluster.members.size(); ++m)
415     {
416         a_kappas[1] = expr_records[a_cluster.members[m]].cond_density;
417         double js = jensen_shannon_distance(a_kappas);
418         a_cluster.variance += (js * js);
419     }
420 
421     a_cluster.variance /= a_cluster.members.size();
422 
423     for (size_t m = 0; m < b_cluster.members.size(); ++m)
424     {
425         b_kappas[1] = expr_records[b_cluster.members[m]].cond_density;
426         double js = jensen_shannon_distance(b_kappas);
427         b_cluster.variance += (js * js);
428     }
429 
430     b_cluster.variance /= b_cluster.members.size();
431 
432     new_clusters.push_back(a_cluster);
433     new_clusters.push_back(b_cluster);
434 }
435 
kmeans(vector<ExprRecord> & expr_records,int num_clusters,int num_iterations)436 void kmeans(vector<ExprRecord>& expr_records, int num_clusters, int num_iterations)
437 {
438     // generate k random means
439     size_t num_conditions = 0;
440     for (size_t i = 1; i < expr_records.size(); ++i)
441     {
442         if (expr_records[i - 1].FPKMs.size() != expr_records[i].FPKMs.size())
443         {
444             fprintf(stderr, "Error: not all records have the same number of conditions!\n");
445             exit(1);
446         }
447         num_conditions = expr_records[i].FPKMs.size();
448     }
449 
450     // This is a typedef for a random number generator.
451     typedef boost::minstd_rand base_generator_type;
452     base_generator_type generator(time(NULL));
453 
454     boost::uniform_real<> uni_dist(0,1);
455     boost::variate_generator<base_generator_type&, boost::uniform_real<> > uni(generator, uni_dist);
456 
457     // each mean gets an accumulation vector and a counter.  For now,
458     // let's just keep ClusterStats thin and do all this outside the class.
459     vector<ClusterStats> clusters(num_clusters, ClusterStats(num_conditions));
460 
461     for (size_t i = 0; i < clusters.size(); ++i)
462     {
463         ClusterStats& c = clusters[i];
464         ublas::vector<double>& mean_i = c.mean;
465 
466         for (size_t j = 0; j < mean_i.size(); ++j)
467         {
468             mean_i(j) = uni();
469         }
470         double total = accumulate(mean_i.begin(), mean_i.end(), 0.0);
471         mean_i /= total;
472         //cerr << mean_i << endl;
473     }
474 
475     vector<int> assignments(expr_records.size());
476     vector<int> prev_assignments;
477 
478     size_t iter = 0;
479     for (;iter < num_iterations; ++iter)
480     {
481         if (iter % 5 == 0)
482         {
483             fprintf(stderr, "Iteration # %d\n", iter);
484         }
485 
486         assign_to_nearest_cluster(expr_records, clusters);
487         get_assignments(clusters, assignments);
488 
489         sort(clusters.begin(), clusters.end(), SortByVariance());
490 
491         vector<ClusterStats> empty_clusters;
492         vector<ClusterStats> full_clusters;
493 
494         // Look for empty clusters, and if found split others (in order of
495         // largest variance)
496         for(size_t p = 0; p < clusters.size(); ++p)
497         {
498             const ClusterStats& P = clusters[p];
499             if (P.members.size() == 0) // found empty cluster
500             {
501                 empty_clusters.push_back(P);
502             }
503             else
504             {
505                 full_clusters.push_back(P);
506             }
507         }
508 
509         if (!empty_clusters.empty())
510         {
511             vector<ClusterStats> new_clusters;
512             for (size_t c = 0; c < empty_clusters.size(); ++c)
513             {
514                 //size_t full = 0;
515                 // split this cluster.
516                 ClusterStats to_split = full_clusters.back();
517                 full_clusters.pop_back();
518                 split_cluster(expr_records, to_split, new_clusters);
519                 //get_assignments(clusters, assignments);
520                 //prev_assignments = assignments;
521             }
522 
523             full_clusters.insert(full_clusters.end(), new_clusters.begin(), new_clusters.end());
524             clusters = full_clusters;
525             get_assignments(clusters, assignments);
526             prev_assignments = assignments;
527         }
528 
529         else
530         {
531             if (assignments == prev_assignments)
532             {
533                 break;
534             }
535             else
536             {
537                 prev_assignments = assignments;
538             }
539 
540         }
541     }
542 
543     for (size_t a = 0; a < assignments.size(); ++a)
544     {
545         expr_records[a].cluster_id = assignments[a];
546     }
547 
548     if (iter == num_iterations)
549     {
550         fprintf(stderr, "Warning: clustering did not converge after %d iterations\n", iter);
551     }
552 }
553 
driver(FILE * fpkm_file,FILE * spec_out,FILE * row_matrix_out,FILE * row_density_out)554 void driver(FILE* fpkm_file, FILE* spec_out, FILE* row_matrix_out, FILE* row_density_out)
555 {
556     char buf[10 * 1024];
557 
558     vector<string> sample_names;
559 
560     int line_num = 1;
561 
562     while(fgets(buf, sizeof(buf), fpkm_file))
563     {
564         if (buf[0])
565         {
566             // Chomp the newline
567             char* nl = strrchr(buf, '\n');
568             if (nl) *nl = 0;
569 
570             vector<string> tokens;
571             tokenize(buf, "\t", tokens);
572 
573             if (tokens.size() < 16)
574             {
575                 fprintf(stderr, "Error:  FPKM tracking files must have at least 12 columns\n");
576                 exit(1);
577             }
578 
579             if ((tokens.size() - 10) % 3 != 0)
580             {
581                 fprintf(stderr, "Error:  FPKM tracking files must have FPKM, FPKM_lo, and FPKM_hi columns for each sample\n");
582                 exit(1);
583             }
584 
585             static const size_t first_sample_idx = 10;
586 
587             for (size_t i = first_sample_idx; i < tokens.size(); i += 3)
588             {
589                 string FPKM_label = tokens[i];
590                 string::size_type name_end = FPKM_label.rfind("_FPKM");
591                 if (name_end == string::npos)
592                 {
593                     fprintf(stderr, "Error:  Malformed FPKM column header %s.  Should end in \"_FPKM\".\n", FPKM_label.c_str());
594                     exit(1);
595                 }
596                 string name = FPKM_label.substr(0, name_end);
597                 sample_names.push_back(name);
598             }
599 
600             break;
601         }
602 
603         line_num++;
604     }
605 
606     fprintf(spec_out, "tracking_id\tclass_code\tnearest_ref\tgene_id\tgene_short_name\ttss_id\tlocus\tlength\tcoverage\tstatus\ttotal_FPKM\ttotal_FPKM_lo\ttotal_FPKM_hi\tmax_FPKM\tcluster_id");
607 
608     for (size_t i = 0; i < sample_names.size(); ++i)
609     {
610         fprintf(spec_out, "\t%s", sample_names[i].c_str());
611     }
612     fprintf(spec_out, "\n");
613 
614     vector<ExprRecord> expr_records;
615 
616     while(fgets(buf, sizeof(buf), fpkm_file))
617     {
618         // Chomp the newline
619 		char* nl = strrchr(buf, '\n');
620 		if (nl) *nl = 0;
621 
622         vector<string> tokens;
623         tokenize(buf, "\t", tokens);
624 
625         if (((tokens.size() - 10) / 3) != sample_names.size())
626         {
627             fprintf(stderr, "Error:  Line %d has %lu columns, should have %lu\n", line_num, tokens.size(), (sample_names.size() * 3) + 6);
628             exit(1);
629         }
630 
631         ExprRecord rec;
632 
633         rec.tracking_id = tokens[0];
634         rec.class_code = tokens[1];
635         rec.nearest_ref_id = tokens[2];
636         rec.gene_id = tokens[3];
637         rec.gene_short_name = tokens[4];
638         rec.tss_id = tokens[5];
639         rec.locus = tokens[6];
640 
641         rec.length = tokens[7];
642         rec.coverage = tokens[8];
643         rec.status = tokens[9];
644 
645         static const size_t first_sample_idx = 10;
646 
647         size_t num_samples = sample_names.size();
648         ublas::vector<double> u1 = ublas::unit_vector<double>(sample_names.size(), 0);
649         ublas::vector<double> u2 = ublas::unit_vector<double>(sample_names.size(), 1);
650 
651         vector<ublas::vector<double> > norm_kappas;
652         norm_kappas.push_back(u1);
653         norm_kappas.push_back(u2);
654 
655         double norm_js = jensen_shannon_distance(norm_kappas);
656         double max_FPKM = -1;
657         double max_log_FPKM = -1;
658 
659         for (size_t i = first_sample_idx; i < tokens.size() - 2; i += 3)
660         {
661             string FPKM_string = tokens[i];
662             double fpkm = atof(FPKM_string.c_str());
663 
664             string FPKM_conf_lo_string = tokens[i+1];
665             double fpkm_conf_lo = atof(FPKM_conf_lo_string.c_str());
666 
667             string FPKM_conf_hi_string = tokens[i+2];
668             double fpkm_conf_hi = atof(FPKM_conf_hi_string.c_str());
669 
670             double log_fpkm = log10(fpkm + 1.0);
671             double log_fpkm_conf_lo = log10(fpkm_conf_lo + 1.0);
672             double log_fpkm_conf_hi = log10(fpkm_conf_hi + 1.0);
673 
674             if (isnan(fpkm) || isnan(fpkm_conf_lo) || isnan(fpkm_conf_hi))
675             {
676                 fprintf (stderr, "Warning: gene %s (%s) on line %d has FPKM = NaN\n",
677                          rec.tracking_id.c_str(), rec.gene_short_name.c_str(), line_num);
678                 fpkm = 0.0;
679                 fpkm_conf_lo = 0;
680                 fpkm_conf_hi = 0;
681 
682                 log_fpkm = std::numeric_limits<double>::infinity();
683                 log_fpkm_conf_lo = std::numeric_limits<double>::infinity();
684                 log_fpkm_conf_hi = std::numeric_limits<double>::infinity();
685             }
686             else
687             {
688                 if (log_fpkm > max_log_FPKM)
689                 {
690                     max_log_FPKM = log_fpkm;
691                 }
692 
693                 if (fpkm > max_FPKM)
694                 {
695                     max_FPKM = fpkm;
696                 }
697             }
698 
699             rec.FPKMs.push_back(fpkm);
700             rec.FPKM_conf_los.push_back(fpkm_conf_lo);
701             rec.FPKM_conf_his.push_back(fpkm_conf_hi);
702 
703             rec.log_FPKMs.push_back(log_fpkm);
704             rec.log_FPKM_conf_los.push_back(log_fpkm_conf_lo);
705             rec.log_FPKM_conf_his.push_back(log_fpkm_conf_hi);
706         }
707 
708         rec.total_FPKM = accumulate(rec.FPKMs.begin(), rec.FPKMs.end(), 0.0);
709         rec.total_FPKM_conf_lo = accumulate(rec.FPKM_conf_los.begin(), rec.FPKM_conf_los.end(), 0.0);
710         rec.total_FPKM_conf_hi = accumulate(rec.FPKM_conf_his.begin(), rec.FPKM_conf_his.end(), 0.0);
711         rec.max_FPKM = max_FPKM;
712 
713         rec.total_log_FPKM = accumulate(rec.log_FPKMs.begin(), rec.log_FPKMs.end(), 0.0);
714         rec.total_log_FPKM_conf_lo = accumulate(rec.log_FPKM_conf_los.begin(), rec.log_FPKM_conf_los.end(), 0.0);
715         rec.total_log_FPKM_conf_hi = accumulate(rec.log_FPKM_conf_his.begin(), rec.log_FPKM_conf_his.end(), 0.0);
716         rec.max_log_FPKM = max_log_FPKM;
717 
718         assert (!isnan(rec.total_FPKM) && !isinf(rec.total_FPKM));
719 
720         rec.cond_density = ublas::vector<double>(sample_names.size());
721         rec.cond_specificities = vector<double>(sample_names.size(), std::numeric_limits<double>::max());
722 
723 
724         if (rec.total_FPKM == 0.0 || (log_transform_fpkm && (rec.total_log_FPKM == 0.0 || rec.total_log_FPKM == std::numeric_limits<double>::infinity())))
725         {
726              rec.cond_density = ublas::zero_vector<double>(sample_names.size());
727         }
728         else
729         {
730             for (size_t i = 0; i < rec.cond_density.size(); ++i)
731             {
732                 if (log_transform_fpkm)
733                 {
734 
735                     rec.cond_density(i) = rec.log_FPKMs[i] / rec.total_log_FPKM;
736                 }
737                 else
738                 {
739                     rec.cond_density(i) = rec.FPKMs[i] / rec.total_FPKM;
740                 }
741 
742 
743             }
744 
745             //fpkm_dists.push_back(FPKM_dist);
746 
747             //cerr << tracking_id << FPKM_dist<< endl;
748 
749             const size_t N = sample_names.size();
750 
751             assert (N >= 2);
752 
753             vector<ublas::vector<double> > kappas;
754             kappas.push_back(rec.cond_density);
755             kappas.push_back(ublas::zero_vector<double>(sample_names.size()));
756 
757             for (size_t i = 0; i < sample_names.size(); ++i)
758             {
759                 ublas::vector<double> specific_vec = ublas::unit_vector<double>(N, i);
760                 kappas[1] = specific_vec;
761 
762                 double js = jensen_shannon_distance(kappas);
763                 js /= norm_js;
764 
765                 rec.cond_specificities[i] = 1.0 - js;
766             }
767         }
768 
769         expr_records.push_back(rec);
770 
771         line_num++;
772     }
773 
774     if (k_clusters > 0)
775     {
776         kmeans(expr_records, k_clusters, max_iterations);
777     }
778 
779 
780     for (size_t i = 0; i < expr_records.size(); ++i)
781     {
782         const ExprRecord& rec = expr_records[i];
783 
784         char cluster_str[256];
785 
786         if (k_clusters == 0)
787         {
788             sprintf(cluster_str, "-");
789         }
790         else
791         {
792             sprintf(cluster_str, "%d", rec.cluster_id);
793         }
794 
795         fprintf(spec_out,
796                 "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%g\t%g\t%g\t%lg\t%s",
797                 rec.tracking_id.c_str(),
798                 rec.class_code.c_str(),
799                 rec.nearest_ref_id.c_str(),
800                 rec.gene_id.c_str(),
801                 rec.gene_short_name.c_str(),
802                 rec.tss_id.c_str(),
803                 rec.locus.c_str(),
804                 rec.length.c_str(),
805                 rec.coverage.c_str(),
806                 rec.status.c_str(),
807                 rec.total_FPKM,
808                 rec.total_FPKM_conf_lo,
809                 rec.total_FPKM_conf_hi,
810                 rec.max_FPKM,
811                 cluster_str);
812 
813         for (size_t i = 0; i < rec.cond_density.size(); ++i)
814         {
815             fprintf(spec_out, "\t%g", rec.cond_specificities[i]);
816         }
817 
818         fprintf(spec_out, "\n");
819     }
820 
821 //    if (row_matrix_out)
822 //    {
823 //         for (size_t i = 0; i < fpkm_dists.size(); ++i)
824 //         {
825 //             const ublas::vector<double>& i_vec = fpkm_dists[i];
826 //             vector<ublas::vector<double> > kappas;
827 //             kappas.push_back(i_vec);
828 //             kappas.push_back(ublas::zero_vector<double>(i_vec.size()));
829 //             if (i % 100 == 0)
830 //             {
831 //                 fprintf(stderr, "%lu of %lu (%g percent)\n", i, fpkm_dists.size(), (double)i/(double)fpkm_dists.size());
832 //             }
833 //
834 //             for(size_t j = 0; j < fpkm_dists.size(); ++j)
835 //             {
836 //                 const ublas::vector<double>& j_vec = fpkm_dists[j];
837 //                 kappas[1] = j_vec;
838 //                 double i_j_js = jensen_shannon_distance(kappas);
839 //                 if (j == fpkm_dists.size() - 1)
840 //                 {
841 //                     fprintf(row_matrix_out, "%g\n", i_j_js);
842 //                 }
843 //                 else
844 //                 {
845 //                     fprintf(row_matrix_out, "%g\t", i_j_js);
846 //                 }
847 //
848 //             }
849 //         }
850 //    }
851 //
852     if (row_density_out)
853     {
854         for (size_t i = 0; i < expr_records.size(); ++i)
855         {
856             const ExprRecord& rec = expr_records[i];
857             const ublas::vector<double>& i_vec = rec.cond_density;
858             fprintf(row_density_out, "%s\t", rec.tracking_id.c_str());
859             for (size_t j = 0; j < i_vec.size(); ++j)
860             {
861                 if (j == i_vec.size() - 1)
862                 {
863                     fprintf(row_density_out, "%g\n", i_vec(j));
864                 }
865                 else
866                 {
867                     fprintf(row_density_out, "%g\t", i_vec(j));
868                 }
869             }
870         }
871     }
872 }
873 
main(int argc,char ** argv)874 int main(int argc, char** argv)
875 {
876 	int parse_ret = parse_options(argc,argv);
877     if (parse_ret)
878         return parse_ret;
879 
880 	if(optind >= argc)
881     {
882         print_usage();
883         return 1;
884     }
885 
886     string fpkm_filename = argv[optind++];
887 
888     if(optind >= argc)
889     {
890         print_usage();
891         return 1;
892     }
893 
894     string spec_out_filename = argv[optind++];
895 
896 
897     FILE* fpkm_file = fopen(fpkm_filename.c_str(), "r");
898     if (!fpkm_file)
899     {
900         fprintf(stderr, "Error: cannot open FPKM tracking file %s for reading\n",
901                 fpkm_filename.c_str());
902         exit(1);
903     }
904 
905     FILE* spec_out_file = fopen(spec_out_filename.c_str(), "w");
906     if (!spec_out_file)
907     {
908         fprintf(stderr, "Error: cannot open output file %s for writing\n",
909                 spec_out_filename.c_str());
910         exit(1);
911     }
912 
913     FILE* row_matrix_out = NULL;
914     if (compute_row_matrix)
915     {
916         row_matrix_out = fopen(row_matrix_out_filename.c_str(), "w");
917         if (!row_matrix_out)
918         {
919             fprintf(stderr, "Error: cannot open output file %s for writing\n",
920                     row_matrix_out_filename.c_str());
921             exit(1);
922         }
923     }
924 
925     FILE* row_density_out = NULL;
926     if (output_row_density)
927     {
928         row_density_out = fopen(row_density_out_filename.c_str(), "w");
929         if (!row_density_out)
930         {
931             fprintf(stderr, "Error: cannot open output file %s for writing\n",
932                     row_density_out_filename.c_str());
933             exit(1);
934         }
935     }
936 
937     driver(fpkm_file, spec_out_file, row_matrix_out, row_density_out);
938 
939 	return 0;
940 }
941 
942 
943