1 #include "MinCollector.h"
2 #include <algorithm>
3 
4 // utility functions
5 
intersect(const std::vector<int> & x,const std::vector<int> & y)6 std::vector<int> intersect(const std::vector<int>& x, const std::vector<int>& y) {
7   std::vector<int> v;
8   auto a = x.begin();
9   auto b = y.begin();
10   while (a != x.end() && b != y.end()) {
11     if (*a < *b) {
12       ++a;
13     } else if (*b < *a) {
14       ++b;
15     } else {
16       v.push_back(*a);
17       ++a;
18       ++b;
19     }
20   }
21   return v;
22 }
23 
init_mean_fl_trunc(double mean,double sd)24 void MinCollector::init_mean_fl_trunc(double mean, double sd) {
25   auto tmp_trunc_fl = trunc_gaussian_fld(0, MAX_FRAG_LEN, mean, sd);
26   assert( tmp_trunc_fl.size() == mean_fl_trunc.size() );
27 
28   std::copy( tmp_trunc_fl.begin(), tmp_trunc_fl.end(), mean_fl_trunc.begin() );
29 
30   mean_fl = mean_fl_trunc[ MAX_FRAG_LEN - 1 ];
31 
32   has_mean_fl = true;
33   has_mean_fl_trunc = true;
34 }
35 
intersectKmers(std::vector<std::pair<KmerEntry,int>> & v1,std::vector<std::pair<KmerEntry,int>> & v2,bool nonpaired,std::vector<int> & u) const36 int MinCollector::intersectKmers(std::vector<std::pair<KmerEntry,int>>& v1,
37                           std::vector<std::pair<KmerEntry,int>>& v2, bool nonpaired, std::vector<int> &u) const {
38   std::vector<int> u1 = intersectECs(v1);
39   std::vector<int> u2 = intersectECs(v2);
40 
41   if (u1.empty() && u2.empty()) {
42     return -1;
43   }
44 
45   // non-strict intersection.
46   if (u1.empty()) {
47     if (v1.empty()) {
48       u = u2;
49     } else {
50       return -1;
51     }
52   } else if (u2.empty()) {
53     if (v2.empty()) {
54       u = u1;
55     } else {
56       return -1;
57     }
58   } else {
59     u = intersect(u1,u2);
60   }
61 
62   if (u.empty()) {
63     return -1;
64   }
65   return 1;
66 }
67 
collect(std::vector<std::pair<KmerEntry,int>> & v1,std::vector<std::pair<KmerEntry,int>> & v2,bool nonpaired)68 int MinCollector::collect(std::vector<std::pair<KmerEntry,int>>& v1,
69                           std::vector<std::pair<KmerEntry,int>>& v2, bool nonpaired) {
70   std::vector<int> u;
71   int r = intersectKmers(v1, v2, nonpaired, u);
72   if (r != -1) {
73     return increaseCount(u);
74   } else {
75     return -1;
76   }
77 }
78 
findEC(const std::vector<int> & u) const79 int MinCollector::findEC(const std::vector<int>& u) const {
80   if (u.empty()) {
81     return -1;
82   }
83   if (u.size() == 1) {
84     return u[0];
85   }
86   auto search = index.ecmapinv.find(u);
87   if (search != index.ecmapinv.end()) {
88     return search ->second;
89   } else {
90     return -1;
91   }
92 }
93 
increaseCount(const std::vector<int> & u)94 int MinCollector::increaseCount(const std::vector<int>& u) {
95   int ec = findEC(u);
96 
97   if (u.empty()) {
98     return -1;
99   } else {
100     if (ec != -1) {
101       ++counts[ec];
102       return ec;
103     } else {
104       auto necs = counts.size();
105       //index.ecmap.insert({necs,u});
106       index.ecmap.push_back(u);
107       index.ecmapinv.insert({u,necs});
108       counts.push_back(1);
109       return necs;
110     }
111   }
112 
113   /* -- removable
114   if (u.size() == 1) {
115     int ec = u[0];
116     ++counts[ec];
117     return ec;
118   }
119   auto search = index.ecmapinv.find(u);
120   if (search != index.ecmapinv.end()) {
121     // ec class already exists, update count
122     ++counts[search->second];
123     return search->second;
124   } else {
125     // new ec class, update the index and count
126     auto necs = counts.size();
127     //index.ecmap.insert({necs,u});
128     index.ecmap.push_back(u);
129     index.ecmapinv.insert({u,necs});
130     counts.push_back(1);
131     return necs;
132   }
133   */
134 }
135 
decreaseCount(const int ec)136 int MinCollector::decreaseCount(const int ec) {
137   assert(ec >= 0 && ec <= index.ecmap.size());
138   --counts[ec];
139   return ec;
140 }
141 
142 struct ComparePairsBySecond {
operator ()ComparePairsBySecond143   bool operator()(std::pair<KmerEntry,int> a, std::pair<KmerEntry,int> b) {
144     return a.second < b.second;
145   }
146 };
147 
intersectECs(std::vector<std::pair<KmerEntry,int>> & v) const148 std::vector<int> MinCollector::intersectECs(std::vector<std::pair<KmerEntry,int>>& v) const {
149   if (v.empty()) {
150     return {};
151   }
152   sort(v.begin(), v.end(), [&](std::pair<KmerEntry, int> a, std::pair<KmerEntry, int> b)
153        {
154          if (a.first.contig==b.first.contig) {
155            return a.second < b.second;
156          } else {
157            return a.first.contig < b.first.contig;
158          }
159        }); // sort by contig, and then first position
160 
161 
162   int ec = index.dbGraph.ecs[v[0].first.contig];
163   int lastEC = ec;
164   std::vector<int> u = index.ecmap[ec];
165 
166   for (int i = 1; i < v.size(); i++) {
167     if (v[i].first.contig != v[i-1].first.contig) {
168       ec = index.dbGraph.ecs[v[i].first.contig];
169       if (ec != lastEC) {
170         u = index.intersect(ec, u);
171         lastEC = ec;
172         if (u.empty()) {
173           return u;
174         }
175       }
176     }
177   }
178 
179   /*for (auto &x : vp) {
180     //tmp = index.intersect(x.first,u);
181     u = index.intersect(x.first,u);
182     //if (!tmp.empty()) {
183      // u = tmp;
184       //count++; // increase the count
185      // }
186   }*/
187 
188   // if u is empty do nothing
189   /*if (u.empty()) {
190     return u;
191     }*/
192 
193   // find the range of support
194   int minpos = std::numeric_limits<int>::max();
195   int maxpos = 0;
196 
197   for (auto& x : v) {
198     minpos = std::min(minpos, x.second);
199     maxpos = std::max(maxpos, x.second);
200   }
201 
202   if ((maxpos-minpos + k) < min_range) {
203     return {};
204   }
205 
206   return u;
207 }
208 
209 
loadCounts(ProgramOptions & opt)210 void MinCollector::loadCounts(ProgramOptions& opt) {
211   int num_ecs = counts.size();
212   counts.clear();
213   std::ifstream in((opt.output + "/counts.txt"));
214   int i = 0;
215   if (in.is_open()) {
216     std::string line;
217     while (getline(in, line)) {
218       std::stringstream ss(line);
219       int j,c;
220       ss >> j;
221       ss >> c;
222       if (j != i) {
223         std::cerr << "Error: equivalence class does not match index. Found "
224                   << j << ", expected " << i << std::endl;
225         exit(1);
226       }
227       counts.push_back(c);
228       i++;
229     }
230 
231     if (i != num_ecs) {
232       std::cerr << "Error: number of equivalence classes does not match index. Found "
233                 << i << ", expected " << num_ecs << std::endl;
234       exit(1);
235     }
236   } else {
237     std::cerr << "Error: could not open file " << opt.output << "/counts.txt" << std::endl;
238     exit(1);
239 
240   }
241 
242 }
243 
write(const std::string & pseudoprefix) const244 void MinCollector::write(const std::string& pseudoprefix) const {
245   std::string ecfilename = pseudoprefix + ".ec";
246   std::string countsfilename = pseudoprefix + ".tsv";
247 
248   std::ofstream ecof, countsof;
249   ecof.open(ecfilename.c_str(), std::ios::out);
250   // output equivalence classes in the form "EC TXLIST";
251   for (int i = 0; i < index.ecmap.size(); i++) {
252     ecof << i << "\t";
253     // output the rest of the class
254     const auto &v = index.ecmap[i];
255     bool first = true;
256     for (auto x : v) {
257       if (!first) {
258         ecof << ",";
259       } else {
260         first = false;
261       }
262       ecof << x;
263     }
264     ecof << "\n";
265   }
266   ecof.close();
267 
268   countsof.open(countsfilename.c_str(), std::ios::out);
269   for (int i = 0; i < counts.size(); i++) {
270     countsof << i << "\t" << counts[i] << "\n";
271   }
272   countsof.close();
273 }
274 
get_mean_frag_len(bool lenient) const275 double MinCollector::get_mean_frag_len(bool lenient) const {
276   if (has_mean_fl) {
277     return mean_fl;
278   }
279 
280   auto total_counts = 0;
281   double total_mass = 0.0;
282 
283   for ( size_t i = 0 ; i < flens.size(); ++i ) {
284     total_counts += flens[i];
285     total_mass += static_cast<double>(flens[i] * i);
286   }
287 
288   if (total_counts == 0) {
289     if (!lenient) {
290       std::cerr << "Error: could not determine mean fragment length from paired end reads, no pairs mapped to a unique transcript." << std::endl
291               << "       Run kallisto quant again with a pre-specified fragment length (option -l)." << std::endl;
292       exit(1);
293     } else {
294       return std::numeric_limits<double>::max();
295     }
296 
297   }
298 
299   // cache the value
300   const_cast<double&>(mean_fl) = total_mass / static_cast<double>(total_counts);
301   const_cast<bool&>(has_mean_fl) = true;
302   return mean_fl;
303 }
304 
get_sd_frag_len() const305 double MinCollector::get_sd_frag_len() const {
306   double tmp = get_mean_frag_len(true);
307   double m = mean_fl;
308 
309   size_t total_counts = 0;
310   double total_mass = 0.0;
311 
312   for (size_t i = 0; i < flens.size(); ++i) {
313     total_counts += flens[i];
314     total_mass += flens[i]*(i-m)*(i-m);
315   }
316 
317   double sd_fl = std::sqrt(total_mass/total_counts);
318   return sd_fl;
319 }
320 
321 
322 
compute_mean_frag_lens_trunc(bool verbose)323 void MinCollector::compute_mean_frag_lens_trunc(bool verbose)  {
324 
325   std::vector<int> counts(MAX_FRAG_LEN, 0);
326   std::vector<double> mass(MAX_FRAG_LEN, 0.0);
327 
328   counts[0] = flens[0];
329 
330   for (size_t i = 1; i < MAX_FRAG_LEN; ++i) {
331     // mass and counts keep track of the mass/counts up to and including index i
332     mass[i] = static_cast<double>( flens[i] * i) + mass[i-1];
333     counts[i] = flens[i] + counts[i-1];
334     if (counts[i] > 0) {
335       mean_fl_trunc[i] = mass[i] / static_cast<double>(counts[i]);
336     }
337     // std::cerr << "--- " << i << '\t' << mean_fl_trunc[i] << std::endl;
338   }
339 
340   has_mean_fl_trunc = true;
341 
342   if (verbose) {
343     std::cerr << "[quant] estimated average fragment length: " <<
344       mean_fl_trunc[MAX_FRAG_LEN - 1] << std::endl;
345   }
346 }
347 
hexamerToInt(const char * s,bool revcomp)348 int hexamerToInt(const char *s, bool revcomp) {
349   int hex = 0;
350   if (!revcomp) {
351     for (int i = 0; i < 6; i++) {
352       hex <<= 2;
353       switch (*(s+i) & 0xDF) {
354       case 'A': break;
355       case 'C': hex += 1; break;
356       case 'G': hex += 2; break;
357       case 'T': hex += 3; break;
358       default: return -1;
359       }
360     }
361   } else {
362     for (int i = 0; i < 6; i++) {
363       switch (*(s+i) & 0xDF) {
364       case 'A': hex += 3 << (2*i);break;
365       case 'C': hex += 2 << (2*i); break;
366       case 'G': hex += 1 << (2*i); break;
367       case 'T': break;
368       default: return -1;
369       }
370     }
371   }
372   return hex;
373 }
374 
countBias(const char * s1,const char * s2,const std::vector<std::pair<KmerEntry,int>> v1,const std::vector<std::pair<KmerEntry,int>> v2,bool paired)375 bool MinCollector::countBias(const char *s1, const char *s2, const std::vector<std::pair<KmerEntry,int>> v1, const std::vector<std::pair<KmerEntry,int>> v2, bool paired) {
376   return countBias(s1,s2,v1,v2,paired,bias5);
377 }
378 
countBias(const char * s1,const char * s2,const std::vector<std::pair<KmerEntry,int>> v1,const std::vector<std::pair<KmerEntry,int>> v2,bool paired,std::vector<int> & biasOut) const379 bool MinCollector::countBias(const char *s1, const char *s2, const std::vector<std::pair<KmerEntry,int>> v1, const std::vector<std::pair<KmerEntry,int>> v2, bool paired, std::vector<int>& biasOut) const {
380 
381   const int pre = 2, post = 4;
382 
383   if (v1.empty() || (paired && v2.empty())) {
384     return false;
385   }
386 
387 
388 
389   auto getPreSeq = [&](const char *s, Kmer km, bool fw, bool csense,  KmerEntry val, int p) -> int {
390     if (s==0) {
391       return -1;
392     }
393     if ((csense && val.getPos() - p >= pre) || (!csense && (val.contig_length - 1 - val.getPos() - p) >= pre )) {
394       const Contig &c = index.dbGraph.contigs[val.contig];
395       bool sense = c.transcripts[0].sense;
396 
397       int hex = -1;
398       //std::cout << "  " << s << "\n";
399       if (csense) {
400         hex = hexamerToInt(c.seq.c_str() + (val.getPos()-p - pre), true);
401         //std::cout << c.seq.substr(val.getPos()- p - pre,6) << "\n";
402       } else {
403         int pos = (val.getPos() + p) + k - post;
404         hex = hexamerToInt(c.seq.c_str() + (pos), false);
405         //std::cout << revcomp(c.seq.substr(pos,6)) << "\n";
406       }
407       return hex;
408     }
409 
410     return -1;
411   };
412 
413   // find first contig of read
414   KmerEntry val1 = v1[0].first;
415   int p1 = v1[0].second;
416   for (auto &x : v1) {
417     if (x.second < p1) {
418       val1 = x.first;
419       p1 = x.second;
420     }
421   }
422 
423   Kmer km1 = Kmer((s1+p1));
424   bool fw1 = (km1==km1.rep());
425   bool csense1 = (fw1 == val1.isFw()); // is this in the direction of the contig?
426 
427   int hex5 = getPreSeq(s1, km1, fw1, csense1, val1, p1);
428 
429   /*
430   int hex3 = -1;
431   if (paired) {
432     // do the same for the second read
433     KmerEntry val2 = v2[0].first;
434     int p2 = v2[0].second;
435     for (auto &x : v2) {
436       if (x.second < p2) {
437         val2 = x.first;
438         p2 = x.second;
439       }
440     }
441 
442     Kmer km2 = Kmer((s2+p2));
443     bool fw2 = (km2==km2.rep());
444     bool csense2 = (fw2 == val2.isFw()); // is this in the direction of the contig?
445 
446     hex3 = getPreSeq(s2, km2, fw2, csense2, val2, p2);
447   }
448   */
449 
450   if (hex5 >=0) { // && (!paired || hex3 >= 0)) {
451     biasOut[hex5]++;
452     //bias3[hex3]++;
453   } else {
454     return false;
455   }
456 
457   return false;
458 }
459