1 #include "MinCollector.h"
2 #include <algorithm>
3
4 // utility functions
5
intersect(const std::vector<int> & x,const std::vector<int> & y)6 std::vector<int> intersect(const std::vector<int>& x, const std::vector<int>& y) {
7 std::vector<int> v;
8 auto a = x.begin();
9 auto b = y.begin();
10 while (a != x.end() && b != y.end()) {
11 if (*a < *b) {
12 ++a;
13 } else if (*b < *a) {
14 ++b;
15 } else {
16 v.push_back(*a);
17 ++a;
18 ++b;
19 }
20 }
21 return v;
22 }
23
init_mean_fl_trunc(double mean,double sd)24 void MinCollector::init_mean_fl_trunc(double mean, double sd) {
25 auto tmp_trunc_fl = trunc_gaussian_fld(0, MAX_FRAG_LEN, mean, sd);
26 assert( tmp_trunc_fl.size() == mean_fl_trunc.size() );
27
28 std::copy( tmp_trunc_fl.begin(), tmp_trunc_fl.end(), mean_fl_trunc.begin() );
29
30 mean_fl = mean_fl_trunc[ MAX_FRAG_LEN - 1 ];
31
32 has_mean_fl = true;
33 has_mean_fl_trunc = true;
34 }
35
intersectKmers(std::vector<std::pair<KmerEntry,int>> & v1,std::vector<std::pair<KmerEntry,int>> & v2,bool nonpaired,std::vector<int> & u) const36 int MinCollector::intersectKmers(std::vector<std::pair<KmerEntry,int>>& v1,
37 std::vector<std::pair<KmerEntry,int>>& v2, bool nonpaired, std::vector<int> &u) const {
38 std::vector<int> u1 = intersectECs(v1);
39 std::vector<int> u2 = intersectECs(v2);
40
41 if (u1.empty() && u2.empty()) {
42 return -1;
43 }
44
45 // non-strict intersection.
46 if (u1.empty()) {
47 if (v1.empty()) {
48 u = u2;
49 } else {
50 return -1;
51 }
52 } else if (u2.empty()) {
53 if (v2.empty()) {
54 u = u1;
55 } else {
56 return -1;
57 }
58 } else {
59 u = intersect(u1,u2);
60 }
61
62 if (u.empty()) {
63 return -1;
64 }
65 return 1;
66 }
67
collect(std::vector<std::pair<KmerEntry,int>> & v1,std::vector<std::pair<KmerEntry,int>> & v2,bool nonpaired)68 int MinCollector::collect(std::vector<std::pair<KmerEntry,int>>& v1,
69 std::vector<std::pair<KmerEntry,int>>& v2, bool nonpaired) {
70 std::vector<int> u;
71 int r = intersectKmers(v1, v2, nonpaired, u);
72 if (r != -1) {
73 return increaseCount(u);
74 } else {
75 return -1;
76 }
77 }
78
findEC(const std::vector<int> & u) const79 int MinCollector::findEC(const std::vector<int>& u) const {
80 if (u.empty()) {
81 return -1;
82 }
83 if (u.size() == 1) {
84 return u[0];
85 }
86 auto search = index.ecmapinv.find(u);
87 if (search != index.ecmapinv.end()) {
88 return search ->second;
89 } else {
90 return -1;
91 }
92 }
93
increaseCount(const std::vector<int> & u)94 int MinCollector::increaseCount(const std::vector<int>& u) {
95 int ec = findEC(u);
96
97 if (u.empty()) {
98 return -1;
99 } else {
100 if (ec != -1) {
101 ++counts[ec];
102 return ec;
103 } else {
104 auto necs = counts.size();
105 //index.ecmap.insert({necs,u});
106 index.ecmap.push_back(u);
107 index.ecmapinv.insert({u,necs});
108 counts.push_back(1);
109 return necs;
110 }
111 }
112
113 /* -- removable
114 if (u.size() == 1) {
115 int ec = u[0];
116 ++counts[ec];
117 return ec;
118 }
119 auto search = index.ecmapinv.find(u);
120 if (search != index.ecmapinv.end()) {
121 // ec class already exists, update count
122 ++counts[search->second];
123 return search->second;
124 } else {
125 // new ec class, update the index and count
126 auto necs = counts.size();
127 //index.ecmap.insert({necs,u});
128 index.ecmap.push_back(u);
129 index.ecmapinv.insert({u,necs});
130 counts.push_back(1);
131 return necs;
132 }
133 */
134 }
135
decreaseCount(const int ec)136 int MinCollector::decreaseCount(const int ec) {
137 assert(ec >= 0 && ec <= index.ecmap.size());
138 --counts[ec];
139 return ec;
140 }
141
142 struct ComparePairsBySecond {
operator ()ComparePairsBySecond143 bool operator()(std::pair<KmerEntry,int> a, std::pair<KmerEntry,int> b) {
144 return a.second < b.second;
145 }
146 };
147
intersectECs(std::vector<std::pair<KmerEntry,int>> & v) const148 std::vector<int> MinCollector::intersectECs(std::vector<std::pair<KmerEntry,int>>& v) const {
149 if (v.empty()) {
150 return {};
151 }
152 sort(v.begin(), v.end(), [&](std::pair<KmerEntry, int> a, std::pair<KmerEntry, int> b)
153 {
154 if (a.first.contig==b.first.contig) {
155 return a.second < b.second;
156 } else {
157 return a.first.contig < b.first.contig;
158 }
159 }); // sort by contig, and then first position
160
161
162 int ec = index.dbGraph.ecs[v[0].first.contig];
163 int lastEC = ec;
164 std::vector<int> u = index.ecmap[ec];
165
166 for (int i = 1; i < v.size(); i++) {
167 if (v[i].first.contig != v[i-1].first.contig) {
168 ec = index.dbGraph.ecs[v[i].first.contig];
169 if (ec != lastEC) {
170 u = index.intersect(ec, u);
171 lastEC = ec;
172 if (u.empty()) {
173 return u;
174 }
175 }
176 }
177 }
178
179 /*for (auto &x : vp) {
180 //tmp = index.intersect(x.first,u);
181 u = index.intersect(x.first,u);
182 //if (!tmp.empty()) {
183 // u = tmp;
184 //count++; // increase the count
185 // }
186 }*/
187
188 // if u is empty do nothing
189 /*if (u.empty()) {
190 return u;
191 }*/
192
193 // find the range of support
194 int minpos = std::numeric_limits<int>::max();
195 int maxpos = 0;
196
197 for (auto& x : v) {
198 minpos = std::min(minpos, x.second);
199 maxpos = std::max(maxpos, x.second);
200 }
201
202 if ((maxpos-minpos + k) < min_range) {
203 return {};
204 }
205
206 return u;
207 }
208
209
loadCounts(ProgramOptions & opt)210 void MinCollector::loadCounts(ProgramOptions& opt) {
211 int num_ecs = counts.size();
212 counts.clear();
213 std::ifstream in((opt.output + "/counts.txt"));
214 int i = 0;
215 if (in.is_open()) {
216 std::string line;
217 while (getline(in, line)) {
218 std::stringstream ss(line);
219 int j,c;
220 ss >> j;
221 ss >> c;
222 if (j != i) {
223 std::cerr << "Error: equivalence class does not match index. Found "
224 << j << ", expected " << i << std::endl;
225 exit(1);
226 }
227 counts.push_back(c);
228 i++;
229 }
230
231 if (i != num_ecs) {
232 std::cerr << "Error: number of equivalence classes does not match index. Found "
233 << i << ", expected " << num_ecs << std::endl;
234 exit(1);
235 }
236 } else {
237 std::cerr << "Error: could not open file " << opt.output << "/counts.txt" << std::endl;
238 exit(1);
239
240 }
241
242 }
243
write(const std::string & pseudoprefix) const244 void MinCollector::write(const std::string& pseudoprefix) const {
245 std::string ecfilename = pseudoprefix + ".ec";
246 std::string countsfilename = pseudoprefix + ".tsv";
247
248 std::ofstream ecof, countsof;
249 ecof.open(ecfilename.c_str(), std::ios::out);
250 // output equivalence classes in the form "EC TXLIST";
251 for (int i = 0; i < index.ecmap.size(); i++) {
252 ecof << i << "\t";
253 // output the rest of the class
254 const auto &v = index.ecmap[i];
255 bool first = true;
256 for (auto x : v) {
257 if (!first) {
258 ecof << ",";
259 } else {
260 first = false;
261 }
262 ecof << x;
263 }
264 ecof << "\n";
265 }
266 ecof.close();
267
268 countsof.open(countsfilename.c_str(), std::ios::out);
269 for (int i = 0; i < counts.size(); i++) {
270 countsof << i << "\t" << counts[i] << "\n";
271 }
272 countsof.close();
273 }
274
get_mean_frag_len(bool lenient) const275 double MinCollector::get_mean_frag_len(bool lenient) const {
276 if (has_mean_fl) {
277 return mean_fl;
278 }
279
280 auto total_counts = 0;
281 double total_mass = 0.0;
282
283 for ( size_t i = 0 ; i < flens.size(); ++i ) {
284 total_counts += flens[i];
285 total_mass += static_cast<double>(flens[i] * i);
286 }
287
288 if (total_counts == 0) {
289 if (!lenient) {
290 std::cerr << "Error: could not determine mean fragment length from paired end reads, no pairs mapped to a unique transcript." << std::endl
291 << " Run kallisto quant again with a pre-specified fragment length (option -l)." << std::endl;
292 exit(1);
293 } else {
294 return std::numeric_limits<double>::max();
295 }
296
297 }
298
299 // cache the value
300 const_cast<double&>(mean_fl) = total_mass / static_cast<double>(total_counts);
301 const_cast<bool&>(has_mean_fl) = true;
302 return mean_fl;
303 }
304
get_sd_frag_len() const305 double MinCollector::get_sd_frag_len() const {
306 double tmp = get_mean_frag_len(true);
307 double m = mean_fl;
308
309 size_t total_counts = 0;
310 double total_mass = 0.0;
311
312 for (size_t i = 0; i < flens.size(); ++i) {
313 total_counts += flens[i];
314 total_mass += flens[i]*(i-m)*(i-m);
315 }
316
317 double sd_fl = std::sqrt(total_mass/total_counts);
318 return sd_fl;
319 }
320
321
322
compute_mean_frag_lens_trunc(bool verbose)323 void MinCollector::compute_mean_frag_lens_trunc(bool verbose) {
324
325 std::vector<int> counts(MAX_FRAG_LEN, 0);
326 std::vector<double> mass(MAX_FRAG_LEN, 0.0);
327
328 counts[0] = flens[0];
329
330 for (size_t i = 1; i < MAX_FRAG_LEN; ++i) {
331 // mass and counts keep track of the mass/counts up to and including index i
332 mass[i] = static_cast<double>( flens[i] * i) + mass[i-1];
333 counts[i] = flens[i] + counts[i-1];
334 if (counts[i] > 0) {
335 mean_fl_trunc[i] = mass[i] / static_cast<double>(counts[i]);
336 }
337 // std::cerr << "--- " << i << '\t' << mean_fl_trunc[i] << std::endl;
338 }
339
340 has_mean_fl_trunc = true;
341
342 if (verbose) {
343 std::cerr << "[quant] estimated average fragment length: " <<
344 mean_fl_trunc[MAX_FRAG_LEN - 1] << std::endl;
345 }
346 }
347
hexamerToInt(const char * s,bool revcomp)348 int hexamerToInt(const char *s, bool revcomp) {
349 int hex = 0;
350 if (!revcomp) {
351 for (int i = 0; i < 6; i++) {
352 hex <<= 2;
353 switch (*(s+i) & 0xDF) {
354 case 'A': break;
355 case 'C': hex += 1; break;
356 case 'G': hex += 2; break;
357 case 'T': hex += 3; break;
358 default: return -1;
359 }
360 }
361 } else {
362 for (int i = 0; i < 6; i++) {
363 switch (*(s+i) & 0xDF) {
364 case 'A': hex += 3 << (2*i);break;
365 case 'C': hex += 2 << (2*i); break;
366 case 'G': hex += 1 << (2*i); break;
367 case 'T': break;
368 default: return -1;
369 }
370 }
371 }
372 return hex;
373 }
374
countBias(const char * s1,const char * s2,const std::vector<std::pair<KmerEntry,int>> v1,const std::vector<std::pair<KmerEntry,int>> v2,bool paired)375 bool MinCollector::countBias(const char *s1, const char *s2, const std::vector<std::pair<KmerEntry,int>> v1, const std::vector<std::pair<KmerEntry,int>> v2, bool paired) {
376 return countBias(s1,s2,v1,v2,paired,bias5);
377 }
378
countBias(const char * s1,const char * s2,const std::vector<std::pair<KmerEntry,int>> v1,const std::vector<std::pair<KmerEntry,int>> v2,bool paired,std::vector<int> & biasOut) const379 bool MinCollector::countBias(const char *s1, const char *s2, const std::vector<std::pair<KmerEntry,int>> v1, const std::vector<std::pair<KmerEntry,int>> v2, bool paired, std::vector<int>& biasOut) const {
380
381 const int pre = 2, post = 4;
382
383 if (v1.empty() || (paired && v2.empty())) {
384 return false;
385 }
386
387
388
389 auto getPreSeq = [&](const char *s, Kmer km, bool fw, bool csense, KmerEntry val, int p) -> int {
390 if (s==0) {
391 return -1;
392 }
393 if ((csense && val.getPos() - p >= pre) || (!csense && (val.contig_length - 1 - val.getPos() - p) >= pre )) {
394 const Contig &c = index.dbGraph.contigs[val.contig];
395 bool sense = c.transcripts[0].sense;
396
397 int hex = -1;
398 //std::cout << " " << s << "\n";
399 if (csense) {
400 hex = hexamerToInt(c.seq.c_str() + (val.getPos()-p - pre), true);
401 //std::cout << c.seq.substr(val.getPos()- p - pre,6) << "\n";
402 } else {
403 int pos = (val.getPos() + p) + k - post;
404 hex = hexamerToInt(c.seq.c_str() + (pos), false);
405 //std::cout << revcomp(c.seq.substr(pos,6)) << "\n";
406 }
407 return hex;
408 }
409
410 return -1;
411 };
412
413 // find first contig of read
414 KmerEntry val1 = v1[0].first;
415 int p1 = v1[0].second;
416 for (auto &x : v1) {
417 if (x.second < p1) {
418 val1 = x.first;
419 p1 = x.second;
420 }
421 }
422
423 Kmer km1 = Kmer((s1+p1));
424 bool fw1 = (km1==km1.rep());
425 bool csense1 = (fw1 == val1.isFw()); // is this in the direction of the contig?
426
427 int hex5 = getPreSeq(s1, km1, fw1, csense1, val1, p1);
428
429 /*
430 int hex3 = -1;
431 if (paired) {
432 // do the same for the second read
433 KmerEntry val2 = v2[0].first;
434 int p2 = v2[0].second;
435 for (auto &x : v2) {
436 if (x.second < p2) {
437 val2 = x.first;
438 p2 = x.second;
439 }
440 }
441
442 Kmer km2 = Kmer((s2+p2));
443 bool fw2 = (km2==km2.rep());
444 bool csense2 = (fw2 == val2.isFw()); // is this in the direction of the contig?
445
446 hex3 = getPreSeq(s2, km2, fw2, csense2, val2, p2);
447 }
448 */
449
450 if (hex5 >=0) { // && (!paired || hex3 >= 0)) {
451 biasOut[hex5]++;
452 //bias3[hex3]++;
453 } else {
454 return false;
455 }
456
457 return false;
458 }
459