1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2010-2015, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 #ifndef __UTILS_H__
22 #define __UTILS_H__
23
24 #include <cstdlib>
25 #include <cerrno>
26 #include <climits>
27 #include <cmath>
28 #include <ctime>
29 #include <iostream>
30 #include <utility>
31 #include <string>
32
33 #include <unistd.h>
34 #include <dirent.h>
35 #include <zlib.h>
36
37 #include "constants.h"
38 #include "nucleotides.h"
39
40 char reverse(char);
41 char *rev_comp(const char *);
42 void rev_comp_inplace(char*);
43 string rev_comp(const string&);
44 void reverse_string(char *);
45 int is_integer(const char *);
46 double is_double(const char *);
47 int tokenize_string(const char *, vector<string> &);
48
49 double factorial(double);
50 double reduced_factorial(double, double);
51
52 double log_factorial(double);
53 double reduced_log_factorial(double, double);
54
55 inline
almost_equal(double x,double y)56 bool almost_equal(double x, double y) {
57 const double precision = 1.0e-9;
58 if (x == 0.0 && y == 0.0)
59 return true;
60 if (!std::isnormal(x) || !std::isnormal(y)) {
61 stringstream ss;
62 ss << "almost_equal: x=" << x << ", y=" << y;
63 throw std::domain_error(ss.str());
64 }
65 return std::abs(x-y) <= precision * std::abs(std::min(x,y));
66 }
67
68 struct OnlineMeanVar {
69 // Computes the mean and variance in a numerically stable way.
70 // Uses the algorithm described in:
71 // B. P. Welford. (1962) Note on a Method for Calculating Corrected Sums of
72 // Squares and Products. Technometrics: 4(3), pp. 419-420.
73 // Chan TF, Golub GH, LeVeque RJ (1979), "Updating Formulae and a Pairwise
74 // Algorithm for Computing Sample Variances.", Technical Report STAN-CS-79-773,
75 // Dpt. Comp. Sci., Stanford University.
76 double n_;
77 double mean_;
78 double M2_;
79 public:
OnlineMeanVarOnlineMeanVar80 OnlineMeanVar() : n_(0.0), mean_(0.0), M2_(0.0) {}
81
nOnlineMeanVar82 size_t n() const {return std::round(n_);}
n_dblOnlineMeanVar83 double n_dbl() const {return n_;}
meanOnlineMeanVar84 double mean() const {return mean_;}
var_pOnlineMeanVar85 double var_p() const {return n_ > 0 ? M2_/n_ : NAN;}
var_sOnlineMeanVar86 double var_s() const {return n_ > 1 ? M2_/(n_-1) : NAN;}
sd_pOnlineMeanVar87 double sd_p() const {return std::sqrt(var_p());}
sd_sOnlineMeanVar88 double sd_s() const {return std::sqrt(var_s());}
89
incrementOnlineMeanVar90 void increment(double x) {
91 n_ += 1.0;
92 double delta1 = x - mean_;
93 mean_ += delta1 / n_;
94 double delta2 = x - mean_;
95 M2_ += delta1 * delta2;
96 }
97
98 OnlineMeanVar& operator+=(const OnlineMeanVar& other) {
99 double delta = other.mean_ - mean_;
100 double n_tot = n_ + other.n_;
101 double w_other = other.n_ / n_tot;
102 mean_ += delta * w_other;
103 M2_ += other.M2_ + delta * delta * n_ * w_other;
104 n_ = n_tot;
105 return *this;
106 }
107 };
108
109 //
110 // GtLiks: A class to store the likelihoods of SNP genotypes.
111 //
112 class GtLiks {
113 array<double,10> lnliks_; // {AA,AC,CC,AG,CG,GG,AT,CT,GT,TT} similar to VCF.
114 public:
GtLiks()115 GtLiks() : lnliks_{{1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}} {}
at(Nt2 n1,Nt2 n2)116 double at(Nt2 n1, Nt2 n2) const {return at(gt_index(n1,n2));}
has_lik(Nt2 n1,Nt2 n2)117 bool has_lik(Nt2 n1, Nt2 n2) const {return has_lik(gt_index(n1,n2));}
set(Nt2 n1,Nt2 n2,double lnl)118 void set(Nt2 n1, Nt2 n2, double lnl) {set(gt_index(n1, n2), lnl);}
119
at(size_t gt)120 double at(size_t gt) const {assert(has_lik(gt)); return lnliks_[gt];}
has_lik(size_t gt)121 bool has_lik(size_t gt) const {return lnliks_[gt] != 1.0;}
set(size_t gt,double lnl)122 void set(size_t gt, double lnl) {assert(!std::isnan(lnl)); assert(lnl<=0.0); assert(!has_lik(gt)); lnliks_[gt] = lnl;}
123
gt_index(Nt2 n1,Nt2 n2)124 static size_t gt_index(Nt2 n1, Nt2 n2) {
125 if(n1<n2)
126 return size_t(n1) + (size_t(n2)*(size_t(n2)+1)) / 2;
127 else
128 return size_t(n2) + (size_t(n1)*(size_t(n1)+1)) / 2;
129 }
130
131 // For debugging.
132 friend ostream& operator<<(ostream& os, const GtLiks& liks);
133 };
134
135 //
136 // Comparison functions for the STL sort routine
137 //
138 bool compare_ints(int, int);
139 bool compare_pair(pair<char, int>, pair<char, int>);
140 bool compare_pair_intint(pair<int, int>, pair<int, int>);
141 bool compare_pair_intdouble(pair<int, double>, pair<int, double>);
142 bool compare_pair_stringint(pair<string, int>, pair<string, int>);
143 bool compare_pair_haplotype(const pair<string, double>&, const pair<string, double>&);
144 bool compare_pair_haplotype_rev(const pair<string, double>&, const pair<string, double>&);
145 bool compare_str_len(const string&, const string&);
146 struct LessCStrs
147 {
operatorLessCStrs148 bool operator() (const char* str1, const char* str2) const {return strcmp(str1, str2) < 0 ;}
149 } ;
150 struct int_increasing {
operatorint_increasing151 bool operator() (const int& lhs, const int& rhs) const {
152 return lhs < rhs;
153 }
154 };
155
156 struct int_decreasing {
operatorint_decreasing157 bool operator() (const int& lhs, const int& rhs) const {
158 return lhs > rhs;
159 }
160 };
161
162 //
163 // Remove elements from a vector.
164 //
165 template<typename T, typename Predicate>
stacks_erase_if(vector<T> & v,Predicate p)166 void stacks_erase_if(vector<T>& v, Predicate p)
167 {
168 v.erase(std::remove_if(v.begin(), v.end(), p), v.end());
169 }
170
171 //
172 // Join a range of elements into a stream.
173 //
174 template<typename IterableT, typename SepT>
join(IterableT elements,const SepT & sep,ostream & os)175 void join(IterableT elements, const SepT& sep, ostream& os) {
176 auto first = elements.begin();
177 if (first != elements.end()) {
178 os << *first;
179 ++first;
180 while (first != elements.end()) {
181 os << sep << *first;
182 ++first;
183 }
184 }
185 }
186
187 //
188 // Routines to check that files are open.
189 //
190 inline
check_open(std::ifstream & fs,const string & path)191 void check_open (std::ifstream& fs, const string& path) {
192 if (!fs.is_open()) {
193 cerr << "Error: Failed to open '" << path << "' for reading.\n";
194 throw exception();
195 }
196 fs.exceptions(fs.exceptions() | ios::badbit);
197 }
198 inline
check_open(std::ofstream & fs,const string & path)199 void check_open (std::ofstream& fs, const string& path) {
200 if (!fs.is_open()) {
201 cerr << "Error: Failed to open '" << path << "' for writing.\n";
202 throw exception();
203 }
204 fs.exceptions(fs.exceptions() | ios::badbit);
205 }
206 inline
check_open(const gzFile fs,const string & path)207 void check_open (const gzFile fs, const string& path)
208 {if (fs == NULL) {cerr << "Error: Failed to gz-open file '" << path << "'.\n"; throw exception();}}
209
210 //
211 // Check that a directory exists or try to create it.
212 //
213 void check_or_mk_dir(const string& path);
214
215 //
216 // Chronometer.
217 //
218 class Timer {
219 double elapsed_;
220 double consumed_;
221 double start_;
222
223 public:
Timer()224 Timer() : elapsed_(0.0), consumed_(0.0), start_(0.0) {restart();}
restart()225 void restart() {start_=gettm(); consumed_+=2.0*(gettm()-start_);}
update()226 void update() {double now=gettm(); elapsed_+=now-start_; start_=now; consumed_+=2.0*(gettm()-now);}
elapsed()227 double elapsed() const {return elapsed_;}
consumed()228 double consumed() const {return consumed_;}
229
230 Timer& operator+=(const Timer& other) {elapsed_+=other.elapsed_; consumed_+=other.consumed_; return *this;}
231
232 private:
gettm()233 double gettm() {
234 #if HAVE_CLOCK_GETTIME && defined _POSIX_MONOTONIC_CLOCK && _POSIX_MONOTONIC_CLOCK >= 0
235 struct timespec ts;
236 if(clock_gettime(CLOCK_MONOTONIC, &ts) == 0)
237 return ts.tv_sec + ts.tv_nsec / 1.0e9;
238 else
239 return 0.0;
240 #else
241 return 0.0;
242 #endif
243 }
244 };
245
246 //
247 // Class to read lines from a plain text or compressed file indifferently.
248 //
249 class VersatileLineReader {
250 string path_;
251 size_t line_number_;
252 bool is_gzipped_;
253
254 ifstream ifs_;
255 string ifsbuffer_;
256
257 gzFile gzfile_;
258 char* gzbuffer_;
259 size_t gzbuffer_size_;
260 size_t gzline_len_;
261 static const size_t gzbuffer_init_size = 65536;
262
263 public:
264 VersatileLineReader(const string& path);
265 VersatileLineReader();
266 ~VersatileLineReader();
267
268 int open(string &);
269
270 //
271 // Reads one line from the file, removing the trailing '\n' (and '\r', if any).
272 // Returns false on EOF, or throws an exception if the file doesn't end with a newline.
273 // e.g.:
274 // const char* line; size_t len; while (file.getline(line, len)) {...}
275 //
276 bool getline(const char*& line, size_t& len);
277
path()278 const string& path() const {return path_;}
line_number()279 size_t line_number() const {return line_number_;} // 1-based.
280 };
281
282 class VersatileWriter {
283 const string path_;
284 bool is_gzipped_;
285 ofstream ofs_;
286 gzFile gzfile_;
287
288 void gzputs_(const char* s);
289
290 public:
291 VersatileWriter(const string& path);
~VersatileWriter()292 ~VersatileWriter() {gzclose(gzfile_);}
293
path()294 const string& path() const {return path_;}
295 void close();
296
297 friend VersatileWriter& operator<< (VersatileWriter& w, char c);
298 friend VersatileWriter& operator<< (VersatileWriter& w, const char* s);
299 friend VersatileWriter& operator<< (VersatileWriter& w, const string& s);
300 friend VersatileWriter& operator<< (VersatileWriter& w, int i);
301 friend VersatileWriter& operator<< (VersatileWriter& w, long i);
302 friend VersatileWriter& operator<< (VersatileWriter& w, size_t i);
303 };
304
305 //
306 // Wrapper for directory parsing functions.
307 // e.g. for(DirIterator e (path); e; ++e) {...}
308 //
309 class DirIterator {
310 DIR* dir;
311 struct dirent* entry;
312 public:
DirIterator(const string & dir_path)313 DirIterator(const string& dir_path) : dir(NULL), entry(NULL) {
314 dir = opendir(dir_path.c_str());
315 if (dir == NULL) {
316 cerr << "Error: Unable to open directory '" << dir_path << "' for reading.\n";
317 throw exception();
318 }
319 entry = readdir(dir);
320 }
~DirIterator()321 ~DirIterator() {closedir(dir);}
322
name()323 const char* name() const {return entry->d_name;}
324
325 operator bool() const {return entry!=NULL;}
326 DirIterator& operator++() {entry = readdir(dir); return *this;}
327 dirent* operator*() {return entry;}
328 };
329
330 // strip_read_number
331 // ----------
332 // Given a read name, removes the trailing /1, /2, _1 or _1.
333 inline
strip_read_number(string & read_name)334 void strip_read_number(string& read_name) {
335 if (read_name.size() >= 2) {
336 // Check for ".../1" or ".../2"
337 if ((*read_name.rbegin() == '1' || *read_name.rbegin() == '2')
338 && (*++read_name.rbegin() == '/' || *++read_name.rbegin() == '_')
339 ) {
340 // Remove the suffix & return.
341 read_name.resize(read_name.size()-2);
342 return;
343 }
344 // Check for "... 1:..." or "... 2:..."
345 const char *p, *q;
346 if ((p = q = strchr(read_name.c_str(), ' ')) != NULL) {
347 if ((*++q == '1' || *q == '2') && *++q == ':') {
348 read_name.resize(p - read_name.c_str());
349 return;
350 }
351 }
352 }
353 // Unexpected suffix.
354 cerr << "Error: Unrecognized paired-end read name format, at '" << read_name << "'.\n";
355 throw exception();
356 }
357
358 inline
close()359 void VersatileWriter::close() {
360 if (is_gzipped_) {
361 if (gzclose(gzfile_) != Z_OK)
362 throw ios::failure("gzclose");
363 gzfile_ = NULL;
364 } else {
365 ofs_.close();
366 }
367 }
368
369 inline
gzclose_throwing(gzFile f)370 void gzclose_throwing(gzFile f) {
371 if (gzclose(f) != Z_OK)
372 throw ios::failure("gzclose");
373 }
374
375 inline
376 VersatileWriter& operator<< (VersatileWriter& w, char c) {
377 if (w.is_gzipped_) {
378 if (gzputc(w.gzfile_, c) == -1)
379 throw ios::failure("gzputc");
380 } else {
381 w.ofs_ << c;
382 }
383 return w;
384 }
385
386 inline
387 VersatileWriter& operator<< (VersatileWriter& w, const char* s) {
388 if (w.is_gzipped_)
389 w.gzputs_(s);
390 else
391 w.ofs_ << s;
392 return w;
393 }
394
395 inline
gzputs_(const char * s)396 void VersatileWriter::gzputs_(const char* s) {
397 if (gzputs(gzfile_, s) == -1)
398 throw ios::failure("gzputs");
399 }
400
401 inline
gzputs_throwing(gzFile f,const char * s)402 void gzputs_throwing(gzFile f, const char* s) {
403 if (gzputs(f, s) == -1)
404 throw ios::failure("gzputs");
405 }
406
407 inline
408 VersatileWriter& operator<< (VersatileWriter& w, const string& s) {
409 if (w.is_gzipped_) {
410 if (!s.empty() && gzwrite(w.gzfile_, s.c_str(), s.length()) <= 0)
411 throw ios::failure("gzwrite");
412 } else {
413 w.ofs_ << s;
414 }
415 return w;
416 }
417
418 inline
419 VersatileWriter& operator<< (VersatileWriter& w, int i) {
420 if (w.is_gzipped_) {
421 char buf[16];
422 sprintf(buf, "%d", i);
423 w.gzputs_(buf);
424 } else {
425 w.ofs_ << i;
426 }
427 return w;
428 }
429
430 inline
431 VersatileWriter& operator<< (VersatileWriter& w, long i) {
432 if (w.is_gzipped_) {
433 char buf[32];
434 sprintf(buf, "%ld", i);
435 w.gzputs_(buf);
436 } else {
437 w.ofs_ << i;
438 }
439 return w;
440 }
441
442 inline
443 VersatileWriter& operator<< (VersatileWriter& w, size_t i) {
444 if (w.is_gzipped_) {
445 char buf[32];
446 sprintf(buf, "%zu", i);
447 w.gzputs_(buf);
448 } else {
449 w.ofs_ << i;
450 }
451 return w;
452 }
453
454 #endif // __UTILS_H__
455