1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2010-2015, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 #ifndef __UTILS_H__
22 #define __UTILS_H__
23 
24 #include <cstdlib>
25 #include <cerrno>
26 #include <climits>
27 #include <cmath>
28 #include <ctime>
29 #include <iostream>
30 #include <utility>
31 #include <string>
32 
33 #include <unistd.h>
34 #include <dirent.h>
35 #include <zlib.h>
36 
37 #include "constants.h"
38 #include "nucleotides.h"
39 
40 char   reverse(char);
41 char  *rev_comp(const char *);
42 void   rev_comp_inplace(char*);
43 string rev_comp(const string&);
44 void   reverse_string(char *);
45 int    is_integer(const char *);
46 double is_double(const char *);
47 int    tokenize_string(const char *, vector<string> &);
48 
49 double factorial(double);
50 double reduced_factorial(double, double);
51 
52 double log_factorial(double);
53 double reduced_log_factorial(double, double);
54 
55 inline
almost_equal(double x,double y)56 bool almost_equal(double x, double y) {
57     const double precision = 1.0e-9;
58     if (x == 0.0 && y == 0.0)
59         return true;
60     if (!std::isnormal(x) || !std::isnormal(y)) {
61         stringstream ss;
62         ss << "almost_equal: x=" << x << ", y=" << y;
63         throw std::domain_error(ss.str());
64     }
65     return std::abs(x-y) <= precision * std::abs(std::min(x,y));
66 }
67 
68 struct OnlineMeanVar {
69     // Computes the mean and variance in a numerically stable way.
70     // Uses the algorithm described in:
71     // B. P. Welford. (1962) Note on a Method for Calculating Corrected Sums of
72     // Squares and Products. Technometrics: 4(3), pp. 419-420.
73     // Chan TF, Golub GH, LeVeque RJ (1979), "Updating Formulae and a Pairwise
74     // Algorithm for Computing Sample Variances.", Technical Report STAN-CS-79-773,
75     // Dpt. Comp. Sci., Stanford University.
76     double n_;
77     double mean_;
78     double M2_;
79 public:
OnlineMeanVarOnlineMeanVar80     OnlineMeanVar() : n_(0.0), mean_(0.0), M2_(0.0) {}
81 
nOnlineMeanVar82     size_t n()     const {return std::round(n_);}
n_dblOnlineMeanVar83     double n_dbl() const {return n_;}
meanOnlineMeanVar84     double mean()  const {return mean_;}
var_pOnlineMeanVar85     double var_p() const {return n_ > 0 ? M2_/n_ : NAN;}
var_sOnlineMeanVar86     double var_s() const {return n_ > 1 ? M2_/(n_-1) : NAN;}
sd_pOnlineMeanVar87     double sd_p()  const {return std::sqrt(var_p());}
sd_sOnlineMeanVar88     double sd_s()  const {return std::sqrt(var_s());}
89 
incrementOnlineMeanVar90     void increment(double x) {
91         n_ += 1.0;
92         double delta1 = x - mean_;
93         mean_ += delta1 / n_;
94         double delta2 = x - mean_;
95         M2_ += delta1 * delta2;
96     }
97 
98     OnlineMeanVar& operator+=(const OnlineMeanVar& other) {
99         double delta = other.mean_ - mean_;
100         double n_tot = n_ + other.n_;
101         double w_other = other.n_ / n_tot;
102         mean_ += delta * w_other;
103         M2_ += other.M2_ + delta * delta * n_ * w_other;
104         n_ = n_tot;
105         return *this;
106     }
107 };
108 
109 //
110 // GtLiks: A class to store the likelihoods of SNP genotypes.
111 //
112 class GtLiks {
113     array<double,10> lnliks_; // {AA,AC,CC,AG,CG,GG,AT,CT,GT,TT} similar to VCF.
114 public:
GtLiks()115     GtLiks() : lnliks_{{1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}} {}
at(Nt2 n1,Nt2 n2)116     double at(Nt2 n1, Nt2 n2) const {return at(gt_index(n1,n2));}
has_lik(Nt2 n1,Nt2 n2)117     bool has_lik(Nt2 n1, Nt2 n2) const {return has_lik(gt_index(n1,n2));}
set(Nt2 n1,Nt2 n2,double lnl)118     void set(Nt2 n1, Nt2 n2, double lnl) {set(gt_index(n1, n2), lnl);}
119 
at(size_t gt)120     double at(size_t gt) const {assert(has_lik(gt)); return lnliks_[gt];}
has_lik(size_t gt)121     bool has_lik(size_t gt) const {return lnliks_[gt] != 1.0;}
set(size_t gt,double lnl)122     void set(size_t gt, double lnl) {assert(!std::isnan(lnl)); assert(lnl<=0.0); assert(!has_lik(gt)); lnliks_[gt] = lnl;}
123 
gt_index(Nt2 n1,Nt2 n2)124     static size_t gt_index(Nt2 n1, Nt2 n2) {
125         if(n1<n2)
126             return size_t(n1) + (size_t(n2)*(size_t(n2)+1)) / 2;
127         else
128             return size_t(n2) + (size_t(n1)*(size_t(n1)+1)) / 2;
129     }
130 
131     // For debugging.
132     friend ostream& operator<<(ostream& os, const GtLiks& liks);
133 };
134 
135 //
136 // Comparison functions for the STL sort routine
137 //
138 bool compare_ints(int, int);
139 bool compare_pair(pair<char, int>, pair<char, int>);
140 bool compare_pair_intint(pair<int, int>, pair<int, int>);
141 bool compare_pair_intdouble(pair<int, double>, pair<int, double>);
142 bool compare_pair_stringint(pair<string, int>, pair<string, int>);
143 bool compare_pair_haplotype(const pair<string, double>&, const pair<string, double>&);
144 bool compare_pair_haplotype_rev(const pair<string, double>&, const pair<string, double>&);
145 bool compare_str_len(const string&, const string&);
146 struct LessCStrs
147 {
operatorLessCStrs148    bool operator() (const char* str1, const char* str2) const {return strcmp(str1, str2) < 0 ;}
149 } ;
150 struct int_increasing {
operatorint_increasing151     bool operator() (const int& lhs, const int& rhs) const {
152         return lhs < rhs;
153     }
154 };
155 
156 struct int_decreasing {
operatorint_decreasing157     bool operator() (const int& lhs, const int& rhs) const {
158         return lhs > rhs;
159     }
160 };
161 
162 //
163 // Remove elements from a vector.
164 //
165 template<typename T, typename Predicate>
stacks_erase_if(vector<T> & v,Predicate p)166 void stacks_erase_if(vector<T>& v, Predicate p)
167 {
168     v.erase(std::remove_if(v.begin(), v.end(), p), v.end());
169 }
170 
171 //
172 // Join a range of elements into a stream.
173 //
174 template<typename IterableT, typename SepT>
join(IterableT elements,const SepT & sep,ostream & os)175 void join(IterableT elements, const SepT& sep, ostream& os) {
176     auto first = elements.begin();
177     if (first != elements.end()) {
178         os << *first;
179         ++first;
180         while (first != elements.end()) {
181             os << sep << *first;
182             ++first;
183         }
184     }
185 }
186 
187 //
188 // Routines to check that files are open.
189 //
190 inline
check_open(std::ifstream & fs,const string & path)191 void check_open (std::ifstream& fs, const string& path) {
192     if (!fs.is_open()) {
193         cerr << "Error: Failed to open '" << path << "' for reading.\n";
194         throw exception();
195     }
196     fs.exceptions(fs.exceptions() | ios::badbit);
197 }
198 inline
check_open(std::ofstream & fs,const string & path)199 void check_open (std::ofstream& fs, const string& path) {
200     if (!fs.is_open()) {
201         cerr << "Error: Failed to open '" << path << "' for writing.\n";
202         throw exception();
203     }
204     fs.exceptions(fs.exceptions() | ios::badbit);
205 }
206 inline
check_open(const gzFile fs,const string & path)207 void check_open (const gzFile fs, const string& path)
208     {if (fs == NULL) {cerr << "Error: Failed to gz-open file '" << path << "'.\n"; throw exception();}}
209 
210 //
211 // Check that a directory exists or try to create it.
212 //
213 void check_or_mk_dir(const string& path);
214 
215 //
216 // Chronometer.
217 //
218 class Timer {
219     double elapsed_;
220     double consumed_;
221     double start_;
222 
223 public:
Timer()224     Timer() : elapsed_(0.0), consumed_(0.0), start_(0.0) {restart();}
restart()225     void   restart() {start_=gettm(); consumed_+=2.0*(gettm()-start_);}
update()226     void   update()  {double now=gettm(); elapsed_+=now-start_; start_=now; consumed_+=2.0*(gettm()-now);}
elapsed()227     double elapsed() const {return elapsed_;}
consumed()228     double consumed() const {return consumed_;}
229 
230     Timer& operator+=(const Timer& other) {elapsed_+=other.elapsed_; consumed_+=other.consumed_; return *this;}
231 
232 private:
gettm()233     double gettm() {
234         #if HAVE_CLOCK_GETTIME && defined _POSIX_MONOTONIC_CLOCK && _POSIX_MONOTONIC_CLOCK >= 0
235         struct timespec ts;
236         if(clock_gettime(CLOCK_MONOTONIC, &ts) == 0)
237             return ts.tv_sec + ts.tv_nsec / 1.0e9;
238         else
239             return 0.0;
240         #else
241         return 0.0;
242         #endif
243     }
244 };
245 
246 //
247 // Class to read lines from a plain text or compressed file indifferently.
248 //
249 class VersatileLineReader {
250     string path_;
251     size_t line_number_;
252     bool is_gzipped_;
253 
254     ifstream ifs_;
255     string ifsbuffer_;
256 
257     gzFile gzfile_;
258     char* gzbuffer_;
259     size_t gzbuffer_size_;
260     size_t gzline_len_;
261     static const size_t gzbuffer_init_size = 65536;
262 
263 public:
264     VersatileLineReader(const string& path);
265     VersatileLineReader();
266     ~VersatileLineReader();
267 
268     int open(string &);
269 
270     //
271     // Reads one line from the file, removing the trailing '\n' (and '\r', if any).
272     // Returns false on EOF, or throws an exception if the file doesn't end with a newline.
273     // e.g.:
274     // const char* line; size_t len; while (file.getline(line, len)) {...}
275     //
276     bool getline(const char*& line, size_t& len);
277 
path()278     const string& path() const {return path_;}
line_number()279     size_t line_number() const {return line_number_;} // 1-based.
280 };
281 
282 class VersatileWriter {
283     const string path_;
284     bool is_gzipped_;
285     ofstream ofs_;
286     gzFile gzfile_;
287 
288     void gzputs_(const char* s);
289 
290 public:
291     VersatileWriter(const string& path);
~VersatileWriter()292     ~VersatileWriter() {gzclose(gzfile_);}
293 
path()294     const string& path() const {return path_;}
295     void close();
296 
297     friend VersatileWriter& operator<< (VersatileWriter& w, char c);
298     friend VersatileWriter& operator<< (VersatileWriter& w, const char* s);
299     friend VersatileWriter& operator<< (VersatileWriter& w, const string& s);
300     friend VersatileWriter& operator<< (VersatileWriter& w, int i);
301     friend VersatileWriter& operator<< (VersatileWriter& w, long i);
302     friend VersatileWriter& operator<< (VersatileWriter& w, size_t i);
303 };
304 
305 //
306 // Wrapper for directory parsing functions.
307 // e.g. for(DirIterator e (path); e; ++e) {...}
308 //
309 class DirIterator {
310     DIR* dir;
311     struct dirent* entry;
312 public:
DirIterator(const string & dir_path)313     DirIterator(const string& dir_path) : dir(NULL), entry(NULL) {
314         dir = opendir(dir_path.c_str());
315         if (dir == NULL) {
316             cerr << "Error: Unable to open directory '" << dir_path << "' for reading.\n";
317             throw exception();
318         }
319         entry = readdir(dir);
320     }
~DirIterator()321     ~DirIterator() {closedir(dir);}
322 
name()323     const char* name() const {return entry->d_name;}
324 
325     operator bool() const {return entry!=NULL;}
326     DirIterator& operator++() {entry = readdir(dir); return *this;}
327     dirent* operator*() {return entry;}
328 };
329 
330 // strip_read_number
331 // ----------
332 // Given a read name, removes the trailing /1, /2, _1 or _1.
333 inline
strip_read_number(string & read_name)334 void strip_read_number(string& read_name) {
335     if (read_name.size() >= 2) {
336         // Check for ".../1" or ".../2"
337         if ((*read_name.rbegin() == '1' || *read_name.rbegin() == '2')
338             && (*++read_name.rbegin() == '/' || *++read_name.rbegin() == '_')
339         ) {
340             // Remove the suffix & return.
341             read_name.resize(read_name.size()-2);
342             return;
343         }
344         // Check for "... 1:..." or "... 2:..."
345         const char *p, *q;
346         if ((p = q = strchr(read_name.c_str(), ' ')) != NULL) {
347             if ((*++q == '1' || *q == '2') && *++q == ':') {
348                 read_name.resize(p - read_name.c_str());
349                 return;
350             }
351         }
352     }
353     // Unexpected suffix.
354     cerr << "Error: Unrecognized paired-end read name format, at '" << read_name << "'.\n";
355     throw exception();
356 }
357 
358 inline
close()359 void VersatileWriter::close() {
360     if (is_gzipped_) {
361         if (gzclose(gzfile_) != Z_OK)
362             throw ios::failure("gzclose");
363         gzfile_ = NULL;
364     } else {
365         ofs_.close();
366     }
367 }
368 
369 inline
gzclose_throwing(gzFile f)370 void gzclose_throwing(gzFile f) {
371     if (gzclose(f) != Z_OK)
372         throw ios::failure("gzclose");
373 }
374 
375 inline
376 VersatileWriter& operator<< (VersatileWriter& w, char c) {
377     if (w.is_gzipped_) {
378         if (gzputc(w.gzfile_, c) == -1)
379             throw ios::failure("gzputc");
380     } else {
381         w.ofs_ << c;
382     }
383     return w;
384 }
385 
386 inline
387 VersatileWriter& operator<< (VersatileWriter& w, const char* s) {
388     if (w.is_gzipped_)
389         w.gzputs_(s);
390     else
391         w.ofs_ << s;
392     return w;
393 }
394 
395 inline
gzputs_(const char * s)396 void VersatileWriter::gzputs_(const char* s) {
397     if (gzputs(gzfile_, s) == -1)
398         throw ios::failure("gzputs");
399 }
400 
401 inline
gzputs_throwing(gzFile f,const char * s)402 void gzputs_throwing(gzFile f, const char* s) {
403     if (gzputs(f, s) == -1)
404         throw ios::failure("gzputs");
405 }
406 
407 inline
408 VersatileWriter& operator<< (VersatileWriter& w, const string& s) {
409     if (w.is_gzipped_) {
410         if (!s.empty() && gzwrite(w.gzfile_, s.c_str(), s.length()) <= 0)
411             throw ios::failure("gzwrite");
412     } else {
413         w.ofs_ << s;
414     }
415     return w;
416 }
417 
418 inline
419 VersatileWriter& operator<< (VersatileWriter& w, int i) {
420     if (w.is_gzipped_) {
421         char buf[16];
422         sprintf(buf, "%d", i);
423         w.gzputs_(buf);
424     } else {
425         w.ofs_ << i;
426     }
427     return w;
428 }
429 
430 inline
431 VersatileWriter& operator<< (VersatileWriter& w, long i) {
432     if (w.is_gzipped_) {
433         char buf[32];
434         sprintf(buf, "%ld", i);
435         w.gzputs_(buf);
436     } else {
437         w.ofs_ << i;
438     }
439     return w;
440 }
441 
442 inline
443 VersatileWriter& operator<< (VersatileWriter& w, size_t i) {
444     if (w.is_gzipped_) {
445         char buf[32];
446         sprintf(buf, "%zu", i);
447         w.gzputs_(buf);
448     } else {
449         w.ofs_ << i;
450     }
451     return w;
452 }
453 
454 #endif // __UTILS_H__
455