1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2010-2018, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 #ifndef __USTACKS_H__
22 #define __USTACKS_H__
23 
24 #include "constants.h"
25 
26 #ifdef _OPENMP
27 #include <omp.h>    // OpenMP library
28 #endif
29 
30 #include <getopt.h> // Process command-line options
31 #include <cmath>
32 #include <cstdlib>
33 #include <cstring>
34 #include <algorithm>
35 #include <utility>
36 
37 #include <string>
38 
39 #include <iostream>
40 #include <fstream>
41 #include <sstream>
42 #include <iomanip> // std::setprecision
43 
44 #include <vector>
45 #include <map>
46 #include <unordered_map>
47 #include <queue>
48 using std::queue;
49 #include <set>
50 #include <unistd.h>
51 
52 #include "config.h"
53 #include "kmers.h"
54 #include "utils.h"
55 #include "DNASeq.h"     // Class for storing two-bit compressed DNA sequences
56 #include "stacks.h"     // Major data structures for holding stacks
57 #include "mstack.h"
58 #include "mst.h"        // Minimum spanning tree implementation
59 #include "models.h"     // Contains maximum likelihood statistical models.
60 #include "FastaI.h"     // Reading input files in FASTA format
61 #include "FastqI.h"     // Reading input files in FASTQ format
62 #include "gzFasta.h"    // Reading gzipped input files in FASTA format
63 #include "gzFastq.h"    // Reading gzipped input files in FASTQ format
64 #include "aln_utils.h"
65 #include "GappedAln.h"
66 
67 class HVal {
68  public:
69     vector<int> ids;
70 
count()71     int count() {
72         return this->ids.size();
73     }
add_id(int id)74     int add_id(int id) {
75         this->ids.push_back(id);
76         return 0;
77     }
78 };
79 
80 typedef unordered_map<DNANSeq, HVal> DNASeqHashMap;
81 
82 void   help( void );
83 void   version( void );
84 int    parse_command_line(int, char**);
85 void   load_radtags(string, DNASeqHashMap &, size_t &);
86 int    load_seq_ids(vector<char *> &);
87 void   reduce_radtags(DNASeqHashMap &, map<int, Stack *> &, map<int, Rem *> &, size_t &, size_t &);
88 int    free_radtags_hash(DNASeqHashMap &, vector<DNANSeq *> &);
89 int    populate_merged_tags(map<int, Stack *> &, map<int, MergedStack *> &);
90 void   merge_stacks(map<int, MergedStack *> &, size_t &);
91 int    call_consensus(map<int, MergedStack *> &, map<int, Stack *> &, map<int, Rem *> &, bool);
92 int    call_alleles(MergedStack *, vector<DNANSeq *> &, vector<read_type> &);
93 int    update_consensus(MergedStack *, map<int, Stack *> &, map<int, Rem *> &);
94 size_t merge_remainders(map<int, MergedStack *> &, map<int, Stack *> &, map<int, Rem *> &);
95 int    search_for_gapped_remainders(map<int, MergedStack *> &, map<int, Stack *> &, map<int, Rem *> &);
96 size_t merge_gapped_remainders(map<int, MergedStack *> &, map<int, Stack *> &, map<int, Rem *> &);
97 int    write_results(map<int, MergedStack *> &, map<int, Stack *> &, map<int, Rem *> &);
98 
99 //
100 // Match MergedStacks using a k-mer hashing algorithm
101 //
102 int  calc_kmer_distance(map<int, MergedStack *> &, int);
103 int  search_for_gaps(map<int, MergedStack *> &);
104 int  merge_gapped_alns(map<int, Stack *> &, map<int, Rem *> &, map<int, MergedStack *> &);
105 int  edit_gapped_seqs(map<int, Stack *> &, map<int, Rem *> &, MergedStack *, vector<pair<char, uint> > &);
106 int  edit_gaps(vector<pair<char, uint> > &, char *);
107 int  dist(MergedStack *, MergedStack *, vector<pair<char, uint> > &);
108 bool rank_alignments(Aln, Aln);
109 //
110 // Calculate depth of coverage statistics for stacks
111 //
112 void calc_coverage_distribution(map<int, MergedStack *> &, double &, double &, double &, double &);
113 
114 //
115 // Dealing with lumberjack (huge) stacks
116 //
117 size_t remove_repetitive_stacks(map<int, MergedStack *> &);
118 int  deleverage(map<int, MergedStack *> &, set<int> &, int, vector<MergedStack *> &);
119 
120 //
121 // Debugging
122 //
123 int  dump_unique_tags(map<int, Stack *> &);
124 int  dump_merged_tags(map<int, MergedStack *> &);
125 int  dump_stack_graph(string, map<int, Stack *> &, map<int, MergedStack *> &, vector<int> &, map<int, map<int, double> > &, map<int, set<int> > &);
126 
127 //
128 // Utilities
129 //
130 MergedStack *merge_tags(MergedStack *, MergedStack *, int);
131 MergedStack *merge_tags(map<int, MergedStack *> &, set<int> &, int);
132 MergedStack *merge_tags(map<int, MergedStack *> &, int *, int, int);
133 long double factorial(int);
134 
135 #endif // __USTACKS_H__
136