1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2014-2017, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 #ifndef __ORDERED_H__
22 #define __ORDERED_H__
23 
24 #include <iostream>
25 #include <fstream>
26 #include <vector>
27 #include <map>
28 #include <set>
29 
30 #include "MetaPopInfo.h"
31 #include "PopSum.h"
32 
33 extern bool verbose;
34 extern MetaPopInfo mpopi;
35 
36 enum loc_type {haplotype, snp};
37 
38 template<class StatT>
39 class Ordered {
40 public:
41     ofstream *log_fh;
42 
43     size_t incompatible_sites; // Sites containing more than two alleles.
44     size_t total_sites;        // Total number of sites in the data set.
45     size_t uniq_sites;         // Total number of sites in the genome.
46     size_t multiple_sites;     // Sites covered by more than one locus.
47 
Ordered()48     Ordered(): incompatible_sites(0), total_sites(0), uniq_sites(0), multiple_sites(0) {}
~Ordered()49     virtual ~Ordered() {}
50 
51     int init_sites(vector<const StatT *> &, map<uint, uint> &, const vector<LocBin *> &);
52     int init_sites(vector<const StatT *> &, map<uint, uint> &, const vector<LocBin *> &, uint);
53     int init_sites(vector<const StatT *> &, map<uint, uint> &, const vector<LocBin *> &, uint, uint);
54     int init_haplotypes(vector<const StatT *> &, map<uint, uint> &, const vector<LocBin *> &);
55 };
56 
57 template<class StatT>
58 int
init_sites(vector<const StatT * > & sites,map<uint,uint> & sites_key,const vector<LocBin * > & sorted_loci)59 Ordered<StatT>::init_sites(vector<const StatT *> &sites, map<uint, uint> &sites_key, const vector<LocBin *> &sorted_loci)
60 {
61     const CSLocus  *loc;
62     const LocTally *ltally;
63     vector<int>     bps;
64 
65     //
66     // We need to create an array to store all the SNPs for exporting. We must
67     // account for positions in the genome that are covered by more than one RAD tag.
68     //
69     for (uint pos = 0; pos < sorted_loci.size(); pos++) {
70         loc    = sorted_loci[pos]->cloc;
71         ltally = sorted_loci[pos]->s->meta_pop();
72 
73         for (uint k = 0; k < loc->len; k++) {
74             if (ltally->nucs[k].allele_cnt == 2)
75                 bps.push_back(ltally->nucs[k].bp);
76         }
77     }
78 
79     sort(bps.begin(), bps.end());
80     vector<int>::iterator new_end = std::unique(bps.begin(), bps.end());
81 
82     sites.resize(std::distance(bps.begin(), new_end), NULL);
83 
84     //
85     // Create a key describing where in the sites array to find each basepair coordinate.
86     //
87     int i = 0;
88     map<uint, uint>::iterator key_it = sites_key.begin();
89 
90     for (vector<int>::iterator it = bps.begin(); it != bps.end(); it++) {
91         key_it = sites_key.insert(key_it, pair<uint, uint>(*it, i));
92         i++;
93     }
94 
95     return 0;
96 }
97 
98 template<class StatT>
99 int
init_sites(vector<const StatT * > & sites,map<uint,uint> & sites_key,const vector<LocBin * > & sorted_loci,uint pop_id)100 Ordered<StatT>::init_sites(vector<const StatT *> &sites, map<uint, uint> &sites_key, const vector<LocBin *> &sorted_loci, uint pop_id)
101 {
102     const CSLocus *loc;
103     const LocSum  *lsum;
104     vector<int>    bps;
105 
106     //
107     // We need to create an array to store all the summary statistics for smoothing. We must
108     // account for positions in the genome that are covered by more than one RAD tag.
109     //
110     Timer T;
111     for (uint pos = 0; pos < sorted_loci.size(); pos++) {
112         loc  = sorted_loci[pos]->cloc;
113         lsum = sorted_loci[pos]->s->per_pop(pop_id);
114 
115         for (uint k = 0; k < loc->len; k++) {
116             if (lsum->nucs[k].num_indv > 0)
117                 bps.push_back(lsum->nucs[k].bp);
118         }
119     }
120 
121     sort(bps.begin(), bps.end());
122     vector<int>::iterator new_end = std::unique(bps.begin(), bps.end());
123 
124     sites.resize(std::distance(bps.begin(), new_end), NULL);
125 
126     //
127     // Create a key describing where in the sites array to find each basepair coordinate.
128     //
129     size_t i = 0;
130     map<uint, uint>::iterator key_it = sites_key.begin();
131 
132     for (vector<int>::iterator it = bps.begin(); it != new_end; it++) {
133         key_it = sites_key.insert(key_it, pair<uint, uint>(*it, i));
134         i++;
135     }
136 
137     return 0;
138 }
139 
140 template<class StatT>
141 int
init_sites(vector<const StatT * > & sites,map<uint,uint> & sites_key,const vector<LocBin * > & sorted_loci,uint pop_id_1,uint pop_id_2)142 Ordered<StatT>::init_sites(vector<const StatT *> &sites, map<uint, uint> &sites_key, const vector<LocBin *> &sorted_loci, uint pop_id_1, uint pop_id_2)
143 {
144     const CSLocus *loc;
145     const LocSum  *lsum_1, *lsum_2;
146     vector<int>    bps;
147 
148     //
149     // We need to create an array to store all the pair values for computing smoothed Fst. We must
150     // account for positions in the genome that are covered by more than one RAD tag.
151     //
152     for (uint pos = 0; pos < sorted_loci.size(); pos++) {
153         loc    = sorted_loci[pos]->cloc;
154         lsum_1 = sorted_loci[pos]->s->per_pop(pop_id_1);
155         lsum_2 = sorted_loci[pos]->s->per_pop(pop_id_2);
156 
157         for (uint k = 0; k < loc->len; k++) {
158             if (lsum_1->nucs[k].num_indv > 0 &&
159                 lsum_2->nucs[k].num_indv > 0)
160                 bps.push_back(lsum_1->nucs[k].bp);
161         }
162     }
163 
164     sort(bps.begin(), bps.end());
165     vector<int>::iterator new_end = std::unique(bps.begin(), bps.end());
166 
167     sites.resize(std::distance(bps.begin(), new_end), NULL);
168 
169     //
170     // Create a key describing where in the sites array to find each basepair coordinate.
171     //
172     size_t i = 0;
173     map<uint, uint>::iterator key_it = sites_key.begin();
174 
175     for (vector<int>::iterator it = bps.begin(); it != bps.end(); it++) {
176         key_it = sites_key.insert(key_it, pair<uint, uint>(*it, i));
177         i++;
178     }
179 
180     return 0;
181 }
182 
183 template<class StatT>
184 int
init_haplotypes(vector<const StatT * > & sites,map<uint,uint> & sites_key,const vector<LocBin * > & sorted_loci)185 Ordered<StatT>::init_haplotypes(vector<const StatT *> &sites, map<uint, uint> &sites_key, const vector<LocBin *> &sorted_loci)
186 {
187     const CSLocus *loc;
188     int         bp;
189     vector<int> bps;
190 
191     for (uint pos = 0; pos < sorted_loci.size(); pos++) {
192         loc = sorted_loci[pos]->cloc;
193         bp  = loc->sort_bp();
194 
195         bps.push_back(bp);
196     }
197 
198     sort(bps.begin(), bps.end());
199     vector<int>::iterator new_end = std::unique(bps.begin(), bps.end());
200 
201     sites.resize(std::distance(bps.begin(), new_end), NULL);
202 
203     //
204     // Create a key describing where in the sites array to find each basepair coordinate.
205     //
206     int i = 0;
207     map<uint, uint>::iterator key_it = sites_key.begin();
208 
209     for (vector<int>::iterator it = bps.begin(); it != bps.end(); it++) {
210         key_it = sites_key.insert(key_it, pair<uint, uint>(*it, i));
211         i++;
212     }
213 
214     return 0;
215 }
216 
217 template<class StatT>
218 class OHaplotypes: public Ordered<StatT> {
219 public:
OHaplotypes()220     OHaplotypes(): Ordered<StatT>() { }
221 
222     int order(vector<const StatT *> &, const vector<LocBin *> &);
223     int order(vector<const StatT *> &, const vector<LocBin *> &, const vector<StatT *> &);
224 };
225 
226 template<class StatT>
227 int
order(vector<const StatT * > & sites,const vector<LocBin * > & sorted_loci)228 OHaplotypes<StatT>::order(vector<const StatT *> &sites, const vector<LocBin *> &sorted_loci)
229 {
230     map<uint, uint> sites_key;
231 
232     this->init_haplotypes(sites, sites_key, sorted_loci);
233 
234     return 0;
235 };
236 
237 template<class StatT>
238 int
order(vector<const StatT * > & sites,const vector<LocBin * > & sorted_loci,const vector<StatT * > & div)239 OHaplotypes<StatT>::order(vector<const StatT *> &sites, const vector<LocBin *> &sorted_loci, const vector<StatT *> &div)
240 {
241     StatT *pair;
242     map<uint, uint> sites_key;
243 
244     this->init_haplotypes(sites, sites_key, sorted_loci);
245 
246     for (uint i = 0; i < div.size(); i++) {
247         pair = div[i];
248 
249         if (pair == NULL)
250             continue;
251 
252         sites[sites_key[pair->bp]] = pair;
253     }
254 
255     return 0;
256 };
257 
258 template<class StatT>
259 class OPopPair: public Ordered<StatT> {
260 public:
OPopPair(ofstream & log_fh)261     OPopPair(ofstream &log_fh): Ordered<StatT>() {
262         this->log_fh = &log_fh;
263     }
264 
265     bool order(vector<const StatT *> &, const vector<LocBin *> &, const vector<StatT **> &);
266 };
267 
268 template<class StatT>
269 bool
order(vector<const StatT * > & sites,const vector<LocBin * > & sorted_loci,const vector<StatT ** > & div)270 OPopPair<StatT>::order(vector<const StatT *> &sites, const vector<LocBin *> &sorted_loci, const vector<StatT **> &div)
271 {
272     CSLocus *loc;
273     StatT  **pair;
274     uint     cloc_len;
275     bool     found = false;
276 
277     this->incompatible_sites = 0;
278     this->total_sites        = 0;
279     this->multiple_sites     = 0;
280 
281     uint pop_1=UINT_MAX, pop_2=UINT_MAX;
282     for (uint i = 0; i < div.size(); i++) {
283         cloc_len = sorted_loci[i]->cloc->len;
284         pair     = div[i];
285 
286         for (uint j = 0; j < cloc_len; j++) {
287             if (pair[j] != NULL) {
288                 pop_1 = pair[j]->pop_1;
289                 pop_2 = pair[j]->pop_2;
290                 found = true;
291                 break;
292             }
293         }
294         if (found == true) break;
295     }
296     if (found == false)
297         return found;
298 
299     map<uint, uint> sites_key;
300 
301     this->init_sites(sites, sites_key, sorted_loci, pop_1, pop_2);
302 
303     this->uniq_sites = sites_key.size();
304 
305     for (uint i = 0; i < div.size(); i++) {
306         loc      = sorted_loci[i]->cloc;
307         cloc_len = loc->len;
308         pair     = div[i];
309 
310         for (uint pos = 0; pos < cloc_len; pos++) {
311             this->total_sites++;
312 
313             if (pair[pos] == NULL)
314                 continue;
315 
316             //
317             // Check if this basepair position is already covered by a RAD site.
318             //
319             if (sites[sites_key[pair[pos]->bp]] != NULL) {
320                 this->multiple_sites++;
321             } else {
322                 sites[sites_key[pair[pos]->bp]] = pair[pos];
323             }
324         }
325     }
326 
327     return true;
328 };
329 
330 template<class StatT>
331 class OSumStat: public Ordered<StatT> {
332 public:
OSumStat(ofstream & log_fh)333     OSumStat(ofstream &log_fh): Ordered<StatT>() {
334         this->log_fh = &log_fh;
335     }
336 
337     int order(vector<const StatT *> &, const vector<LocBin *> &, uint);
338 };
339 
340 template<class StatT>
341 int
order(vector<const StatT * > & sites,const vector<LocBin * > & sorted_loci,uint pop_id)342 OSumStat<StatT>::order(vector<const StatT *> &sites, const vector<LocBin *> &sorted_loci, uint pop_id)
343 {
344     this->incompatible_sites = 0;
345     this->total_sites        = 0;
346     this->multiple_sites     = 0;
347 
348     map<uint, uint> sites_key;
349 
350     this->init_sites(sites, sites_key, sorted_loci, pop_id);
351 
352     this->uniq_sites = sites_key.size();
353 
354     const CSLocus *loc;
355     const LocSum  *lsum;
356 
357     //
358     // Assign nucleotides to their proper, ordered location in the genome,
359     // checking that a site hasn't already been covered by another RAD locus.
360     //
361     for (uint pos = 0; pos < sorted_loci.size(); pos++) {
362         loc  = sorted_loci[pos]->cloc;
363         lsum = sorted_loci[pos]->s->per_pop(pop_id);
364 
365         for (uint k = 0; k < loc->len; k++) {
366             if (lsum->nucs[k].num_indv == 0) continue;
367 
368             assert(sites_key.count(lsum->nucs[k].bp) > 0);
369             this->total_sites++;
370 
371             if (sites[sites_key[lsum->nucs[k].bp]] == NULL) {
372                 sites[sites_key[lsum->nucs[k].bp]] = &(lsum->nucs[k]);
373             } else {
374                 this->multiple_sites++;
375             }
376         }
377     }
378 
379     return 0;
380 };
381 
382 template<class StatT>
383 class OLocTally: public Ordered<StatT> {
384 public:
OLocTally(ofstream & log_fh)385     OLocTally(ofstream &log_fh): Ordered<StatT>() {
386         this->log_fh = &log_fh;
387     }
OLocTally()388     OLocTally(): Ordered<StatT>() {
389         this->log_fh = NULL;
390     }
391 
392     int order(vector<const StatT *> &, const vector<LocBin *> &);
393 };
394 
395 template<class StatT>
396 int
order(vector<const StatT * > & sites,const vector<LocBin * > & sorted_loci)397 OLocTally<StatT>::order(vector<const StatT *> &sites, const vector<LocBin *> &sorted_loci)
398 {
399     this->incompatible_sites = 0;
400     this->total_sites        = 0;
401     this->multiple_sites     = 0;
402 
403     map<uint, uint> sites_key;
404 
405     this->init_sites(sites, sites_key, sorted_loci);
406 
407     this->uniq_sites = sites_key.size();
408 
409     const CSLocus  *loc;
410     const LocTally *ltally;
411 
412     //
413     // Assign nucleotides to their proper, ordered location in the genome,
414     // checking that a site hasn't already been covered by another RAD locus.
415     //
416     for (uint pos = 0; pos < sorted_loci.size(); pos++) {
417         loc    = sorted_loci[pos]->cloc;
418         ltally = sorted_loci[pos]->s->meta_pop();
419 
420         for (uint k = 0; k < loc->len; k++) {
421             if (ltally->nucs[k].allele_cnt != 2) continue;
422 
423             assert(sites_key.count(ltally->nucs[k].bp) > 0);
424             this->total_sites++;
425 
426             if (sites[sites_key[ltally->nucs[k].bp]] == NULL) {
427                 sites[sites_key[ltally->nucs[k].bp]] = &(ltally->nucs[k]);
428             } else {
429                 this->multiple_sites++;
430             }
431         }
432     }
433 
434     return 0;
435 };
436 
437 #endif // __ORDERED_H__
438