1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2014-2017, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 #ifndef __ORDERED_H__
22 #define __ORDERED_H__
23
24 #include <iostream>
25 #include <fstream>
26 #include <vector>
27 #include <map>
28 #include <set>
29
30 #include "MetaPopInfo.h"
31 #include "PopSum.h"
32
33 extern bool verbose;
34 extern MetaPopInfo mpopi;
35
36 enum loc_type {haplotype, snp};
37
38 template<class StatT>
39 class Ordered {
40 public:
41 ofstream *log_fh;
42
43 size_t incompatible_sites; // Sites containing more than two alleles.
44 size_t total_sites; // Total number of sites in the data set.
45 size_t uniq_sites; // Total number of sites in the genome.
46 size_t multiple_sites; // Sites covered by more than one locus.
47
Ordered()48 Ordered(): incompatible_sites(0), total_sites(0), uniq_sites(0), multiple_sites(0) {}
~Ordered()49 virtual ~Ordered() {}
50
51 int init_sites(vector<const StatT *> &, map<uint, uint> &, const vector<LocBin *> &);
52 int init_sites(vector<const StatT *> &, map<uint, uint> &, const vector<LocBin *> &, uint);
53 int init_sites(vector<const StatT *> &, map<uint, uint> &, const vector<LocBin *> &, uint, uint);
54 int init_haplotypes(vector<const StatT *> &, map<uint, uint> &, const vector<LocBin *> &);
55 };
56
57 template<class StatT>
58 int
init_sites(vector<const StatT * > & sites,map<uint,uint> & sites_key,const vector<LocBin * > & sorted_loci)59 Ordered<StatT>::init_sites(vector<const StatT *> &sites, map<uint, uint> &sites_key, const vector<LocBin *> &sorted_loci)
60 {
61 const CSLocus *loc;
62 const LocTally *ltally;
63 vector<int> bps;
64
65 //
66 // We need to create an array to store all the SNPs for exporting. We must
67 // account for positions in the genome that are covered by more than one RAD tag.
68 //
69 for (uint pos = 0; pos < sorted_loci.size(); pos++) {
70 loc = sorted_loci[pos]->cloc;
71 ltally = sorted_loci[pos]->s->meta_pop();
72
73 for (uint k = 0; k < loc->len; k++) {
74 if (ltally->nucs[k].allele_cnt == 2)
75 bps.push_back(ltally->nucs[k].bp);
76 }
77 }
78
79 sort(bps.begin(), bps.end());
80 vector<int>::iterator new_end = std::unique(bps.begin(), bps.end());
81
82 sites.resize(std::distance(bps.begin(), new_end), NULL);
83
84 //
85 // Create a key describing where in the sites array to find each basepair coordinate.
86 //
87 int i = 0;
88 map<uint, uint>::iterator key_it = sites_key.begin();
89
90 for (vector<int>::iterator it = bps.begin(); it != bps.end(); it++) {
91 key_it = sites_key.insert(key_it, pair<uint, uint>(*it, i));
92 i++;
93 }
94
95 return 0;
96 }
97
98 template<class StatT>
99 int
init_sites(vector<const StatT * > & sites,map<uint,uint> & sites_key,const vector<LocBin * > & sorted_loci,uint pop_id)100 Ordered<StatT>::init_sites(vector<const StatT *> &sites, map<uint, uint> &sites_key, const vector<LocBin *> &sorted_loci, uint pop_id)
101 {
102 const CSLocus *loc;
103 const LocSum *lsum;
104 vector<int> bps;
105
106 //
107 // We need to create an array to store all the summary statistics for smoothing. We must
108 // account for positions in the genome that are covered by more than one RAD tag.
109 //
110 Timer T;
111 for (uint pos = 0; pos < sorted_loci.size(); pos++) {
112 loc = sorted_loci[pos]->cloc;
113 lsum = sorted_loci[pos]->s->per_pop(pop_id);
114
115 for (uint k = 0; k < loc->len; k++) {
116 if (lsum->nucs[k].num_indv > 0)
117 bps.push_back(lsum->nucs[k].bp);
118 }
119 }
120
121 sort(bps.begin(), bps.end());
122 vector<int>::iterator new_end = std::unique(bps.begin(), bps.end());
123
124 sites.resize(std::distance(bps.begin(), new_end), NULL);
125
126 //
127 // Create a key describing where in the sites array to find each basepair coordinate.
128 //
129 size_t i = 0;
130 map<uint, uint>::iterator key_it = sites_key.begin();
131
132 for (vector<int>::iterator it = bps.begin(); it != new_end; it++) {
133 key_it = sites_key.insert(key_it, pair<uint, uint>(*it, i));
134 i++;
135 }
136
137 return 0;
138 }
139
140 template<class StatT>
141 int
init_sites(vector<const StatT * > & sites,map<uint,uint> & sites_key,const vector<LocBin * > & sorted_loci,uint pop_id_1,uint pop_id_2)142 Ordered<StatT>::init_sites(vector<const StatT *> &sites, map<uint, uint> &sites_key, const vector<LocBin *> &sorted_loci, uint pop_id_1, uint pop_id_2)
143 {
144 const CSLocus *loc;
145 const LocSum *lsum_1, *lsum_2;
146 vector<int> bps;
147
148 //
149 // We need to create an array to store all the pair values for computing smoothed Fst. We must
150 // account for positions in the genome that are covered by more than one RAD tag.
151 //
152 for (uint pos = 0; pos < sorted_loci.size(); pos++) {
153 loc = sorted_loci[pos]->cloc;
154 lsum_1 = sorted_loci[pos]->s->per_pop(pop_id_1);
155 lsum_2 = sorted_loci[pos]->s->per_pop(pop_id_2);
156
157 for (uint k = 0; k < loc->len; k++) {
158 if (lsum_1->nucs[k].num_indv > 0 &&
159 lsum_2->nucs[k].num_indv > 0)
160 bps.push_back(lsum_1->nucs[k].bp);
161 }
162 }
163
164 sort(bps.begin(), bps.end());
165 vector<int>::iterator new_end = std::unique(bps.begin(), bps.end());
166
167 sites.resize(std::distance(bps.begin(), new_end), NULL);
168
169 //
170 // Create a key describing where in the sites array to find each basepair coordinate.
171 //
172 size_t i = 0;
173 map<uint, uint>::iterator key_it = sites_key.begin();
174
175 for (vector<int>::iterator it = bps.begin(); it != bps.end(); it++) {
176 key_it = sites_key.insert(key_it, pair<uint, uint>(*it, i));
177 i++;
178 }
179
180 return 0;
181 }
182
183 template<class StatT>
184 int
init_haplotypes(vector<const StatT * > & sites,map<uint,uint> & sites_key,const vector<LocBin * > & sorted_loci)185 Ordered<StatT>::init_haplotypes(vector<const StatT *> &sites, map<uint, uint> &sites_key, const vector<LocBin *> &sorted_loci)
186 {
187 const CSLocus *loc;
188 int bp;
189 vector<int> bps;
190
191 for (uint pos = 0; pos < sorted_loci.size(); pos++) {
192 loc = sorted_loci[pos]->cloc;
193 bp = loc->sort_bp();
194
195 bps.push_back(bp);
196 }
197
198 sort(bps.begin(), bps.end());
199 vector<int>::iterator new_end = std::unique(bps.begin(), bps.end());
200
201 sites.resize(std::distance(bps.begin(), new_end), NULL);
202
203 //
204 // Create a key describing where in the sites array to find each basepair coordinate.
205 //
206 int i = 0;
207 map<uint, uint>::iterator key_it = sites_key.begin();
208
209 for (vector<int>::iterator it = bps.begin(); it != bps.end(); it++) {
210 key_it = sites_key.insert(key_it, pair<uint, uint>(*it, i));
211 i++;
212 }
213
214 return 0;
215 }
216
217 template<class StatT>
218 class OHaplotypes: public Ordered<StatT> {
219 public:
OHaplotypes()220 OHaplotypes(): Ordered<StatT>() { }
221
222 int order(vector<const StatT *> &, const vector<LocBin *> &);
223 int order(vector<const StatT *> &, const vector<LocBin *> &, const vector<StatT *> &);
224 };
225
226 template<class StatT>
227 int
order(vector<const StatT * > & sites,const vector<LocBin * > & sorted_loci)228 OHaplotypes<StatT>::order(vector<const StatT *> &sites, const vector<LocBin *> &sorted_loci)
229 {
230 map<uint, uint> sites_key;
231
232 this->init_haplotypes(sites, sites_key, sorted_loci);
233
234 return 0;
235 };
236
237 template<class StatT>
238 int
order(vector<const StatT * > & sites,const vector<LocBin * > & sorted_loci,const vector<StatT * > & div)239 OHaplotypes<StatT>::order(vector<const StatT *> &sites, const vector<LocBin *> &sorted_loci, const vector<StatT *> &div)
240 {
241 StatT *pair;
242 map<uint, uint> sites_key;
243
244 this->init_haplotypes(sites, sites_key, sorted_loci);
245
246 for (uint i = 0; i < div.size(); i++) {
247 pair = div[i];
248
249 if (pair == NULL)
250 continue;
251
252 sites[sites_key[pair->bp]] = pair;
253 }
254
255 return 0;
256 };
257
258 template<class StatT>
259 class OPopPair: public Ordered<StatT> {
260 public:
OPopPair(ofstream & log_fh)261 OPopPair(ofstream &log_fh): Ordered<StatT>() {
262 this->log_fh = &log_fh;
263 }
264
265 bool order(vector<const StatT *> &, const vector<LocBin *> &, const vector<StatT **> &);
266 };
267
268 template<class StatT>
269 bool
order(vector<const StatT * > & sites,const vector<LocBin * > & sorted_loci,const vector<StatT ** > & div)270 OPopPair<StatT>::order(vector<const StatT *> &sites, const vector<LocBin *> &sorted_loci, const vector<StatT **> &div)
271 {
272 CSLocus *loc;
273 StatT **pair;
274 uint cloc_len;
275 bool found = false;
276
277 this->incompatible_sites = 0;
278 this->total_sites = 0;
279 this->multiple_sites = 0;
280
281 uint pop_1=UINT_MAX, pop_2=UINT_MAX;
282 for (uint i = 0; i < div.size(); i++) {
283 cloc_len = sorted_loci[i]->cloc->len;
284 pair = div[i];
285
286 for (uint j = 0; j < cloc_len; j++) {
287 if (pair[j] != NULL) {
288 pop_1 = pair[j]->pop_1;
289 pop_2 = pair[j]->pop_2;
290 found = true;
291 break;
292 }
293 }
294 if (found == true) break;
295 }
296 if (found == false)
297 return found;
298
299 map<uint, uint> sites_key;
300
301 this->init_sites(sites, sites_key, sorted_loci, pop_1, pop_2);
302
303 this->uniq_sites = sites_key.size();
304
305 for (uint i = 0; i < div.size(); i++) {
306 loc = sorted_loci[i]->cloc;
307 cloc_len = loc->len;
308 pair = div[i];
309
310 for (uint pos = 0; pos < cloc_len; pos++) {
311 this->total_sites++;
312
313 if (pair[pos] == NULL)
314 continue;
315
316 //
317 // Check if this basepair position is already covered by a RAD site.
318 //
319 if (sites[sites_key[pair[pos]->bp]] != NULL) {
320 this->multiple_sites++;
321 } else {
322 sites[sites_key[pair[pos]->bp]] = pair[pos];
323 }
324 }
325 }
326
327 return true;
328 };
329
330 template<class StatT>
331 class OSumStat: public Ordered<StatT> {
332 public:
OSumStat(ofstream & log_fh)333 OSumStat(ofstream &log_fh): Ordered<StatT>() {
334 this->log_fh = &log_fh;
335 }
336
337 int order(vector<const StatT *> &, const vector<LocBin *> &, uint);
338 };
339
340 template<class StatT>
341 int
order(vector<const StatT * > & sites,const vector<LocBin * > & sorted_loci,uint pop_id)342 OSumStat<StatT>::order(vector<const StatT *> &sites, const vector<LocBin *> &sorted_loci, uint pop_id)
343 {
344 this->incompatible_sites = 0;
345 this->total_sites = 0;
346 this->multiple_sites = 0;
347
348 map<uint, uint> sites_key;
349
350 this->init_sites(sites, sites_key, sorted_loci, pop_id);
351
352 this->uniq_sites = sites_key.size();
353
354 const CSLocus *loc;
355 const LocSum *lsum;
356
357 //
358 // Assign nucleotides to their proper, ordered location in the genome,
359 // checking that a site hasn't already been covered by another RAD locus.
360 //
361 for (uint pos = 0; pos < sorted_loci.size(); pos++) {
362 loc = sorted_loci[pos]->cloc;
363 lsum = sorted_loci[pos]->s->per_pop(pop_id);
364
365 for (uint k = 0; k < loc->len; k++) {
366 if (lsum->nucs[k].num_indv == 0) continue;
367
368 assert(sites_key.count(lsum->nucs[k].bp) > 0);
369 this->total_sites++;
370
371 if (sites[sites_key[lsum->nucs[k].bp]] == NULL) {
372 sites[sites_key[lsum->nucs[k].bp]] = &(lsum->nucs[k]);
373 } else {
374 this->multiple_sites++;
375 }
376 }
377 }
378
379 return 0;
380 };
381
382 template<class StatT>
383 class OLocTally: public Ordered<StatT> {
384 public:
OLocTally(ofstream & log_fh)385 OLocTally(ofstream &log_fh): Ordered<StatT>() {
386 this->log_fh = &log_fh;
387 }
OLocTally()388 OLocTally(): Ordered<StatT>() {
389 this->log_fh = NULL;
390 }
391
392 int order(vector<const StatT *> &, const vector<LocBin *> &);
393 };
394
395 template<class StatT>
396 int
order(vector<const StatT * > & sites,const vector<LocBin * > & sorted_loci)397 OLocTally<StatT>::order(vector<const StatT *> &sites, const vector<LocBin *> &sorted_loci)
398 {
399 this->incompatible_sites = 0;
400 this->total_sites = 0;
401 this->multiple_sites = 0;
402
403 map<uint, uint> sites_key;
404
405 this->init_sites(sites, sites_key, sorted_loci);
406
407 this->uniq_sites = sites_key.size();
408
409 const CSLocus *loc;
410 const LocTally *ltally;
411
412 //
413 // Assign nucleotides to their proper, ordered location in the genome,
414 // checking that a site hasn't already been covered by another RAD locus.
415 //
416 for (uint pos = 0; pos < sorted_loci.size(); pos++) {
417 loc = sorted_loci[pos]->cloc;
418 ltally = sorted_loci[pos]->s->meta_pop();
419
420 for (uint k = 0; k < loc->len; k++) {
421 if (ltally->nucs[k].allele_cnt != 2) continue;
422
423 assert(sites_key.count(ltally->nucs[k].bp) > 0);
424 this->total_sites++;
425
426 if (sites[sites_key[ltally->nucs[k].bp]] == NULL) {
427 sites[sites_key[ltally->nucs[k].bp]] = &(ltally->nucs[k]);
428 } else {
429 this->multiple_sites++;
430 }
431 }
432 }
433
434 return 0;
435 };
436
437 #endif // __ORDERED_H__
438