1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2014-2018, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 #ifndef __SMOOTHING_H__
22 #define __SMOOTHING_H__
23 
24 #include <cmath>
25 #include "smoothing_utils.h"
26 
27 template<class StatT=PopStat>
28 class KSmooth {
29     uint    size;    // Number of elements expected in the StatT class to smooth.
30     double *weights; // Weight matrix to apply while smoothing.
31 
32 public:
KSmooth(int size)33     KSmooth(int size)  {
34         this->size    = size;
35         this->weights = calc_weights();
36 
37     }
~KSmooth()38     ~KSmooth() {
39         delete [] this->weights;
40     }
41 
42     int smooth(vector<const StatT *> &popstats);
43 };
44 
45 template<class StatT>
46 int
smooth(vector<const StatT * > & popstats)47 KSmooth<StatT>::smooth(vector<const StatT *> &popstats)
48 {
49     //
50     // To generate smooth genome-wide distributions of Fst, we calculate a kernel-smoothing
51     // moving average of Fst values along each ordered chromosome.
52     //
53     // For each genomic region centered on a nucleotide position c, the contribution of the population
54     // genetic statistic at position p to the region average was weighted by the Gaussian function:
55     //   exp( (-1 * (p - c)^2) / (2 * sigma^2))
56     //
57     // In addition, we weight each position according to (n_k - 1), where n_k is the number of alleles
58     // sampled at that location.
59     //
60     // By default, sigma = 150Kb, for computational efficiency, only calculate average out to 3sigma.
61     //
62     #pragma omp parallel
63     {
64         int      limit = 3 * sigma;
65         int      dist;
66         uint     pos_l, pos_u;
67         double   *sum, final_weight;
68         const PopStat *c, *p;
69 
70         sum   = new double[this->size];
71         pos_l = 0;
72         pos_u = 0;
73 
74         #pragma omp for schedule(dynamic, 1)
75         for (uint pos_c = 0; pos_c < popstats.size(); pos_c++) {
76             c = popstats[pos_c];
77 
78             if (c == NULL || c->fixed == true)
79                 continue;
80 
81             for (uint i = 0; i < this->size; i++) {
82                 c->smoothed[i] = 0.0;
83                 sum[i]         = 0.0;
84             }
85 
86             determine_window_limits(popstats, c->bp, pos_l, pos_u);
87 
88             for (uint pos_p = pos_l; pos_p < pos_u; pos_p++) {
89                 p = popstats[pos_p];
90 
91                 if (p == NULL)
92                     continue;
93 
94                 dist = p->bp > c->bp ? p->bp - c->bp : c->bp - p->bp;
95 
96                 if (dist > limit || dist < 0) {
97                     #pragma omp critical
98                     {
99                         cerr << "ERROR: current basepair is out of the sliding window.\n"
100                              << "  Calculating sliding window; start position: " << pos_l << ", " << (popstats[pos_l] == NULL ? -1 : popstats[pos_l]->bp +1) << "bp; end position: "
101                              << pos_u << ", " << (popstats[pos_u] == NULL ? -1 : popstats[pos_u]->bp +1) << "bp; center: "
102                              << pos_c << ", " << popstats[pos_c]->bp +1 << "bp\n"
103                              << "  Current position: " << pos_p << ", " << popstats[pos_p]->bp +1 << "; Dist: " << dist << "\n"
104                              << "  Window positions:\n";
105 
106                         for (uint j = pos_l; j < pos_u; j++) {
107                             p = popstats[j];
108                             if (p == NULL) continue;
109                             cerr << "    Position: " << j << "; " << p->bp + 1 << "bp\n";
110                         }
111                     }
112                     continue;
113                 }
114 
115                 // sites_cnt++;
116 
117                 final_weight = (p->alleles - 1) * this->weights[dist];
118                 for (uint i = 0; i < this->size; i++) {
119                     if (p->stat[i] > -7.0) {
120                         c->smoothed[i] += p->stat[i] * final_weight;
121                         sum[i]         += final_weight;
122                     }
123                 }
124                 // if (c->loc_id == 9314) {
125                 //     cerr << "    id: " << p->loc_id
126                 //          << "; dist: " << dist
127                 //          << "; weight: " << weights[dist]
128                 //          << "; final_weight: " << final_weight
129                 //          << "; fst': " << p->stat[3]
130                 //          << "; sum: " << sum
131                 //          << "; smoothed: " << c->smoothed[3] << "\n";
132                 // }
133             }
134 
135             // sites_per_snp += (sites_cnt / snp_cnt);
136             // tot_windows++;
137             //
138             // if (snp_cnt < max_snp_dist) {
139             //     #pragma omp atomic
140             //     snp_dist[snp_cnt]++;
141             // }
142             // c->snp_cnt = snp_cnt;
143 
144             for (uint i = 0; i < this->size; i++)
145                 c->smoothed[i] /= sum[i];
146         }
147 
148         delete [] sum;
149     }
150 
151     return 0;
152 }
153 
154 #endif // __SMOOTHING_H__
155