1 ///////////////////////////////////////////////////////////////////////////////
2 // weighted_p_square_cumul_dist.hpp
3 //
4 //  Copyright 2006 Daniel Egloff, Olivier Gygi. Distributed under the Boost
5 //  Software License, Version 1.0. (See accompanying file
6 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
7 
8 #ifndef BOOST_ACCUMULATORS_STATISTICS_WEIGHTED_P_SQUARE_CUMUL_DIST_HPP_DE_01_01_2006
9 #define BOOST_ACCUMULATORS_STATISTICS_WEIGHTED_P_SQUARE_CUMUL_DIST_HPP_DE_01_01_2006
10 
11 #include <vector>
12 #include <functional>
13 #include <boost/parameter/keyword.hpp>
14 #include <boost/mpl/placeholders.hpp>
15 #include <boost/range.hpp>
16 #include <boost/accumulators/framework/accumulator_base.hpp>
17 #include <boost/accumulators/framework/extractor.hpp>
18 #include <boost/accumulators/numeric/functional.hpp>
19 #include <boost/accumulators/framework/parameters/sample.hpp>
20 #include <boost/accumulators/statistics_fwd.hpp>
21 #include <boost/accumulators/statistics/count.hpp>
22 #include <boost/accumulators/statistics/sum.hpp>
23 #include <boost/accumulators/statistics/p_square_cumul_dist.hpp> // for named parameter p_square_cumulative_distribution_num_cells
24 
25 namespace boost { namespace accumulators
26 {
27 
28 namespace impl
29 {
30     ///////////////////////////////////////////////////////////////////////////////
31     // weighted_p_square_cumulative_distribution_impl
32     //  cumulative distribution calculation (as histogram)
33     /**
34         @brief Histogram calculation of the cumulative distribution with the \f$P^2\f$ algorithm for weighted samples
35 
36         A histogram of the sample cumulative distribution is computed dynamically without storing samples
37         based on the \f$ P^2 \f$ algorithm for weighted samples. The returned histogram has a specifiable
38         amount (num_cells) equiprobable (and not equal-sized) cells.
39 
40         Note that applying importance sampling results in regions to be more and other regions to be less
41         accurately estimated than without importance sampling, i.e., with unweighted samples.
42 
43         For further details, see
44 
45         R. Jain and I. Chlamtac, The P^2 algorithm for dynamic calculation of quantiles and
46         histograms without storing observations, Communications of the ACM,
47         Volume 28 (October), Number 10, 1985, p. 1076-1085.
48 
49         @param p_square_cumulative_distribution_num_cells
50     */
51     template<typename Sample, typename Weight>
52     struct weighted_p_square_cumulative_distribution_impl
53       : accumulator_base
54     {
55         typedef typename numeric::functional::multiplies<Sample, Weight>::result_type weighted_sample;
56         typedef typename numeric::functional::fdiv<weighted_sample, std::size_t>::result_type float_type;
57         typedef std::vector<std::pair<float_type, float_type> > histogram_type;
58         typedef std::vector<float_type> array_type;
59         // for boost::result_of
60         typedef iterator_range<typename histogram_type::iterator> result_type;
61 
62         template<typename Args>
weighted_p_square_cumulative_distribution_implboost::accumulators::impl::weighted_p_square_cumulative_distribution_impl63         weighted_p_square_cumulative_distribution_impl(Args const &args)
64           : num_cells(args[p_square_cumulative_distribution_num_cells])
65           , heights(num_cells + 1)
66           , actual_positions(num_cells + 1)
67           , desired_positions(num_cells + 1)
68           , histogram(num_cells + 1)
69           , is_dirty(true)
70         {
71         }
72 
73         template<typename Args>
operator ()boost::accumulators::impl::weighted_p_square_cumulative_distribution_impl74         void operator ()(Args const &args)
75         {
76             this->is_dirty = true;
77 
78             std::size_t cnt = count(args);
79             std::size_t sample_cell = 1; // k
80             std::size_t b = this->num_cells;
81 
82             // accumulate num_cells + 1 first samples
83             if (cnt <= b + 1)
84             {
85                 this->heights[cnt - 1] = args[sample];
86                 this->actual_positions[cnt - 1] = args[weight];
87 
88                 // complete the initialization of heights by sorting
89                 if (cnt == b + 1)
90                 {
91                     //std::sort(this->heights.begin(), this->heights.end());
92 
93                     // TODO: we need to sort the initial samples (in heights) in ascending order and
94                     // sort their weights (in actual_positions) the same way. The following lines do
95                     // it, but there must be a better and more efficient way of doing this.
96                     typename array_type::iterator it_begin, it_end, it_min;
97 
98                     it_begin = this->heights.begin();
99                     it_end   = this->heights.end();
100 
101                     std::size_t pos = 0;
102 
103                     while (it_begin != it_end)
104                     {
105                         it_min = std::min_element(it_begin, it_end);
106                         std::size_t d = std::distance(it_begin, it_min);
107                         std::swap(*it_begin, *it_min);
108                         std::swap(this->actual_positions[pos], this->actual_positions[pos + d]);
109                         ++it_begin;
110                         ++pos;
111                     }
112 
113                     // calculate correct initial actual positions
114                     for (std::size_t i = 1; i < b; ++i)
115                     {
116                         this->actual_positions[i] += this->actual_positions[i - 1];
117                     }
118                 }
119             }
120             else
121             {
122                 // find cell k such that heights[k-1] <= args[sample] < heights[k] and adjust extreme values
123                 if (args[sample] < this->heights[0])
124                 {
125                     this->heights[0] = args[sample];
126                     this->actual_positions[0] = args[weight];
127                     sample_cell = 1;
128                 }
129                 else if (this->heights[b] <= args[sample])
130                 {
131                     this->heights[b] = args[sample];
132                     sample_cell = b;
133                 }
134                 else
135                 {
136                     typename array_type::iterator it;
137                     it = std::upper_bound(
138                         this->heights.begin()
139                       , this->heights.end()
140                       , args[sample]
141                     );
142 
143                     sample_cell = std::distance(this->heights.begin(), it);
144                 }
145 
146                 // increment positions of markers above sample_cell
147                 for (std::size_t i = sample_cell; i < b + 1; ++i)
148                 {
149                     this->actual_positions[i] += args[weight];
150                 }
151 
152                 // determine desired marker positions
153                 for (std::size_t i = 1; i < b + 1; ++i)
154                 {
155                     this->desired_positions[i] = this->actual_positions[0]
156                                                + numeric::fdiv((i-1) * (sum_of_weights(args) - this->actual_positions[0]), b);
157                 }
158 
159                 // adjust heights of markers 2 to num_cells if necessary
160                 for (std::size_t i = 1; i < b; ++i)
161                 {
162                     // offset to desire position
163                     float_type d = this->desired_positions[i] - this->actual_positions[i];
164 
165                     // offset to next position
166                     float_type dp = this->actual_positions[i + 1] - this->actual_positions[i];
167 
168                     // offset to previous position
169                     float_type dm = this->actual_positions[i - 1] - this->actual_positions[i];
170 
171                     // height ds
172                     float_type hp = (this->heights[i + 1] - this->heights[i]) / dp;
173                     float_type hm = (this->heights[i - 1] - this->heights[i]) / dm;
174 
175                     if ( ( d >= 1. && dp > 1. ) || ( d <= -1. && dm < -1. ) )
176                     {
177                         short sign_d = static_cast<short>(d / std::abs(d));
178 
179                         // try adjusting heights[i] using p-squared formula
180                         float_type h = this->heights[i] + sign_d / (dp - dm) * ( (sign_d - dm) * hp + (dp - sign_d) * hm );
181 
182                         if ( this->heights[i - 1] < h && h < this->heights[i + 1] )
183                         {
184                             this->heights[i] = h;
185                         }
186                         else
187                         {
188                             // use linear formula
189                             if (d>0)
190                             {
191                                 this->heights[i] += hp;
192                             }
193                             if (d<0)
194                             {
195                                 this->heights[i] -= hm;
196                             }
197                         }
198                         this->actual_positions[i] += sign_d;
199                     }
200                 }
201             }
202         }
203 
204         template<typename Args>
resultboost::accumulators::impl::weighted_p_square_cumulative_distribution_impl205         result_type result(Args const &args) const
206         {
207             if (this->is_dirty)
208             {
209                 this->is_dirty = false;
210 
211                 // creates a vector of std::pair where each pair i holds
212                 // the values heights[i] (x-axis of histogram) and
213                 // actual_positions[i] / sum_of_weights (y-axis of histogram)
214 
215                 for (std::size_t i = 0; i < this->histogram.size(); ++i)
216                 {
217                     this->histogram[i] = std::make_pair(this->heights[i], numeric::fdiv(this->actual_positions[i], sum_of_weights(args)));
218                 }
219             }
220 
221             return make_iterator_range(this->histogram);
222         }
223 
224         // make this accumulator serializeable
225         // TODO split to save/load and check on parameters provided in ctor
226         template<class Archive>
serializeboost::accumulators::impl::weighted_p_square_cumulative_distribution_impl227         void serialize(Archive & ar, const unsigned int file_version)
228         {
229             ar & num_cells;
230             ar & heights;
231             ar & actual_positions;
232             ar & desired_positions;
233             ar & histogram;
234             ar & is_dirty;
235         }
236 
237     private:
238         std::size_t num_cells;            // number of cells b
239         array_type  heights;              // q_i
240         array_type  actual_positions;     // n_i
241         array_type  desired_positions;    // n'_i
242         mutable histogram_type histogram; // histogram
243         mutable bool is_dirty;
244     };
245 
246 } // namespace detail
247 
248 ///////////////////////////////////////////////////////////////////////////////
249 // tag::weighted_p_square_cumulative_distribution
250 //
251 namespace tag
252 {
253     struct weighted_p_square_cumulative_distribution
254       : depends_on<count, sum_of_weights>
255       , p_square_cumulative_distribution_num_cells
256     {
257         typedef accumulators::impl::weighted_p_square_cumulative_distribution_impl<mpl::_1, mpl::_2> impl;
258     };
259 }
260 
261 ///////////////////////////////////////////////////////////////////////////////
262 // extract::weighted_p_square_cumulative_distribution
263 //
264 namespace extract
265 {
266     extractor<tag::weighted_p_square_cumulative_distribution> const weighted_p_square_cumulative_distribution = {};
267 
268     BOOST_ACCUMULATORS_IGNORE_GLOBAL(weighted_p_square_cumulative_distribution)
269 }
270 
271 using extract::weighted_p_square_cumulative_distribution;
272 
273 }} // namespace boost::accumulators
274 
275 #endif
276