1 /////////////////////////////////////////////////////////////////////////////// 2 // weighted_p_square_cumul_dist.hpp 3 // 4 // Copyright 2006 Daniel Egloff, Olivier Gygi. Distributed under the Boost 5 // Software License, Version 1.0. (See accompanying file 6 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 7 8 #ifndef BOOST_ACCUMULATORS_STATISTICS_WEIGHTED_P_SQUARE_CUMUL_DIST_HPP_DE_01_01_2006 9 #define BOOST_ACCUMULATORS_STATISTICS_WEIGHTED_P_SQUARE_CUMUL_DIST_HPP_DE_01_01_2006 10 11 #include <vector> 12 #include <functional> 13 #include <boost/parameter/keyword.hpp> 14 #include <boost/mpl/placeholders.hpp> 15 #include <boost/range.hpp> 16 #include <boost/accumulators/framework/accumulator_base.hpp> 17 #include <boost/accumulators/framework/extractor.hpp> 18 #include <boost/accumulators/numeric/functional.hpp> 19 #include <boost/accumulators/framework/parameters/sample.hpp> 20 #include <boost/accumulators/statistics_fwd.hpp> 21 #include <boost/accumulators/statistics/count.hpp> 22 #include <boost/accumulators/statistics/sum.hpp> 23 #include <boost/accumulators/statistics/p_square_cumul_dist.hpp> // for named parameter p_square_cumulative_distribution_num_cells 24 25 namespace boost { namespace accumulators 26 { 27 28 namespace impl 29 { 30 /////////////////////////////////////////////////////////////////////////////// 31 // weighted_p_square_cumulative_distribution_impl 32 // cumulative distribution calculation (as histogram) 33 /** 34 @brief Histogram calculation of the cumulative distribution with the \f$P^2\f$ algorithm for weighted samples 35 36 A histogram of the sample cumulative distribution is computed dynamically without storing samples 37 based on the \f$ P^2 \f$ algorithm for weighted samples. The returned histogram has a specifiable 38 amount (num_cells) equiprobable (and not equal-sized) cells. 39 40 Note that applying importance sampling results in regions to be more and other regions to be less 41 accurately estimated than without importance sampling, i.e., with unweighted samples. 42 43 For further details, see 44 45 R. Jain and I. Chlamtac, The P^2 algorithm for dynamic calculation of quantiles and 46 histograms without storing observations, Communications of the ACM, 47 Volume 28 (October), Number 10, 1985, p. 1076-1085. 48 49 @param p_square_cumulative_distribution_num_cells 50 */ 51 template<typename Sample, typename Weight> 52 struct weighted_p_square_cumulative_distribution_impl 53 : accumulator_base 54 { 55 typedef typename numeric::functional::multiplies<Sample, Weight>::result_type weighted_sample; 56 typedef typename numeric::functional::fdiv<weighted_sample, std::size_t>::result_type float_type; 57 typedef std::vector<std::pair<float_type, float_type> > histogram_type; 58 typedef std::vector<float_type> array_type; 59 // for boost::result_of 60 typedef iterator_range<typename histogram_type::iterator> result_type; 61 62 template<typename Args> weighted_p_square_cumulative_distribution_implboost::accumulators::impl::weighted_p_square_cumulative_distribution_impl63 weighted_p_square_cumulative_distribution_impl(Args const &args) 64 : num_cells(args[p_square_cumulative_distribution_num_cells]) 65 , heights(num_cells + 1) 66 , actual_positions(num_cells + 1) 67 , desired_positions(num_cells + 1) 68 , histogram(num_cells + 1) 69 , is_dirty(true) 70 { 71 } 72 73 template<typename Args> operator ()boost::accumulators::impl::weighted_p_square_cumulative_distribution_impl74 void operator ()(Args const &args) 75 { 76 this->is_dirty = true; 77 78 std::size_t cnt = count(args); 79 std::size_t sample_cell = 1; // k 80 std::size_t b = this->num_cells; 81 82 // accumulate num_cells + 1 first samples 83 if (cnt <= b + 1) 84 { 85 this->heights[cnt - 1] = args[sample]; 86 this->actual_positions[cnt - 1] = args[weight]; 87 88 // complete the initialization of heights by sorting 89 if (cnt == b + 1) 90 { 91 //std::sort(this->heights.begin(), this->heights.end()); 92 93 // TODO: we need to sort the initial samples (in heights) in ascending order and 94 // sort their weights (in actual_positions) the same way. The following lines do 95 // it, but there must be a better and more efficient way of doing this. 96 typename array_type::iterator it_begin, it_end, it_min; 97 98 it_begin = this->heights.begin(); 99 it_end = this->heights.end(); 100 101 std::size_t pos = 0; 102 103 while (it_begin != it_end) 104 { 105 it_min = std::min_element(it_begin, it_end); 106 std::size_t d = std::distance(it_begin, it_min); 107 std::swap(*it_begin, *it_min); 108 std::swap(this->actual_positions[pos], this->actual_positions[pos + d]); 109 ++it_begin; 110 ++pos; 111 } 112 113 // calculate correct initial actual positions 114 for (std::size_t i = 1; i < b; ++i) 115 { 116 this->actual_positions[i] += this->actual_positions[i - 1]; 117 } 118 } 119 } 120 else 121 { 122 // find cell k such that heights[k-1] <= args[sample] < heights[k] and adjust extreme values 123 if (args[sample] < this->heights[0]) 124 { 125 this->heights[0] = args[sample]; 126 this->actual_positions[0] = args[weight]; 127 sample_cell = 1; 128 } 129 else if (this->heights[b] <= args[sample]) 130 { 131 this->heights[b] = args[sample]; 132 sample_cell = b; 133 } 134 else 135 { 136 typename array_type::iterator it; 137 it = std::upper_bound( 138 this->heights.begin() 139 , this->heights.end() 140 , args[sample] 141 ); 142 143 sample_cell = std::distance(this->heights.begin(), it); 144 } 145 146 // increment positions of markers above sample_cell 147 for (std::size_t i = sample_cell; i < b + 1; ++i) 148 { 149 this->actual_positions[i] += args[weight]; 150 } 151 152 // determine desired marker positions 153 for (std::size_t i = 1; i < b + 1; ++i) 154 { 155 this->desired_positions[i] = this->actual_positions[0] 156 + numeric::fdiv((i-1) * (sum_of_weights(args) - this->actual_positions[0]), b); 157 } 158 159 // adjust heights of markers 2 to num_cells if necessary 160 for (std::size_t i = 1; i < b; ++i) 161 { 162 // offset to desire position 163 float_type d = this->desired_positions[i] - this->actual_positions[i]; 164 165 // offset to next position 166 float_type dp = this->actual_positions[i + 1] - this->actual_positions[i]; 167 168 // offset to previous position 169 float_type dm = this->actual_positions[i - 1] - this->actual_positions[i]; 170 171 // height ds 172 float_type hp = (this->heights[i + 1] - this->heights[i]) / dp; 173 float_type hm = (this->heights[i - 1] - this->heights[i]) / dm; 174 175 if ( ( d >= 1. && dp > 1. ) || ( d <= -1. && dm < -1. ) ) 176 { 177 short sign_d = static_cast<short>(d / std::abs(d)); 178 179 // try adjusting heights[i] using p-squared formula 180 float_type h = this->heights[i] + sign_d / (dp - dm) * ( (sign_d - dm) * hp + (dp - sign_d) * hm ); 181 182 if ( this->heights[i - 1] < h && h < this->heights[i + 1] ) 183 { 184 this->heights[i] = h; 185 } 186 else 187 { 188 // use linear formula 189 if (d>0) 190 { 191 this->heights[i] += hp; 192 } 193 if (d<0) 194 { 195 this->heights[i] -= hm; 196 } 197 } 198 this->actual_positions[i] += sign_d; 199 } 200 } 201 } 202 } 203 204 template<typename Args> resultboost::accumulators::impl::weighted_p_square_cumulative_distribution_impl205 result_type result(Args const &args) const 206 { 207 if (this->is_dirty) 208 { 209 this->is_dirty = false; 210 211 // creates a vector of std::pair where each pair i holds 212 // the values heights[i] (x-axis of histogram) and 213 // actual_positions[i] / sum_of_weights (y-axis of histogram) 214 215 for (std::size_t i = 0; i < this->histogram.size(); ++i) 216 { 217 this->histogram[i] = std::make_pair(this->heights[i], numeric::fdiv(this->actual_positions[i], sum_of_weights(args))); 218 } 219 } 220 221 return make_iterator_range(this->histogram); 222 } 223 224 // make this accumulator serializeable 225 // TODO split to save/load and check on parameters provided in ctor 226 template<class Archive> serializeboost::accumulators::impl::weighted_p_square_cumulative_distribution_impl227 void serialize(Archive & ar, const unsigned int file_version) 228 { 229 ar & num_cells; 230 ar & heights; 231 ar & actual_positions; 232 ar & desired_positions; 233 ar & histogram; 234 ar & is_dirty; 235 } 236 237 private: 238 std::size_t num_cells; // number of cells b 239 array_type heights; // q_i 240 array_type actual_positions; // n_i 241 array_type desired_positions; // n'_i 242 mutable histogram_type histogram; // histogram 243 mutable bool is_dirty; 244 }; 245 246 } // namespace detail 247 248 /////////////////////////////////////////////////////////////////////////////// 249 // tag::weighted_p_square_cumulative_distribution 250 // 251 namespace tag 252 { 253 struct weighted_p_square_cumulative_distribution 254 : depends_on<count, sum_of_weights> 255 , p_square_cumulative_distribution_num_cells 256 { 257 typedef accumulators::impl::weighted_p_square_cumulative_distribution_impl<mpl::_1, mpl::_2> impl; 258 }; 259 } 260 261 /////////////////////////////////////////////////////////////////////////////// 262 // extract::weighted_p_square_cumulative_distribution 263 // 264 namespace extract 265 { 266 extractor<tag::weighted_p_square_cumulative_distribution> const weighted_p_square_cumulative_distribution = {}; 267 268 BOOST_ACCUMULATORS_IGNORE_GLOBAL(weighted_p_square_cumulative_distribution) 269 } 270 271 using extract::weighted_p_square_cumulative_distribution; 272 273 }} // namespace boost::accumulators 274 275 #endif 276