1 /** 2 * @file methods/kde/kde.hpp 3 * @author Roberto Hueso 4 * 5 * Kernel Density Estimation. 6 * 7 * mlpack is free software; you may redistribute it and/or modify it under the 8 * terms of the 3-clause BSD license. You should have received a copy of the 9 * 3-clause BSD license along with mlpack. If not, see 10 * http://www.opensource.org/licenses/BSD-3-Clause for more information. 11 */ 12 13 #ifndef MLPACK_METHODS_KDE_KDE_HPP 14 #define MLPACK_METHODS_KDE_KDE_HPP 15 16 #include <mlpack/prereqs.hpp> 17 #include <mlpack/core/tree/binary_space_tree.hpp> 18 19 #include "kde_stat.hpp" 20 21 namespace mlpack { 22 namespace kde /** Kernel Density Estimation. */ { 23 24 //! KDEMode represents the ways in which KDE algorithm can be executed. 25 enum KDEMode 26 { 27 DUAL_TREE_MODE, 28 SINGLE_TREE_MODE 29 }; 30 31 //! KDEDefaultParams contains the default input parameter values for KDE. 32 struct KDEDefaultParams 33 { 34 //! Relative error tolerance. 35 static constexpr double relError = 0.05; 36 37 //! Absolute error tolerance. 38 static constexpr double absError = 0; 39 40 //! KDE algorithm mode. 41 static constexpr KDEMode mode = KDEMode::DUAL_TREE_MODE; 42 43 //! Whether to use Monte Carlo estimations when possible. 44 static constexpr bool monteCarlo = false; 45 46 //! Probability of a Monte Carlo estimation to be bounded by the relative 47 //! error tolerance. 48 static constexpr double mcProb = 0.95; 49 50 //! Initial sample size for Monte Carlo estimations. 51 static constexpr size_t initialSampleSize = 100; 52 53 //! Monte Carlo entry coefficient. 54 static constexpr double mcEntryCoef = 3; 55 56 //! Monte Carlo break coefficient. 57 static constexpr double mcBreakCoef = 0.4; 58 }; 59 60 /** 61 * The KDE class is a template class for performing Kernel Density Estimations. 62 * In statistics, kernel density estimation is a way to estimate the 63 * probability density function of a variable in a non parametric way. 64 * This implementation performs this estimation using a tree-independent 65 * dual-tree algorithm. Details about this algorithm are available in KDERules. 66 * 67 * @tparam KernelType Kernel function to use for KDE calculations. 68 * @tparam MetricType Metric to use for KDE calculations. 69 * @tparam MatType Type of data to use. 70 * @tparam TreeType Type of tree to use; must satisfy the TreeType policy API. 71 * @tparam DualTreeTraversalType Type of dual-tree traversal to use. 72 * @tparam SingleTreeTraversalType Type of single-tree traversal to use. 73 */ 74 template<typename KernelType = kernel::GaussianKernel, 75 typename MetricType = mlpack::metric::EuclideanDistance, 76 typename MatType = arma::mat, 77 template<typename TreeMetricType, 78 typename TreeStatType, 79 typename TreeMatType> class TreeType = tree::KDTree, 80 template<typename RuleType> class DualTreeTraversalType = 81 TreeType<MetricType, 82 kde::KDEStat, 83 MatType>::template DualTreeTraverser, 84 template<typename RuleType> class SingleTreeTraversalType = 85 TreeType<MetricType, 86 kde::KDEStat, 87 MatType>::template SingleTreeTraverser> 88 class KDE 89 { 90 public: 91 //! Convenience typedef. 92 typedef TreeType<MetricType, kde::KDEStat, MatType> Tree; 93 94 /** 95 * Initialize KDE object using custom instantiated Metric and Kernel objects. 96 * 97 * @param relError Relative error tolerance of the model. 98 * @param absError Absolute error tolerance of the model. 99 * @param kernel Instantiated kernel object. 100 * @param mode Mode for the algorithm. 101 * @param metric Instantiated metric object. 102 * @param monteCarlo Whether to use Monte Carlo estimations when possible. 103 * @param mcProb Probability of a Monte Carlo estimation to be bounded by 104 * relative error tolerance. 105 * @param initialSampleSize Initial sample size for Monte Carlo estimations. 106 * @param mcEntryCoef Coefficient to control how much larger does the amount 107 * of node descendants has to be compared to the initial 108 * sample size in order for it to be a candidate for Monte 109 * Carlo estimations. 110 * @param mcBreakCoef Coefficient to control what fraction of the node's 111 * descendants evaluated is the limit before Monte Carlo 112 * estimation recurses. 113 */ 114 KDE(const double relError = KDEDefaultParams::relError, 115 const double absError = KDEDefaultParams::absError, 116 KernelType kernel = KernelType(), 117 const KDEMode mode = KDEDefaultParams::mode, 118 MetricType metric = MetricType(), 119 const bool monteCarlo = KDEDefaultParams::monteCarlo, 120 const double mcProb = KDEDefaultParams::mcProb, 121 const size_t initialSampleSize = KDEDefaultParams::initialSampleSize, 122 const double mcEntryCoef = KDEDefaultParams::mcEntryCoef, 123 const double mcBreakCoef = KDEDefaultParams::mcBreakCoef); 124 125 /** 126 * Construct KDE object as a copy of the given model. This may be 127 * computationally intensive! 128 * 129 * @param other KDE object to copy. 130 */ 131 KDE(const KDE& other); 132 133 /** 134 * Construct KDE object taking ownership of the given model. 135 * 136 * @param other KDE object to take ownership of. 137 */ 138 KDE(KDE&& other); 139 140 /** 141 * Copy a KDE model. 142 * 143 * Use std::move if the object to copy is no longer needed. 144 * 145 * @param other KDE model to copy. 146 */ 147 KDE& operator=(KDE other); 148 149 /** 150 * Destroy the KDE object. If this object created any trees, they will be 151 * deleted. If you created the trees then you have to delete them yourself. 152 */ 153 ~KDE(); 154 155 /** 156 * Trains the KDE model. It builds a tree using a reference set. 157 * 158 * Use std::move if the reference set is no longer needed. 159 * 160 * @param referenceSet Set of reference data. 161 */ 162 void Train(MatType referenceSet); 163 164 /** 165 * Trains the KDE model. Sets the reference tree to an already created tree. 166 * 167 * - If TreeTraits<TreeType>::RearrangesDataset is false then it is possible 168 * to use an empty oldFromNewReferences vector. 169 * 170 * @param referenceTree Built reference tree. 171 * @param oldFromNewReferences Permutations of reference points obtained 172 * during tree generation. 173 */ 174 void Train(Tree* referenceTree, std::vector<size_t>* oldFromNewReferences); 175 176 /** 177 * Estimate density of each point in the query set given the data of the 178 * reference set. The result is stored in an estimations vector. 179 * Estimations might not be normalized. 180 * 181 * - Dimension of each point in the query set must match the dimension of each 182 * point in the reference set. 183 * 184 * - Use std::move if the query set is no longer needed. 185 * 186 * @pre The model has to be previously trained. 187 * @param querySet Set of query points to get the density of. 188 * @param estimations Object which will hold the density of each query point. 189 */ 190 void Evaluate(MatType querySet, arma::vec& estimations); 191 192 /** 193 * Estimate density of each point in the query set given the data of an 194 * already created query tree. The result is stored in an estimations vector. 195 * Estimations might not be normalized. 196 * 197 * - Dimension of each point in the queryTree dataset must match the dimension 198 * of each point in the reference set. 199 * 200 * - Use std::move if the query tree is no longer needed. 201 * 202 * @pre The model has to be previously trained and mode has to be dual-tree. 203 * @param queryTree Tree of query points to get the density of. 204 * @param oldFromNewQueries Mappings of query points to the tree dataset. 205 * @param estimations Object which will hold the density of each query point. 206 */ 207 void Evaluate(Tree* queryTree, 208 const std::vector<size_t>& oldFromNewQueries, 209 arma::vec& estimations); 210 211 /** 212 * Estimate density of each point in the reference set given the data of the 213 * reference set. It does not compute the estimation of a point with itself. 214 * The result is stored in an estimations vector. Estimations might not be 215 * normalized. 216 * 217 * @pre The model has to be previously trained. 218 * @param estimations Object which will hold the density of each reference 219 * point. 220 */ 221 void Evaluate(arma::vec& estimations); 222 223 //! Get the kernel. Kernel() const224 const KernelType& Kernel() const { return kernel; } 225 226 //! Modify the kernel. Kernel()227 KernelType& Kernel() { return kernel; } 228 229 //! Get the metric. Metric() const230 const MetricType& Metric() const { return metric; } 231 232 //! Modify the metric. Metric()233 MetricType& Metric() { return metric; } 234 235 //! Get the reference tree. ReferenceTree()236 Tree* ReferenceTree() { return referenceTree; } 237 238 //! Get relative error tolerance. RelativeError() const239 double RelativeError() const { return relError; } 240 241 //! Modify relative error tolerance (0 <= newError <= 1). 242 void RelativeError(const double newError); 243 244 //! Get absolute error tolerance. AbsoluteError() const245 double AbsoluteError() const { return absError; } 246 247 //! Modify absolute error tolerance (0 <= newError). 248 void AbsoluteError(const double newError); 249 250 //! Check whether reference tree is owned by the KDE model. OwnsReferenceTree() const251 bool OwnsReferenceTree() const { return ownsReferenceTree; } 252 253 //! Check whether KDE model is trained or not. IsTrained() const254 bool IsTrained() const { return trained; } 255 256 //! Get the mode of KDE. Mode() const257 KDEMode Mode() const { return mode; } 258 259 //! Modify the mode of KDE. Mode()260 KDEMode& Mode() { return mode; } 261 262 //! Get whether Monte Carlo estimations are being used or not. MonteCarlo() const263 bool MonteCarlo() const { return monteCarlo; } 264 265 //! Modify whether Monte Carlo estimations are being used or not. MonteCarlo()266 bool& MonteCarlo() { return monteCarlo; } 267 268 //! Get Monte Carlo probability of error being bounded by relative error. MCProb() const269 double MCProb() const { return mcProb; } 270 271 //! Modify Monte Carlo probability of error being bounded by relative error. 272 //! (0 <= newProb < 1). 273 void MCProb(const double newProb); 274 275 //! Get Monte Carlo initial sample size. MCInitialSampleSize() const276 size_t MCInitialSampleSize() const { return initialSampleSize; } 277 278 //! Modify Monte Carlo initial sample size. MCInitialSampleSize()279 size_t& MCInitialSampleSize() { return initialSampleSize; } 280 281 //! Get Monte Carlo entry coefficient. MCEntryCoef() const282 double MCEntryCoef() const { return mcEntryCoef; } 283 284 //! Modify Monte Carlo entry coefficient. (newCoef >= 1). 285 void MCEntryCoef(const double newCoef); 286 287 //! Get Monte Carlo break coefficient. MCBreakCoef() const288 double MCBreakCoef() const { return mcBreakCoef; } 289 290 //! Modify Monte Carlo break coefficient. (0 < newCoef <= 1). 291 void MCBreakCoef(const double newCoef); 292 293 //! Serialize the model. 294 template<typename Archive> 295 void serialize(Archive& ar, const unsigned int version); 296 297 private: 298 //! Kernel. 299 KernelType kernel; 300 301 //! Metric. 302 MetricType metric; 303 304 //! Reference tree. 305 Tree* referenceTree; 306 307 //! Permutations of reference points. 308 std::vector<size_t>* oldFromNewReferences; 309 310 //! Relative error tolerance. 311 double relError; 312 313 //! Absolute error tolerance. 314 double absError; 315 316 //! If true, the KDE object is responsible for deleting the reference tree. 317 bool ownsReferenceTree; 318 319 //! If true, the KDE object is trained. 320 bool trained; 321 322 //! Mode of the KDE algorithm. 323 KDEMode mode; 324 325 //! If true Monte Carlo approximations will be used when possible. 326 bool monteCarlo; 327 328 //! Probability of error being bounded by relError when Monte Carlo is used. 329 double mcProb; 330 331 //! Size of the initial sample for Monte Carlo approximations. 332 size_t initialSampleSize; 333 334 //! Coefficient to control how much larger does the amount of node descendants 335 //! has to be compared to the initial sample size in order to be a candidate 336 //! for Monte Carlo estimations. If the evaluation results in a non-natural 337 //! number, then the floor function will be applied. 338 double mcEntryCoef; 339 340 //! Coefficient to control what fraction of the amount of node's descendants 341 //! is the limit before Monte Carlo estimation recurses. 342 double mcBreakCoef; 343 344 //! Check whether absolute and relative error values are compatible. 345 static void CheckErrorValues(const double relError, const double absError); 346 347 //! Rearrange estimations vector if required. 348 static void RearrangeEstimations(const std::vector<size_t>& oldFromNew, 349 arma::vec& estimations); 350 }; 351 352 } // namespace kde 353 } // namespace mlpack 354 355 //! Set the serialization version of the KDE class. 356 /* TODO FIX 357 Cannot use BOOST_TEMPLATE_CLASS_VERSION because of the problem stated in 358 https://stackoverflow.com/questions/8942912/how-to-pass-multi-argument-templates 359 -to-macros 360 */ 361 362 namespace boost { 363 namespace serialization{ 364 365 template<typename KernelType, 366 typename MetricType, 367 typename MatType, 368 template<typename TreeMetricType, 369 typename TreeStatType, 370 typename TreeMatType> class TreeType, 371 template<typename RuleType> class DualTreeTraversalType, 372 template<typename RuleType> class SingleTreeTraversalType> 373 struct version<mlpack::kde::KDE<KernelType, 374 MetricType, 375 MatType, 376 TreeType, 377 DualTreeTraversalType, 378 SingleTreeTraversalType>> 379 { 380 typedef mpl::int_<1> type; 381 typedef mpl::integral_c_tag tag; 382 BOOST_STATIC_CONSTANT(int, value = version::type::value); 383 BOOST_MPL_ASSERT((boost::mpl::less<boost::mpl::int_<1>, 384 boost::mpl::int_<256>>)); 385 }; 386 387 } // namespace serialization. 388 } // namespace boost. 389 390 // Include implementation. 391 #include "kde_impl.hpp" 392 393 #endif // MLPACK_METHODS_KDE_KDE_HPP 394