1 /**
2  * @file methods/kde/kde.hpp
3  * @author Roberto Hueso
4  *
5  * Kernel Density Estimation.
6  *
7  * mlpack is free software; you may redistribute it and/or modify it under the
8  * terms of the 3-clause BSD license.  You should have received a copy of the
9  * 3-clause BSD license along with mlpack.  If not, see
10  * http://www.opensource.org/licenses/BSD-3-Clause for more information.
11  */
12 
13 #ifndef MLPACK_METHODS_KDE_KDE_HPP
14 #define MLPACK_METHODS_KDE_KDE_HPP
15 
16 #include <mlpack/prereqs.hpp>
17 #include <mlpack/core/tree/binary_space_tree.hpp>
18 
19 #include "kde_stat.hpp"
20 
21 namespace mlpack {
22 namespace kde /** Kernel Density Estimation. */ {
23 
24 //! KDEMode represents the ways in which KDE algorithm can be executed.
25 enum KDEMode
26 {
27   DUAL_TREE_MODE,
28   SINGLE_TREE_MODE
29 };
30 
31 //! KDEDefaultParams contains the default input parameter values for KDE.
32 struct KDEDefaultParams
33 {
34   //! Relative error tolerance.
35   static constexpr double relError = 0.05;
36 
37   //! Absolute error tolerance.
38   static constexpr double absError = 0;
39 
40   //! KDE algorithm mode.
41   static constexpr KDEMode mode = KDEMode::DUAL_TREE_MODE;
42 
43   //! Whether to use Monte Carlo estimations when possible.
44   static constexpr bool monteCarlo = false;
45 
46   //! Probability of a Monte Carlo estimation to be bounded by the relative
47   //! error tolerance.
48   static constexpr double mcProb = 0.95;
49 
50   //! Initial sample size for Monte Carlo estimations.
51   static constexpr size_t initialSampleSize = 100;
52 
53   //! Monte Carlo entry coefficient.
54   static constexpr double mcEntryCoef = 3;
55 
56   //! Monte Carlo break coefficient.
57   static constexpr double mcBreakCoef = 0.4;
58 };
59 
60 /**
61  * The KDE class is a template class for performing Kernel Density Estimations.
62  * In statistics, kernel density estimation is a way to estimate the
63  * probability density function of a variable in a non parametric way.
64  * This implementation performs this estimation using a tree-independent
65  * dual-tree algorithm. Details about this algorithm are available in KDERules.
66  *
67  * @tparam KernelType Kernel function to use for KDE calculations.
68  * @tparam MetricType Metric to use for KDE calculations.
69  * @tparam MatType Type of data to use.
70  * @tparam TreeType Type of tree to use; must satisfy the TreeType policy API.
71  * @tparam DualTreeTraversalType Type of dual-tree traversal to use.
72  * @tparam SingleTreeTraversalType Type of single-tree traversal to use.
73  */
74 template<typename KernelType = kernel::GaussianKernel,
75          typename MetricType = mlpack::metric::EuclideanDistance,
76          typename MatType = arma::mat,
77          template<typename TreeMetricType,
78                   typename TreeStatType,
79                   typename TreeMatType> class TreeType = tree::KDTree,
80          template<typename RuleType> class DualTreeTraversalType =
81              TreeType<MetricType,
82                       kde::KDEStat,
83                       MatType>::template DualTreeTraverser,
84          template<typename RuleType> class SingleTreeTraversalType =
85              TreeType<MetricType,
86                       kde::KDEStat,
87                       MatType>::template SingleTreeTraverser>
88 class KDE
89 {
90  public:
91   //! Convenience typedef.
92   typedef TreeType<MetricType, kde::KDEStat, MatType> Tree;
93 
94   /**
95    * Initialize KDE object using custom instantiated Metric and Kernel objects.
96    *
97    * @param relError Relative error tolerance of the model.
98    * @param absError Absolute error tolerance of the model.
99    * @param kernel Instantiated kernel object.
100    * @param mode Mode for the algorithm.
101    * @param metric Instantiated metric object.
102    * @param monteCarlo Whether to use Monte Carlo estimations when possible.
103    * @param mcProb Probability of a Monte Carlo estimation to be bounded by
104    *               relative error tolerance.
105    * @param initialSampleSize Initial sample size for Monte Carlo estimations.
106    * @param mcEntryCoef Coefficient to control how much larger does the amount
107    *                    of node descendants has to be compared to the initial
108    *                    sample size in order for it to be a candidate for Monte
109    *                    Carlo estimations.
110    * @param mcBreakCoef Coefficient to control what fraction of the node's
111    *                    descendants evaluated is the limit before Monte Carlo
112    *                    estimation recurses.
113    */
114   KDE(const double relError = KDEDefaultParams::relError,
115       const double absError = KDEDefaultParams::absError,
116       KernelType kernel = KernelType(),
117       const KDEMode mode = KDEDefaultParams::mode,
118       MetricType metric = MetricType(),
119       const bool monteCarlo = KDEDefaultParams::monteCarlo,
120       const double mcProb = KDEDefaultParams::mcProb,
121       const size_t initialSampleSize = KDEDefaultParams::initialSampleSize,
122       const double mcEntryCoef = KDEDefaultParams::mcEntryCoef,
123       const double mcBreakCoef = KDEDefaultParams::mcBreakCoef);
124 
125   /**
126    * Construct KDE object as a copy of the given model. This may be
127    * computationally intensive!
128    *
129    * @param other KDE object to copy.
130    */
131   KDE(const KDE& other);
132 
133   /**
134    * Construct KDE object taking ownership of the given model.
135    *
136    * @param other KDE object to take ownership of.
137    */
138   KDE(KDE&& other);
139 
140   /**
141    * Copy a KDE model.
142    *
143    * Use std::move if the object to copy is no longer needed.
144    *
145    * @param other KDE model to copy.
146    */
147   KDE& operator=(KDE other);
148 
149   /**
150    * Destroy the KDE object. If this object created any trees, they will be
151    * deleted. If you created the trees then you have to delete them yourself.
152    */
153   ~KDE();
154 
155   /**
156    * Trains the KDE model. It builds a tree using a reference set.
157    *
158    * Use std::move if the reference set is no longer needed.
159    *
160    * @param referenceSet Set of reference data.
161    */
162   void Train(MatType referenceSet);
163 
164   /**
165    * Trains the KDE model. Sets the reference tree to an already created tree.
166    *
167    * - If TreeTraits<TreeType>::RearrangesDataset is false then it is possible
168    *   to use an empty oldFromNewReferences vector.
169    *
170    * @param referenceTree Built reference tree.
171    * @param oldFromNewReferences Permutations of reference points obtained
172    *                             during tree generation.
173    */
174   void Train(Tree* referenceTree, std::vector<size_t>* oldFromNewReferences);
175 
176   /**
177    * Estimate density of each point in the query set given the data of the
178    * reference set. The result is stored in an estimations vector.
179    * Estimations might not be normalized.
180    *
181    * - Dimension of each point in the query set must match the dimension of each
182    *   point in the reference set.
183    *
184    * - Use std::move if the query set is no longer needed.
185    *
186    * @pre The model has to be previously trained.
187    * @param querySet Set of query points to get the density of.
188    * @param estimations Object which will hold the density of each query point.
189    */
190   void Evaluate(MatType querySet, arma::vec& estimations);
191 
192   /**
193    * Estimate density of each point in the query set given the data of an
194    * already created query tree. The result is stored in an estimations vector.
195    * Estimations might not be normalized.
196    *
197    * - Dimension of each point in the queryTree dataset must match the dimension
198    *    of each point in the reference set.
199    *
200    * - Use std::move if the query tree is no longer needed.
201    *
202    * @pre The model has to be previously trained and mode has to be dual-tree.
203    * @param queryTree Tree of query points to get the density of.
204    * @param oldFromNewQueries Mappings of query points to the tree dataset.
205    * @param estimations Object which will hold the density of each query point.
206    */
207   void Evaluate(Tree* queryTree,
208                 const std::vector<size_t>& oldFromNewQueries,
209                 arma::vec& estimations);
210 
211   /**
212    * Estimate density of each point in the reference set given the data of the
213    * reference set. It does not compute the estimation of a point with itself.
214    * The result is stored in an estimations vector. Estimations might not be
215    * normalized.
216    *
217    * @pre The model has to be previously trained.
218    * @param estimations Object which will hold the density of each reference
219    *                    point.
220    */
221   void Evaluate(arma::vec& estimations);
222 
223   //! Get the kernel.
Kernel() const224   const KernelType& Kernel() const { return kernel; }
225 
226   //! Modify the kernel.
Kernel()227   KernelType& Kernel() { return kernel; }
228 
229   //! Get the metric.
Metric() const230   const MetricType& Metric() const { return metric; }
231 
232   //! Modify the metric.
Metric()233   MetricType& Metric() { return metric; }
234 
235   //! Get the reference tree.
ReferenceTree()236   Tree* ReferenceTree() { return referenceTree; }
237 
238   //! Get relative error tolerance.
RelativeError() const239   double RelativeError() const { return relError; }
240 
241   //! Modify relative error tolerance (0 <= newError <= 1).
242   void RelativeError(const double newError);
243 
244   //! Get absolute error tolerance.
AbsoluteError() const245   double AbsoluteError() const { return absError; }
246 
247   //! Modify absolute error tolerance (0 <= newError).
248   void AbsoluteError(const double newError);
249 
250   //! Check whether reference tree is owned by the KDE model.
OwnsReferenceTree() const251   bool OwnsReferenceTree() const { return ownsReferenceTree; }
252 
253   //! Check whether KDE model is trained or not.
IsTrained() const254   bool IsTrained() const { return trained; }
255 
256   //! Get the mode of KDE.
Mode() const257   KDEMode Mode() const { return mode; }
258 
259   //! Modify the mode of KDE.
Mode()260   KDEMode& Mode() { return mode; }
261 
262   //! Get whether Monte Carlo estimations are being used or not.
MonteCarlo() const263   bool MonteCarlo() const { return monteCarlo; }
264 
265   //! Modify whether Monte Carlo estimations are being used or not.
MonteCarlo()266   bool& MonteCarlo() { return monteCarlo; }
267 
268   //! Get Monte Carlo probability of error being bounded by relative error.
MCProb() const269   double MCProb() const { return mcProb; }
270 
271   //! Modify Monte Carlo probability of error being bounded by relative error.
272   //! (0 <= newProb < 1).
273   void MCProb(const double newProb);
274 
275   //! Get Monte Carlo initial sample size.
MCInitialSampleSize() const276   size_t MCInitialSampleSize() const { return initialSampleSize; }
277 
278   //! Modify Monte Carlo initial sample size.
MCInitialSampleSize()279   size_t& MCInitialSampleSize() { return initialSampleSize; }
280 
281   //! Get Monte Carlo entry coefficient.
MCEntryCoef() const282   double MCEntryCoef() const { return mcEntryCoef; }
283 
284   //! Modify Monte Carlo entry coefficient. (newCoef >= 1).
285   void MCEntryCoef(const double newCoef);
286 
287   //! Get Monte Carlo break coefficient.
MCBreakCoef() const288   double MCBreakCoef() const { return mcBreakCoef; }
289 
290   //! Modify Monte Carlo break coefficient. (0 < newCoef <= 1).
291   void MCBreakCoef(const double newCoef);
292 
293   //! Serialize the model.
294   template<typename Archive>
295   void serialize(Archive& ar, const unsigned int version);
296 
297  private:
298   //! Kernel.
299   KernelType kernel;
300 
301   //! Metric.
302   MetricType metric;
303 
304   //! Reference tree.
305   Tree* referenceTree;
306 
307   //! Permutations of reference points.
308   std::vector<size_t>* oldFromNewReferences;
309 
310   //! Relative error tolerance.
311   double relError;
312 
313   //! Absolute error tolerance.
314   double absError;
315 
316   //! If true, the KDE object is responsible for deleting the reference tree.
317   bool ownsReferenceTree;
318 
319   //! If true, the KDE object is trained.
320   bool trained;
321 
322   //! Mode of the KDE algorithm.
323   KDEMode mode;
324 
325   //! If true Monte Carlo approximations will be used when possible.
326   bool monteCarlo;
327 
328   //! Probability of error being bounded by relError when Monte Carlo is used.
329   double mcProb;
330 
331   //! Size of the initial sample for Monte Carlo approximations.
332   size_t initialSampleSize;
333 
334   //! Coefficient to control how much larger does the amount of node descendants
335   //! has to be compared to the initial sample size in order to be a candidate
336   //! for Monte Carlo estimations. If the evaluation results in a non-natural
337   //! number, then the floor function will be applied.
338   double mcEntryCoef;
339 
340   //! Coefficient to control what fraction of the amount of node's descendants
341   //! is the limit before Monte Carlo estimation recurses.
342   double mcBreakCoef;
343 
344   //! Check whether absolute and relative error values are compatible.
345   static void CheckErrorValues(const double relError, const double absError);
346 
347   //! Rearrange estimations vector if required.
348   static void RearrangeEstimations(const std::vector<size_t>& oldFromNew,
349                                    arma::vec& estimations);
350 };
351 
352 } // namespace kde
353 } // namespace mlpack
354 
355 //! Set the serialization version of the KDE class.
356 /* TODO FIX
357 Cannot use BOOST_TEMPLATE_CLASS_VERSION because of the problem stated in
358 https://stackoverflow.com/questions/8942912/how-to-pass-multi-argument-templates
359 -to-macros
360 */
361 
362 namespace boost {
363 namespace serialization{
364 
365 template<typename KernelType,
366          typename MetricType,
367          typename MatType,
368          template<typename TreeMetricType,
369                   typename TreeStatType,
370                   typename TreeMatType> class TreeType,
371          template<typename RuleType> class DualTreeTraversalType,
372          template<typename RuleType> class SingleTreeTraversalType>
373 struct version<mlpack::kde::KDE<KernelType,
374                                 MetricType,
375                                 MatType,
376                                 TreeType,
377                                 DualTreeTraversalType,
378                                 SingleTreeTraversalType>>
379 {
380   typedef mpl::int_<1> type;
381   typedef mpl::integral_c_tag tag;
382   BOOST_STATIC_CONSTANT(int, value = version::type::value);
383   BOOST_MPL_ASSERT((boost::mpl::less<boost::mpl::int_<1>,
384                     boost::mpl::int_<256>>));
385 };
386 
387 } // namespace serialization.
388 } // namespace boost.
389 
390 // Include implementation.
391 #include "kde_impl.hpp"
392 
393 #endif // MLPACK_METHODS_KDE_KDE_HPP
394