1 /**
2  * @file methods/pca/pca_main.cpp
3  * @author Ryan Curtin
4  * @author Marcus Edel
5  *
6  * Main executable to run PCA.
7  *
8  * mlpack is free software; you may redistribute it and/or modify it under the
9  * terms of the 3-clause BSD license.  You should have received a copy of the
10  * 3-clause BSD license along with mlpack.  If not, see
11  * http://www.opensource.org/licenses/BSD-3-Clause for more information.
12  */
13 #include <mlpack/prereqs.hpp>
14 #include <mlpack/core/util/io.hpp>
15 #include <mlpack/core/util/mlpack_main.hpp>
16 
17 #include "pca.hpp"
18 #include <mlpack/methods/pca/decomposition_policies/exact_svd_method.hpp>
19 #include <mlpack/methods/pca/decomposition_policies/quic_svd_method.hpp>
20 #include <mlpack/methods/pca/decomposition_policies/randomized_svd_method.hpp>
21 #include <mlpack/methods/pca/decomposition_policies/randomized_block_krylov_method.hpp>
22 
23 using namespace mlpack;
24 using namespace mlpack::pca;
25 using namespace mlpack::util;
26 using namespace std;
27 
28 // Program Name.
29 BINDING_NAME("Principal Components Analysis");
30 
31 // Short description.
32 BINDING_SHORT_DESC(
33     "An implementation of several strategies for principal components analysis "
34     "(PCA), a common preprocessing step.  Given a dataset and a desired new "
35     "dimensionality, this can reduce the dimensionality of the data using the "
36     "linear transformation determined by PCA.");
37 
38 // Long description.
39 BINDING_LONG_DESC(
40     "This program performs principal components analysis on the given dataset "
41     "using the exact, randomized, randomized block Krylov, or QUIC SVD method. "
42     "It will transform the data onto its principal components, optionally "
43     "performing dimensionality reduction by ignoring the principal components "
44     "with the smallest eigenvalues."
45     "\n\n"
46     "Use the " + PRINT_PARAM_STRING("input") + " parameter to specify the "
47     "dataset to perform PCA on.  A desired new dimensionality can be specified "
48     "with the " + PRINT_PARAM_STRING("new_dimensionality") + " parameter, or "
49     "the desired variance to retain can be specified with the " +
50     PRINT_PARAM_STRING("var_to_retain") + " parameter.  If desired, the "
51     "dataset can be scaled before running PCA with the " +
52     PRINT_PARAM_STRING("scale") + " parameter."
53     "\n\n"
54     "Multiple different decomposition techniques can be used.  The method to "
55     "use can be specified with the " +
56     PRINT_PARAM_STRING("decomposition_method") + " parameter, and it may take "
57     "the values 'exact', 'randomized', or 'quic'.");
58 
59 // Example.
60 BINDING_EXAMPLE(
61     "For example, to reduce the dimensionality of the matrix " +
62     PRINT_DATASET("data") + " to 5 dimensions using randomized SVD for the "
63     "decomposition, storing the output matrix to " +
64     PRINT_DATASET("data_mod") + ", the following command can be used:"
65     "\n\n" +
66     PRINT_CALL("pca", "input", "data", "new_dimensionality", 5,
67         "decomposition_method", "randomized", "output", "data_mod"));
68 
69 // See also...
70 BINDING_SEE_ALSO("Principal component analysis on Wikipedia",
71         "https://en.wikipedia.org/wiki/Principal_component_analysis");
72 BINDING_SEE_ALSO("mlpack::pca::PCA C++ class documentation",
73         "@doxygen/classmlpack_1_1pca_1_1PCA.html");
74 
75 // Parameters for program.
76 PARAM_MATRIX_IN_REQ("input", "Input dataset to perform PCA on.", "i");
77 PARAM_MATRIX_OUT("output", "Matrix to save modified dataset to.", "o");
78 PARAM_INT_IN("new_dimensionality", "Desired dimensionality of output dataset. "
79     "If 0, no dimensionality reduction is performed.", "d", 0);
80 PARAM_DOUBLE_IN("var_to_retain", "Amount of variance to retain; should be "
81     "between 0 and 1.  If 1, all variance is retained.  Overrides -d.", "r", 0);
82 
83 PARAM_FLAG("scale", "If set, the data will be scaled before running PCA, such "
84     "that the variance of each feature is 1.", "s");
85 
86 PARAM_STRING_IN("decomposition_method", "Method used for the principal "
87     "components analysis: 'exact', 'randomized', 'randomized-block-krylov', "
88     "'quic'.", "c", "exact");
89 
90 
91 //! Run RunPCA on the specified dataset with the given decomposition method.
92 template<typename DecompositionPolicy>
RunPCA(arma::mat & dataset,const size_t newDimension,const bool scale,const double varToRetain)93 void RunPCA(arma::mat& dataset,
94             const size_t newDimension,
95             const bool scale,
96             const double varToRetain)
97 {
98   PCA<DecompositionPolicy> p(scale);
99 
100   Log::Info << "Performing PCA on dataset..." << endl;
101   double varRetained;
102 
103   if (IO::HasParam("var_to_retain"))
104   {
105     if (IO::HasParam("new_dimensionality"))
106       Log::Warn << "New dimensionality (-d) ignored because --var_to_retain "
107           << "(-r) was specified." << endl;
108 
109     varRetained = p.Apply(dataset, varToRetain);
110   }
111   else
112   {
113     varRetained = p.Apply(dataset, newDimension);
114   }
115 
116   Log::Info << (varRetained * 100) << "% of variance retained (" <<
117       dataset.n_rows << " dimensions)." << endl;
118 }
119 
mlpackMain()120 static void mlpackMain()
121 {
122   // Load input dataset.
123   arma::mat& dataset = IO::GetParam<arma::mat>("input");
124 
125   // Issue a warning if the user did not specify an output file.
126   RequireAtLeastOnePassed({ "output" }, false, "no output will be saved");
127 
128   // Check decomposition method validity.
129   RequireParamInSet<string>("decomposition_method", { "exact", "randomized",
130       "randomized-block-krylov", "quic" }, true,
131       "unknown decomposition method");
132 
133   // Find out what dimension we want.
134   RequireParamValue<int>("new_dimensionality", [](int x) { return x >= 0; },
135       true, "new dimensionality must be non-negative");
136   std::ostringstream error;
137   error << "cannot be greater than existing dimensionality (" << dataset.n_rows
138       << ")";
139   RequireParamValue<int>("new_dimensionality",
140       [dataset](int x) { return x <= (int) dataset.n_rows; }, true,
141       error.str());
142 
143   RequireParamValue<double>("var_to_retain",
144       [](double x) { return x >= 0.0 && x <= 1.0; }, true,
145       "variance retained must be between 0 and 1");
146   size_t newDimension = (IO::GetParam<int>("new_dimensionality") == 0) ?
147       dataset.n_rows : IO::GetParam<int>("new_dimensionality");
148 
149   // Get the options for running PCA.
150   const bool scale = IO::HasParam("scale");
151   const double varToRetain = IO::GetParam<double>("var_to_retain");
152   const string decompositionMethod = IO::GetParam<string>(
153       "decomposition_method");
154 
155   // Perform PCA.
156   if (decompositionMethod == "exact")
157   {
158     RunPCA<ExactSVDPolicy>(dataset, newDimension, scale, varToRetain);
159   }
160   else if (decompositionMethod == "randomized")
161   {
162     RunPCA<RandomizedSVDPolicy>(dataset, newDimension, scale, varToRetain);
163   }
164   else if (decompositionMethod == "randomized-block-krylov")
165   {
166     RunPCA<RandomizedBlockKrylovSVDPolicy>(dataset, newDimension, scale,
167         varToRetain);
168   }
169   else if (decompositionMethod == "quic")
170   {
171     RunPCA<QUICSVDPolicy>(dataset, newDimension, scale, varToRetain);
172   }
173 
174   // Now save the results.
175   if (IO::HasParam("output"))
176     IO::GetParam<arma::mat>("output") = std::move(dataset);
177 }
178