1 /**
2  * @file methods/preprocess/preprocess_binarize_main.cpp
3  * @author Keon Kim
4  *
5  * A binding to binarize a dataset.
6  *
7  * mlpack is free software; you may redistribute it and/or modify it under the
8  * terms of the 3-clause BSD license.  You should have received a copy of the
9  * 3-clause BSD license along with mlpack.  If not, see
10  * http://www.opensource.org/licenses/BSD-3-Clause for more information.
11  */
12 #include <mlpack/prereqs.hpp>
13 #include <mlpack/core/util/io.hpp>
14 #include <mlpack/core/util/mlpack_main.hpp>
15 #include <mlpack/core/data/binarize.hpp>
16 
// Program Name.
// Human-readable name under which this binding is registered.
BINDING_NAME("Binarize Data");

// Short description.
// One-paragraph summary of what the binding does.
BINDING_SHORT_DESC(
    "A utility to binarize a dataset.  Given a dataset, this utility converts "
    "each value in the desired dimension(s) to 0 or 1; this can be a useful "
    "preprocessing step.");

// Long description.
// Adjacent string literals are concatenated by the compiler; the results of
// PRINT_PARAM_STRING() are spliced in with operator+, so every `+` below must
// have a PRINT_PARAM_STRING() call on at least one side.
BINDING_LONG_DESC(
    "This utility takes a dataset and binarizes the "
    "variables into either 0 or 1 given threshold. User can apply binarization "
    "on a dimension or the whole dataset.  The dimension to apply binarization "
    "to can be specified using the " + PRINT_PARAM_STRING("dimension") +
    " parameter; if left unspecified, every dimension will be binarized.  The "
    "threshold for binarization can also be specified with the " +
    PRINT_PARAM_STRING("threshold") + " parameter; the default threshold is "
    "0.0."
    "\n\n"
    "The binarized matrix may be saved with the " +
    PRINT_PARAM_STRING("output") + " output parameter.");

// Example.
// Two sample invocations: the first binarizes the whole dataset "X" with a
// threshold of 5.0, the second restricts the operation to dimension 0.
BINDING_EXAMPLE(
    "For example, if we want to set all variables greater than 5 in the "
    "dataset " + PRINT_DATASET("X") + " to 1 and variables less than or equal "
    "to 5.0 to 0, and save the result to " + PRINT_DATASET("Y") + ", we could "
    "run"
    "\n\n" +
    PRINT_CALL("preprocess_binarize", "input", "X", "threshold", 5.0, "output",
        "Y") +
    "\n\n"
    "But if we want to apply this to only the first (0th) dimension of " +
    PRINT_DATASET("X") + ",  we could instead run"
    "\n\n" +
    PRINT_CALL("preprocess_binarize", "input", "X", "threshold", 5.0,
        "dimension", 0, "output", "Y"));

// See also...
// Cross-references to the other preprocessing bindings.
// NOTE(review): the "@name" / "#name" argument forms are presumably a link
// title and a link target understood by the documentation generator --
// confirm against the BINDING_SEE_ALSO definition.
BINDING_SEE_ALSO("@preprocess_describe", "#preprocess_describe");
BINDING_SEE_ALSO("@preprocess_imputer", "#preprocess_imputer");
BINDING_SEE_ALSO("@preprocess_split", "#preprocess_split");

// Define parameters for data.
// "input" is the matrix that mlpackMain() reads and binarizes.
PARAM_MATRIX_IN_REQ("input", "Input data matrix.", "i");
// Define optional parameters.
// "output" receives the binarized matrix; mlpackMain() only writes it when the
// parameter was passed.
PARAM_MATRIX_OUT("output", "Matrix in which to save the output.", "o");
// "dimension" is read as an int by mlpackMain().  The trailing 0 is presumably
// the default value (TODO confirm); mlpackMain() decides between "one
// dimension" and "every dimension" with IO::HasParam(), not by this value.
PARAM_INT_IN("dimension", "Dimension to apply the binarization. If not set, the"
    " program will binarize every dimension by default.", "d", 0);
// "threshold" is read as a double by mlpackMain(); the description documents
// 0.0 as the value used when it is not set.
PARAM_DOUBLE_IN("threshold", "Threshold to be applied for binarization. If not "
    "set, the threshold defaults to 0.0.", "t", 0.0);

// mlpackMain() below refers to names such as IO, Log, Timer, data::Binarize,
// RequireParamValue and endl without full qualification; these directives are
// what bring them into scope.
using namespace mlpack;
using namespace mlpack::util;
using namespace arma;
using namespace std;
74 
mlpackMain()75 static void mlpackMain()
76 {
77   const size_t dimension = (size_t) IO::GetParam<int>("dimension");
78   const double threshold = IO::GetParam<double>("threshold");
79 
80   // Check on data parameters.
81   if (!IO::HasParam("dimension"))
82   {
83     Log::Warn << "You did not specify " << PRINT_PARAM_STRING("dimension")
84         << ", so the program will perform binarization on every dimension."
85         << endl;
86   }
87 
88   if (!IO::HasParam("threshold"))
89   {
90     Log::Warn << "You did not specify " << PRINT_PARAM_STRING("threshold")
91         << ", so the threshold will be automatically set to '0.0'." << endl;
92   }
93 
94   RequireAtLeastOnePassed({ "output" }, false, "no output will be saved");
95 
96   // Load the data.
97   arma::mat input = std::move(IO::GetParam<arma::mat>("input"));
98   arma::mat output;
99 
100   RequireParamValue<int>("dimension", [](int x) { return x >= 0; }, true,
101       "dimension to binarize must be nonnegative");
102   std::ostringstream error;
103   error << "dimension to binarize must be less than the number of dimensions "
104       << "of the input data (" << input.n_rows << ")";
105   RequireParamValue<int>("dimension",
106       [input](int x) { return size_t(x) < input.n_rows; }, true, error.str());
107 
108   Timer::Start("binarize");
109   if (IO::HasParam("dimension"))
110   {
111     data::Binarize<double>(input, output, threshold, dimension);
112   }
113   else
114   {
115     // Binarize the whole dataset.
116     data::Binarize<double>(input, output, threshold);
117   }
118   Timer::Stop("binarize");
119 
120   if (IO::HasParam("output"))
121     IO::GetParam<arma::mat>("output") = std::move(output);
122 }
123