1 /**
2 * @file methods/preprocess/preprocess_binarize_main.cpp
3 * @author Keon Kim
4 *
5 * A binding to binarize a dataset.
6 *
7 * mlpack is free software; you may redistribute it and/or modify it under the
8 * terms of the 3-clause BSD license. You should have received a copy of the
9 * 3-clause BSD license along with mlpack. If not, see
10 * http://www.opensource.org/licenses/BSD-3-Clause for more information.
11 */
12 #include <mlpack/prereqs.hpp>
13 #include <mlpack/core/util/io.hpp>
14 #include <mlpack/core/util/mlpack_main.hpp>
15 #include <mlpack/core/data/binarize.hpp>
16
17 // Program Name.
18 BINDING_NAME("Binarize Data");
19
20 // Short description.
21 BINDING_SHORT_DESC(
22 "A utility to binarize a dataset. Given a dataset, this utility converts "
23 "each value in the desired dimension(s) to 0 or 1; this can be a useful "
24 "preprocessing step.");
25
26 // Long description.
27 BINDING_LONG_DESC(
28 "This utility takes a dataset and binarizes the "
29 "variables into either 0 or 1 given threshold. User can apply binarization "
30 "on a dimension or the whole dataset. The dimension to apply binarization "
31 "to can be specified using the " + PRINT_PARAM_STRING("dimension") +
32 " parameter; if left unspecified, every dimension will be binarized. The "
33 "threshold for binarization can also be specified with the " +
34 PRINT_PARAM_STRING("threshold") + " parameter; the default threshold is "
35 "0.0."
36 "\n\n"
37 "The binarized matrix may be saved with the " +
38 PRINT_PARAM_STRING("output") + " output parameter.");
39
40 // Example.
41 BINDING_EXAMPLE(
42 "For example, if we want to set all variables greater than 5 in the "
43 "dataset " + PRINT_DATASET("X") + " to 1 and variables less than or equal "
44 "to 5.0 to 0, and save the result to " + PRINT_DATASET("Y") + ", we could "
45 "run"
46 "\n\n" +
47 PRINT_CALL("preprocess_binarize", "input", "X", "threshold", 5.0, "output",
48 "Y") +
49 "\n\n"
50 "But if we want to apply this to only the first (0th) dimension of " +
51 PRINT_DATASET("X") + ", we could instead run"
52 "\n\n" +
53 PRINT_CALL("preprocess_binarize", "input", "X", "threshold", 5.0,
54 "dimension", 0, "output", "Y"));
55
56 // See also...
57 BINDING_SEE_ALSO("@preprocess_describe", "#preprocess_describe");
58 BINDING_SEE_ALSO("@preprocess_imputer", "#preprocess_imputer");
59 BINDING_SEE_ALSO("@preprocess_split", "#preprocess_split");
60
61 // Define parameters for data.
62 PARAM_MATRIX_IN_REQ("input", "Input data matrix.", "i");
63 // Define optional parameters.
64 PARAM_MATRIX_OUT("output", "Matrix in which to save the output.", "o");
65 PARAM_INT_IN("dimension", "Dimension to apply the binarization. If not set, the"
66 " program will binarize every dimension by default.", "d", 0);
67 PARAM_DOUBLE_IN("threshold", "Threshold to be applied for binarization. If not "
68 "set, the threshold defaults to 0.0.", "t", 0.0);
69
70 using namespace mlpack;
71 using namespace mlpack::util;
72 using namespace arma;
73 using namespace std;
74
mlpackMain()75 static void mlpackMain()
76 {
77 const size_t dimension = (size_t) IO::GetParam<int>("dimension");
78 const double threshold = IO::GetParam<double>("threshold");
79
80 // Check on data parameters.
81 if (!IO::HasParam("dimension"))
82 {
83 Log::Warn << "You did not specify " << PRINT_PARAM_STRING("dimension")
84 << ", so the program will perform binarization on every dimension."
85 << endl;
86 }
87
88 if (!IO::HasParam("threshold"))
89 {
90 Log::Warn << "You did not specify " << PRINT_PARAM_STRING("threshold")
91 << ", so the threshold will be automatically set to '0.0'." << endl;
92 }
93
94 RequireAtLeastOnePassed({ "output" }, false, "no output will be saved");
95
96 // Load the data.
97 arma::mat input = std::move(IO::GetParam<arma::mat>("input"));
98 arma::mat output;
99
100 RequireParamValue<int>("dimension", [](int x) { return x >= 0; }, true,
101 "dimension to binarize must be nonnegative");
102 std::ostringstream error;
103 error << "dimension to binarize must be less than the number of dimensions "
104 << "of the input data (" << input.n_rows << ")";
105 RequireParamValue<int>("dimension",
106 [input](int x) { return size_t(x) < input.n_rows; }, true, error.str());
107
108 Timer::Start("binarize");
109 if (IO::HasParam("dimension"))
110 {
111 data::Binarize<double>(input, output, threshold, dimension);
112 }
113 else
114 {
115 // Binarize the whole dataset.
116 data::Binarize<double>(input, output, threshold);
117 }
118 Timer::Stop("binarize");
119
120 if (IO::HasParam("output"))
121 IO::GetParam<arma::mat>("output") = std::move(output);
122 }
123