1 /**
2  * @file methods/linear_regression/linear_regression_main.cpp
3  * @author James Cline
4  *
5  * Main function for least-squares linear regression.
6  *
7  * mlpack is free software; you may redistribute it and/or modify it under the
8  * terms of the 3-clause BSD license.  You should have received a copy of the
9  * 3-clause BSD license along with mlpack.  If not, see
10  * http://www.opensource.org/licenses/BSD-3-Clause for more information.
11  */
12 #include <mlpack/prereqs.hpp>
13 #include <mlpack/core/util/io.hpp>
14 #include <mlpack/core/util/mlpack_main.hpp>
15 
16 #include "linear_regression.hpp"
17 
18 using namespace mlpack;
19 using namespace mlpack::regression;
20 using namespace mlpack::util;
21 using namespace arma;
22 using namespace std;
23 
24 // Program Name.
25 BINDING_NAME("Simple Linear Regression and Prediction");
26 
27 // Short description.
28 BINDING_SHORT_DESC(
29     "An implementation of simple linear regression and ridge regression using "
30     "ordinary least squares.  Given a dataset and responses, a model can be "
31     "trained and saved for later use, or a pre-trained model can be used to "
32     "output regression predictions for a test set.");
33 
34 // Long description.
35 BINDING_LONG_DESC(
36     "An implementation of simple linear regression and simple ridge regression "
37     "using ordinary least squares. This solves the problem"
38     "\n\n"
39     "  y = X * b + e"
40     "\n\n"
41     "where X (specified by " + PRINT_PARAM_STRING("training") + ") and y "
42     "(specified either as the last column of the input matrix " +
43     PRINT_PARAM_STRING("training") + " or via the " +
44     PRINT_PARAM_STRING("training_responses") + " parameter) are known and b is"
45     " the desired variable.  If the covariance matrix (X'X) is not invertible, "
46     "or if the solution is overdetermined, then specify a Tikhonov "
47     "regularization constant (with " + PRINT_PARAM_STRING("lambda") + ") "
48     "greater than 0, which will regularize the covariance matrix to make it "
49     "invertible.  The calculated b may be saved with the " +
50     PRINT_PARAM_STRING("output_predictions") + " output parameter."
51     "\n\n"
52     "Optionally, the calculated value of b is used to predict the responses for"
53     " another matrix X' (specified by the " + PRINT_PARAM_STRING("test") + " "
54     "parameter):"
55     "\n\n"
56     "   y' = X' * b"
57     "\n\n"
58     "and the predicted responses y' may be saved with the " +
59     PRINT_PARAM_STRING("output_predictions") + " output parameter.  This type "
60     "of regression is related to least-angle regression, which mlpack "
61     "implements as the 'lars' program.");
62 
63 // Example.
64 BINDING_EXAMPLE(
65     "For example, to run a linear regression on the dataset " +
66     PRINT_DATASET("X") + " with responses " + PRINT_DATASET("y") + ", saving "
67     "the trained model to " + PRINT_MODEL("lr_model") + ", the following "
68     "command could be used:"
69     "\n\n" +
70     PRINT_CALL("linear_regression", "training", "X", "training_responses", "y",
71         "output_model", "lr_model") +
72     "\n\n"
73     "Then, to use " + PRINT_MODEL("lr_model") + " to predict responses for a "
74     "test set " + PRINT_DATASET("X_test") + ", saving the predictions to " +
75     PRINT_DATASET("X_test_responses") + ", the following command could be "
76     "used:"
77     "\n\n" +
78     PRINT_CALL("linear_regression", "input_model", "lr_model", "test", "X_test",
79         "output_predictions", "X_test_responses"));
80 
81 // See also...
82 BINDING_SEE_ALSO("Linear/ridge regression tutorial",
83        "@doxygen/lrtutorial.html");
84 BINDING_SEE_ALSO("@lars", "#lars");
85 BINDING_SEE_ALSO("Linear regression on Wikipedia",
86         "https://en.wikipedia.org/wiki/Linear_regression");
87 BINDING_SEE_ALSO("mlpack::regression::LinearRegression C++ class "
88         "documentation",
89         "@doxygen/classmlpack_1_1regression_1_1LinearRegression.html");
90 
91 PARAM_MATRIX_IN("training", "Matrix containing training set X (regressors).",
92     "t");
93 PARAM_ROW_IN("training_responses", "Optional vector containing y "
94     "(responses). If not given, the responses are assumed to be the last row "
95     "of the input file.", "r");
96 
97 PARAM_MODEL_IN(LinearRegression, "input_model", "Existing LinearRegression "
98     "model to use.", "m");
99 PARAM_MODEL_OUT(LinearRegression, "output_model", "Output LinearRegression "
100     "model.", "M");
101 
102 PARAM_MATRIX_IN("test", "Matrix containing X' (test regressors).", "T");
103 
104 // This is the future name of the parameter.
105 PARAM_ROW_OUT("output_predictions", "If --test_file is specified, this "
106     "matrix is where the predicted responses will be saved.", "o");
107 
108 PARAM_DOUBLE_IN("lambda", "Tikhonov regularization for ridge regression.  If 0,"
109     " the method reduces to linear regression.", "l", 0.0);
110 
mlpackMain()111 static void mlpackMain()
112 {
113   const double lambda = IO::GetParam<double>("lambda");
114 
115   RequireOnlyOnePassed({ "training", "input_model" }, true);
116 
117   ReportIgnoredParam({{ "test", true }}, "output_predictions");
118 
119   mat regressors;
120   rowvec responses;
121 
122   LinearRegression* lr;
123 
124   const bool computeModel = !IO::HasParam("input_model");
125   const bool computePrediction = IO::HasParam("test");
126 
127   // If they specified a model file, we also need a test file or we
128   // have nothing to do.
129   if (!computeModel)
130   {
131     RequireAtLeastOnePassed({ "test" }, true, "test points must be specified "
132         "when an input model is given");
133   }
134 
135   ReportIgnoredParam({{ "input_model", true }}, "lambda");
136 
137   RequireAtLeastOnePassed({ "output_model", "output_predictions" }, false,
138       "no output will be saved");
139 
140   // An input file was given and we need to generate the model.
141   if (computeModel)
142   {
143     Timer::Start("load_regressors");
144     regressors = std::move(IO::GetParam<mat>("training"));
145     Timer::Stop("load_regressors");
146 
147     // Are the responses in a separate file?
148     if (!IO::HasParam("training_responses"))
149     {
150       // The initial predictors for y, Nx1.
151       if (regressors.n_rows < 2)
152       {
153         Log::Fatal << "Can't get responses from training data "
154             "since it has less than 2 rows." << endl;
155       }
156       responses = regressors.row(regressors.n_rows - 1);
157       regressors.shed_row(regressors.n_rows - 1);
158     }
159     else
160     {
161       // The initial predictors for y, Nx1.
162       Timer::Start("load_responses");
163       responses = IO::GetParam<rowvec>("training_responses");
164       Timer::Stop("load_responses");
165 
166       if (responses.n_cols != regressors.n_cols)
167       {
168         Log::Fatal << "The responses must have the same number of columns "
169             "as the training set." << endl;
170       }
171     }
172 
173     Timer::Start("regression");
174     lr = new LinearRegression(regressors, responses, lambda);
175     Timer::Stop("regression");
176   }
177   else
178   {
179     // A model file was passed in, so load it.
180     Timer::Start("load_model");
181     lr = IO::GetParam<LinearRegression*>("input_model");
182     Timer::Stop("load_model");
183   }
184 
185   // Did we want to predict, too?
186   if (computePrediction)
187   {
188     // Cache the output of GetPrintableParam before we std::move() the test
189     // matrix.  Loading actually will happen during GetPrintableParam() since
190     // that needs to load to print the size.
191     Timer::Start("load_test_points");
192     std::ostringstream oss;
193     oss << IO::GetPrintableParam<mat>("test");
194     std::string testOutput = oss.str();
195     Timer::Stop("load_test_points");
196 
197     mat points = std::move(IO::GetParam<mat>("test"));
198 
199     // Ensure that test file data has the right number of features.
200     if ((lr->Parameters().n_elem - 1) != points.n_rows)
201     {
202       // If we built the model, nothing will free it so we have to...
203       const size_t dimensions = lr->Parameters().n_elem - 1;
204       if (computeModel)
205         delete lr;
206 
207       Log::Fatal << "The model was trained on " << dimensions << "-dimensional "
208           << "data, but the test points in '" << testOutput << "' are "
209           << points.n_rows << "-dimensional!" << endl;
210     }
211 
212     // Perform the predictions using our model.
213     rowvec predictions;
214     Timer::Start("prediction");
215     lr->Predict(points, predictions);
216     Timer::Stop("prediction");
217 
218     // Save predictions.
219     IO::GetParam<rowvec>("output_predictions") = std::move(predictions);
220   }
221 
222   // Save the model if needed.
223   IO::GetParam<LinearRegression*>("output_model") = lr;
224 }
225