1
2 // =================================================================================================
3 // This file is part of the CLTune project, which loosely follows the Google C++ styleguide and uses
4 // a tab-size of two spaces and a max-width of 100 characters per line.
5 //
6 // Author: cedric.nugteren@surfsara.nl (Cedric Nugteren)
7 //
8 // This file implements the NeuralNetwork class (see the header for information about the class).
9 //
10 // -------------------------------------------------------------------------------------------------
11 //
12 // Copyright 2014 SURFsara
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License");
15 // you may not use this file except in compliance with the License.
16 // You may obtain a copy of the License at
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
23 // See the License for the specific language governing permissions and
24 // limitations under the License.
25 //
26 // =================================================================================================
27
28 // The corresponding header file
29 #include "internal/ml_models/neural_network.h"
30
31 #include <vector>
32 #include <cmath>
33 #include <random>
34 #include <exception>
35 #include <chrono>
36
37 namespace cltune {
38 // =================================================================================================
39
40 // Calls the base-class constructor
41 template <typename T>
NeuralNetwork(const size_t learning_iterations,const T learning_rate,const T lambda,const std::vector<size_t> & layer_sizes,const bool debug_display)42 NeuralNetwork<T>::NeuralNetwork(const size_t learning_iterations, const T learning_rate,
43 const T lambda, const std::vector<size_t> &layer_sizes,
44 const bool debug_display):
45 MLModel<T>(debug_display),
46 num_layers_(layer_sizes.size()),
47 layer_sizes_(layer_sizes),
48 learning_iterations_(learning_iterations),
49 learning_rate_(learning_rate),
50 lambda_(lambda) {
51 if (num_layers_ != 3) { throw std::runtime_error("Only supporting networks with 3 layers"); }
52 }
53
54 // =================================================================================================
55
56 // Trains the model
57 template <typename T>
Train(const std::vector<std::vector<T>> & x,const std::vector<T> & y)58 void NeuralNetwork<T>::Train(const std::vector<std::vector<T>> &x, const std::vector<T> &y) {
59 auto x_temp = x;
60 auto y_temp = y;
61
62 // Modifies data to get a better model
63 ComputeNormalizations(x_temp);
64 PreProcessFeatures(x_temp);
65 PreProcessExecutionTimes(y_temp);
66
67 // Runs gradient descent to train the model
68 GradientDescent(x_temp, y_temp, learning_rate_, lambda_, learning_iterations_);
69
70 // Verifies and displays the trained results
71 auto cost = Verify(x_temp, y_temp);
72 printf("%s Training cost: %.2e\n", TunerImpl::kMessageResult.c_str(), cost);
73 }
74
75 // Validates the model
76 template <typename T>
Validate(const std::vector<std::vector<T>> & x,const std::vector<T> & y)77 void NeuralNetwork<T>::Validate(const std::vector<std::vector<T>> &x, const std::vector<T> &y) {
78 auto x_temp = x;
79 auto y_temp = y;
80
81 // Modifies validation data in the same way as the training data
82 PreProcessFeatures(x_temp);
83 PreProcessExecutionTimes(y_temp);
84
85 // Verifies and displays the trained results
86 auto cost = Verify(x_temp, y_temp);
87 printf("%s Validation cost: %.2e\n", TunerImpl::kMessageResult.c_str(), cost);
88 }
89
90 // Prediction: pre-processe a single sample and pass it through the model
91 template <typename T>
Predict(const std::vector<T> & x) const92 T NeuralNetwork<T>::Predict(const std::vector<T> &x) const {
93 auto x_preprocessed = std::vector<std::vector<T>>{x};
94 PreProcessFeatures(x_preprocessed);
95 return PostProcessExecutionTime(Hypothesis(x_preprocessed[0]));
96 }
97
98 // =================================================================================================
99
100 // Pre-processes the features based on normalization data
101 template <typename T>
PreProcessFeatures(std::vector<std::vector<T>> & x) const102 void NeuralNetwork<T>::PreProcessFeatures(std::vector<std::vector<T>> &x) const {
103 NormalizeFeatures(x);
104 }
105
106 // Pre-processes the execution times using a logarithmic function
107 template <typename T>
PreProcessExecutionTimes(std::vector<T> & y) const108 void NeuralNetwork<T>::PreProcessExecutionTimes(std::vector<T> &y) const {
109 for (auto &value: y) { value = static_cast<T>(log(static_cast<double>(value))); }
110 }
111
112 // Post-processes an execution time using an exponent function (inverse of the logarithm)
113 template <typename T>
PostProcessExecutionTime(T value) const114 T NeuralNetwork<T>::PostProcessExecutionTime(T value) const {
115 return static_cast<T>(exp(static_cast<double>(value)));
116 }
117
118 // =================================================================================================
119
120 // Initialization-function: sets the initial weights theta
121 template <typename T>
InitializeTheta(const size_t n)122 void NeuralNetwork<T>::InitializeTheta(const size_t n) {
123
124 // Resizes the weight matrices theta
125 if (layer_sizes_[0] != n) { throw std::runtime_error("Invalid size of the first layer"); }
126 if (layer_sizes_[2] != 1) { throw std::runtime_error("Invalid size of the third layer"); }
127 theta1_.resize((layer_sizes_[0]+1)*layer_sizes_[1]);
128 theta2_.resize((layer_sizes_[1]+1)*layer_sizes_[2]);
129
130 // Calculates the random-initialization range
131 auto epsilon1 = static_cast<T>(sqrt(static_cast<T>(6))/sqrt(static_cast<T>(layer_sizes_[0]+layer_sizes_[1])));
132 auto epsilon2 = static_cast<T>(sqrt(static_cast<T>(6))/sqrt(static_cast<T>(layer_sizes_[1]+layer_sizes_[2])));
133
134 // Creates a random number generator
135 const auto random_seed = std::chrono::system_clock::now().time_since_epoch().count();
136 std::default_random_engine generator(static_cast<unsigned int>(random_seed));
137 std::uniform_real_distribution<T> distribution1(-epsilon1, epsilon1);
138 std::uniform_real_distribution<T> distribution2(-epsilon2, epsilon2);
139
140 // Fills the weights with random values
141 for (auto &weight: theta1_) { weight = distribution1(generator); }
142 for (auto &weight: theta2_) { weight = distribution2(generator); }
143 }
144
145 // =================================================================================================
146
147 // Hypothesis-function: pass a single sample through the model and returns its hypothesis
148 // TODO: Memory usage and performance can be improved
149 template <typename T>
Hypothesis(const std::vector<T> & x) const150 T NeuralNetwork<T>::Hypothesis(const std::vector<T> &x) const {
151
152 // Adds a bias to the input data
153 auto a0 = FeedForward0(x);
154
155 // Performs the activations of the first hidden layer
156 auto a1 = FeedForward1(a0, true);
157
158 // Performs the activations of the output layer
159 auto a2 = FeedForward2(a1);
160
161 // Only one output supported
162 if (layer_sizes_[2] != 1) { throw std::runtime_error("Invalid size of the third layer"); }
163 auto hypothesis = a2[0];
164 return hypothesis;
165 }
166
167 // Cost-function: computes the sum of squared differences
168 template <typename T>
Cost(const size_t m,const size_t,const T lambda,const std::vector<std::vector<T>> & x,const std::vector<T> & y) const169 T NeuralNetwork<T>::Cost(const size_t m, const size_t, const T lambda,
170 const std::vector<std::vector<T>> &x, const std::vector<T> &y) const {
171
172 // Computes the sum of squared differences
173 auto cost = static_cast<T>(0);
174 for (auto mid=size_t{0}; mid<m; ++mid) {
175 auto difference = Hypothesis(x[mid]) - y[mid];
176 cost += difference * difference;
177 }
178 cost /= static_cast<T>(m);
179
180 // Computes the squared sum of theta's (not counting theta-zero) for the regularization term
181 auto theta_squared_sum = static_cast<T>(0);
182 for (auto id1=size_t{0}; id1<layer_sizes_[1]; ++id1) {
183 for (auto id0=size_t{1}; id0<layer_sizes_[0]+1; ++id0) {
184 auto value = theta1_[id1*(layer_sizes_[0]+1) + id0];
185 theta_squared_sum += value * value;
186 }
187 }
188 for (auto id2=size_t{0}; id2<layer_sizes_[2]; ++id2) {
189 for (auto id1=size_t{1}; id1<layer_sizes_[1]+1; ++id1) {
190 auto value = theta2_[id2*(layer_sizes_[1]+1) + id1];
191 theta_squared_sum += value * value;
192 }
193 }
194
195 // Computes the final cost
196 return cost + (lambda*theta_squared_sum) / (static_cast<T>(2 * m));
197 }
198
199 // Gradient-function: computes the gradient of the cost-function
200 template <typename T>
Gradient(const size_t m,const size_t,const T lambda,const T alpha,const std::vector<std::vector<T>> & x,const std::vector<T> & y)201 void NeuralNetwork<T>::Gradient(const size_t m, const size_t, const T lambda, const T alpha,
202 const std::vector<std::vector<T>> &x, const std::vector<T> &y) {
203
204 // Temporary data storage for the gradients
205 auto gradient1 = std::vector<T>(theta1_.size(), static_cast<T>(0));
206 auto gradient2 = std::vector<T>(theta2_.size(), static_cast<T>(0));
207
208 // Loops over all samples of the training data
209 for (auto mid=size_t{0}; mid<m; ++mid) {
210
211 // Performs the feed-forward computations (with and without sigmoid)
212 auto a0 = FeedForward0(x[mid]);
213 auto z1 = FeedForward1(a0, false);
214 auto a1 = FeedForward1(a0, true);
215 auto a2 = FeedForward2(a1);
216
217 // Computes the error at the last layer
218 auto d2 = std::vector<T>(layer_sizes_[2]);
219 if (layer_sizes_[2] != 1) { throw std::runtime_error("Invalid size of the third layer"); }
220 d2[0] = a2[0] - y[mid];
221
222 // Propagates the error back (backpropagation) to compute the delta at the hidden layer
223 auto d1 = std::vector<T>(layer_sizes_[1]);
224 for (auto id1=size_t{1}; id1<layer_sizes_[1]+1; ++id1) {
225 auto value = static_cast<T>(0);
226 for (auto id2=size_t{0}; id2<layer_sizes_[2]; ++id2) {
227 value += d2[id2] * theta2_[id2*(layer_sizes_[1]+1) + id1];
228 }
229 d1[id1-1] = value * SigmoidGradient(z1[id1]);
230 }
231
232 // Accumulates the partial gradients
233 for (auto id0=size_t{0}; id0<layer_sizes_[0]+1; ++id0) {
234 for (auto id1=size_t{0}; id1<layer_sizes_[1]; ++id1) {
235 gradient1[id1*(layer_sizes_[0]+1) + id0] += d1[id1] * a0[id0];
236 }
237 }
238 for (auto id1=size_t{0}; id1<layer_sizes_[1]+1; ++id1) {
239 for (auto id2=size_t{0}; id2<layer_sizes_[2]; ++id2) {
240 gradient2[id2*(layer_sizes_[1]+1) + id1] += d2[id2] * a1[id1];
241 }
242 }
243 }
244
245 // Computes the final gradients, adding regularization
246 for (auto id0=size_t{0}; id0<layer_sizes_[0]+1; ++id0) {
247 for (auto id1=size_t{0}; id1<layer_sizes_[1]; ++id1) {
248 auto index = id1*(layer_sizes_[0]+1) + id0;
249 if (id0 != 0) { // Don't add regularization for the bias term
250 gradient1[index] += lambda * theta1_[index];
251 }
252 gradient1[index] /= static_cast<T>(m);
253 }
254 }
255 for (auto id1=size_t{0}; id1<layer_sizes_[1]+1; ++id1) {
256 for (auto id2=size_t{0}; id2<layer_sizes_[2]; ++id2) {
257 auto index = id2*(layer_sizes_[1]+1) + id1;
258 if (id1 != 0) { // Don't add regularization for the bias term
259 gradient2[index] += lambda * theta2_[index];
260 }
261 gradient2[index] /= static_cast<T>(m);
262 }
263 }
264
265 // Sets the new values of theta
266 for (auto id0=size_t{0}; id0<layer_sizes_[0]+1; ++id0) {
267 for (auto id1=size_t{0}; id1<layer_sizes_[1]; ++id1) {
268 auto index = id1*(layer_sizes_[0]+1) + id0;
269 theta1_[index] = theta1_[index] - alpha * gradient1[index];
270 }
271 }
272 for (auto id1=size_t{0}; id1<layer_sizes_[1]+1; ++id1) {
273 for (auto id2=size_t{0}; id2<layer_sizes_[2]; ++id2) {
274 auto index = id2*(layer_sizes_[1]+1) + id1;
275 theta2_[index] = theta2_[index] - alpha * gradient2[index];
276 }
277 }
278 }
279
280 // =================================================================================================
281
282 // Feed-forward function: input layer (with bias unit)
283 template <typename T>
FeedForward0(const std::vector<T> & x) const284 std::vector<T> NeuralNetwork<T>::FeedForward0(const std::vector<T> &x) const {
285 auto a0 = std::vector<T>(layer_sizes_[0]+1);
286 a0[0] = static_cast<T>(1);
287 for (auto id0=size_t{0}; id0<layer_sizes_[0]; ++id0) {
288 a0[id0 + 1] = x[id0];
289 }
290 return a0;
291 }
292
293 // Feed-forward function: hidden layer (with bias unit and optionally a sigmoid activation function)
294 template <typename T>
FeedForward1(const std::vector<T> & a0,const bool sigmoid) const295 std::vector<T> NeuralNetwork<T>::FeedForward1(const std::vector<T> &a0, const bool sigmoid) const {
296 auto a1 = std::vector<T>(layer_sizes_[1]+1);
297 a1[0] = static_cast<T>(1);
298 for (auto id1=size_t{0}; id1<layer_sizes_[1]; ++id1) {
299 auto z1 = static_cast<T>(0);
300 for (auto id0=size_t{0}; id0<layer_sizes_[0]+1; ++id0) {
301 z1 += a0[id0] * theta1_[id1*(layer_sizes_[0]+1) + id0];
302 }
303 a1[id1 + 1] = (sigmoid) ? Sigmoid(z1) : z1;
304 }
305 return a1;
306 }
307
308 // Feed-forward function: output layer
309 template <typename T>
FeedForward2(const std::vector<T> & a1) const310 std::vector<T> NeuralNetwork<T>::FeedForward2(const std::vector<T> &a1) const {
311 auto a2 = std::vector<T>(layer_sizes_[2]);
312 for (auto id2=size_t{0}; id2<layer_sizes_[2]; ++id2) {
313 auto z2 = static_cast<T>(0);
314 for (auto id1=size_t{0}; id1<layer_sizes_[1]+1; ++id1) {
315 z2 += a1[id1] * theta2_[id2*(layer_sizes_[1]+1) + id1];
316 }
317 a2[id2] = z2; // No sigmoid activation function in the output layer
318 }
319 return a2;
320 }
321
322 // =================================================================================================
323
324 // Compiles the class
325 template class NeuralNetwork<float>;
326
327 // =================================================================================================
328 } // namespace cltune
329