// =================================================================================================
// This file is part of the CLTune project, which loosely follows the Google C++ styleguide and uses
// a tab-size of two spaces and a max-width of 100 characters per line.
//
// Author: cedric.nugteren@surfsara.nl (Cedric Nugteren)
//
// This file implements the NeuralNetwork class (see the header for information about the class).
//
// -------------------------------------------------------------------------------------------------
//
// Copyright 2014 SURFsara
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// =================================================================================================

// The corresponding header file
#include "internal/ml_models/neural_network.h"

#include <vector>
#include <cmath>
#include <random>
#include <exception>
#include <chrono>

namespace cltune {
// =================================================================================================

// Calls the base-class constructor
template <typename T>
NeuralNetwork<T>::NeuralNetwork(const size_t learning_iterations, const T learning_rate,
                                const T lambda, const std::vector<size_t> &layer_sizes,
                                const bool debug_display):
    MLModel<T>(debug_display),
    num_layers_(layer_sizes.size()),
    layer_sizes_(layer_sizes),
    learning_iterations_(learning_iterations),
    learning_rate_(learning_rate),
    lambda_(lambda) {
  if (num_layers_ != 3) { throw std::runtime_error("Only supporting networks with 3 layers"); }
}

// =================================================================================================

// Trains the model
template <typename T>
void NeuralNetwork<T>::Train(const std::vector<std::vector<T>> &x, const std::vector<T> &y) {
  auto x_temp = x;
  auto y_temp = y;

  // Modifies data to get a better model
  ComputeNormalizations(x_temp);
  PreProcessFeatures(x_temp);
  PreProcessExecutionTimes(y_temp);

  // Runs gradient descent to train the model
  GradientDescent(x_temp, y_temp, learning_rate_, lambda_, learning_iterations_);

  // Verifies and displays the trained results
  auto cost = Verify(x_temp, y_temp);
  printf("%s Training cost: %.2e\n", TunerImpl::kMessageResult.c_str(), cost);
}

// Validates the model
template <typename T>
void NeuralNetwork<T>::Validate(const std::vector<std::vector<T>> &x, const std::vector<T> &y) {
  auto x_temp = x;
  auto y_temp = y;

  // Modifies validation data in the same way as the training data
  PreProcessFeatures(x_temp);
  PreProcessExecutionTimes(y_temp);

  // Verifies and displays the trained results
  auto cost = Verify(x_temp, y_temp);
  printf("%s Validation cost: %.2e\n", TunerImpl::kMessageResult.c_str(), cost);
}

// Prediction: pre-processes a single sample and passes it through the model
template <typename T>
T NeuralNetwork<T>::Predict(const std::vector<T> &x) const {
  auto x_preprocessed = std::vector<std::vector<T>>{x};
  PreProcessFeatures(x_preprocessed);
  return PostProcessExecutionTime(Hypothesis(x_preprocessed[0]));
}

// =================================================================================================

// Pre-processes the features based on normalization data
template <typename T>
void NeuralNetwork<T>::PreProcessFeatures(std::vector<std::vector<T>> &x) const {
  NormalizeFeatures(x);
}

// Pre-processes the execution times using a logarithmic function
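// (the inverse, an exponential, is applied in PostProcessExecutionTime below when predicting)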
template <typename T>
void NeuralNetwork<T>::PreProcessExecutionTimes(std::vector<T> &y) const {
  for (auto &value: y) { value = static_cast<T>(log(static_cast<double>(value))); }
}

// Post-processes an execution time using an exponential function (the inverse of the logarithm)
template <typename T>
T NeuralNetwork<T>::PostProcessExecutionTime(T value) const {
  return static_cast<T>(exp(static_cast<double>(value)));
}

// =================================================================================================

// Initialization-function: sets the initial weights theta
template <typename T>
void NeuralNetwork<T>::InitializeTheta(const size_t n) {

  // Resizes the weight matrices theta
  if (layer_sizes_[0] != n) { throw std::runtime_error("Invalid size of the first layer"); }
  if (layer_sizes_[2] != 1) { throw std::runtime_error("Invalid size of the third layer"); }
  theta1_.resize((layer_sizes_[0]+1)*layer_sizes_[1]);
  theta2_.resize((layer_sizes_[1]+1)*layer_sizes_[2]);

  // Calculates the random-initialization range
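  // (i.e. epsilon = sqrt(6 / (fan_in + fan_out)), matching the commonly-used Glorot/Xavier-style
  //  uniform initialization range)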
  auto epsilon1 = static_cast<T>(sqrt(static_cast<T>(6)) /
                                 sqrt(static_cast<T>(layer_sizes_[0]+layer_sizes_[1])));
  auto epsilon2 = static_cast<T>(sqrt(static_cast<T>(6)) /
                                 sqrt(static_cast<T>(layer_sizes_[1]+layer_sizes_[2])));

  // Creates a random number generator
  const auto random_seed = std::chrono::system_clock::now().time_since_epoch().count();
  std::default_random_engine generator(static_cast<unsigned int>(random_seed));
  std::uniform_real_distribution<T> distribution1(-epsilon1, epsilon1);
  std::uniform_real_distribution<T> distribution2(-epsilon2, epsilon2);

  // Fills the weights with random values
  for (auto &weight: theta1_) { weight = distribution1(generator); }
  for (auto &weight: theta2_) { weight = distribution2(generator); }
}

// =================================================================================================

// Hypothesis-function: passes a single sample through the model and returns its hypothesis
// TODO: Memory usage and performance can be improved
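// The hypothesis is h(x) = theta2 * [1; sigmoid(theta1 * [1; x])]: a single sigmoid hidden layer
// followed by a linear output unit (bias units are prepended by the FeedForward functions)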
template <typename T>
T NeuralNetwork<T>::Hypothesis(const std::vector<T> &x) const {

  // Adds a bias to the input data
  auto a0 = FeedForward0(x);

  // Performs the activations of the first hidden layer
  auto a1 = FeedForward1(a0, true);

  // Performs the activations of the output layer
  auto a2 = FeedForward2(a1);

  // Only one output supported
  if (layer_sizes_[2] != 1) { throw std::runtime_error("Invalid size of the third layer"); }
  auto hypothesis = a2[0];
  return hypothesis;
}

// Cost-function: computes the mean of squared differences plus an L2 regularization term
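// In formula form: J = (1/m) * sum_i (h(x_i) - y_i)^2 + (lambda / (2*m)) * sum_j theta_j^2,
// where the sum over theta excludes the bias weights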
template <typename T>
T NeuralNetwork<T>::Cost(const size_t m, const size_t, const T lambda,
                         const std::vector<std::vector<T>> &x, const std::vector<T> &y) const {

  // Computes the sum of squared differences
  auto cost = static_cast<T>(0);
  for (auto mid=size_t{0}; mid<m; ++mid) {
    auto difference = Hypothesis(x[mid]) - y[mid];
    cost += difference * difference;
  }
  cost /= static_cast<T>(m);

  // Computes the squared sum of theta's (not counting theta-zero) for the regularization term
  auto theta_squared_sum = static_cast<T>(0);
  for (auto id1=size_t{0}; id1<layer_sizes_[1]; ++id1) {
    for (auto id0=size_t{1}; id0<layer_sizes_[0]+1; ++id0) {
      auto value = theta1_[id1*(layer_sizes_[0]+1) + id0];
      theta_squared_sum += value * value;
    }
  }
  for (auto id2=size_t{0}; id2<layer_sizes_[2]; ++id2) {
    for (auto id1=size_t{1}; id1<layer_sizes_[1]+1; ++id1) {
      auto value = theta2_[id2*(layer_sizes_[1]+1) + id1];
      theta_squared_sum += value * value;
    }
  }

  // Computes the final cost
  return cost + (lambda*theta_squared_sum) / (static_cast<T>(2 * m));
}

// Gradient-function: computes the gradient of the cost-function
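// Backpropagation: d2 = a2 - y at the output, d1 = (theta2^T * d2) .* sigmoid'(z1) at the hidden
// layer; the gradients are the accumulated products d1 * a0 and d2 * a1 plus lambda * theta
// (bias weights excluded), divided by m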
template <typename T>
void NeuralNetwork<T>::Gradient(const size_t m, const size_t, const T lambda, const T alpha,
                                const std::vector<std::vector<T>> &x, const std::vector<T> &y) {

  // Temporary data storage for the gradients
  auto gradient1 = std::vector<T>(theta1_.size(), static_cast<T>(0));
  auto gradient2 = std::vector<T>(theta2_.size(), static_cast<T>(0));

  // Loops over all samples of the training data
  for (auto mid=size_t{0}; mid<m; ++mid) {

    // Performs the feed-forward computations (with and without sigmoid)
    auto a0 = FeedForward0(x[mid]);
    auto z1 = FeedForward1(a0, false);
    auto a1 = FeedForward1(a0, true);
    auto a2 = FeedForward2(a1);

    // Computes the error at the last layer
    auto d2 = std::vector<T>(layer_sizes_[2]);
    if (layer_sizes_[2] != 1) { throw std::runtime_error("Invalid size of the third layer"); }
    d2[0] = a2[0] - y[mid];

    // Propagates the error back (backpropagation) to compute the delta at the hidden layer
    auto d1 = std::vector<T>(layer_sizes_[1]);
    for (auto id1=size_t{1}; id1<layer_sizes_[1]+1; ++id1) {
      auto value = static_cast<T>(0);
      for (auto id2=size_t{0}; id2<layer_sizes_[2]; ++id2) {
        value += d2[id2] * theta2_[id2*(layer_sizes_[1]+1) + id1];
      }
      d1[id1-1] = value * SigmoidGradient(z1[id1]);
    }

    // Accumulates the partial gradients
    for (auto id0=size_t{0}; id0<layer_sizes_[0]+1; ++id0) {
      for (auto id1=size_t{0}; id1<layer_sizes_[1]; ++id1) {
        gradient1[id1*(layer_sizes_[0]+1) + id0] += d1[id1] * a0[id0];
      }
    }
    for (auto id1=size_t{0}; id1<layer_sizes_[1]+1; ++id1) {
      for (auto id2=size_t{0}; id2<layer_sizes_[2]; ++id2) {
        gradient2[id2*(layer_sizes_[1]+1) + id1] += d2[id2] * a1[id1];
      }
    }
  }

  // Computes the final gradients, adding regularization
  for (auto id0=size_t{0}; id0<layer_sizes_[0]+1; ++id0) {
    for (auto id1=size_t{0}; id1<layer_sizes_[1]; ++id1) {
      auto index = id1*(layer_sizes_[0]+1) + id0;
      if (id0 != 0) { // Don't add regularization for the bias term
        gradient1[index] += lambda * theta1_[index];
      }
      gradient1[index] /= static_cast<T>(m);
    }
  }
  for (auto id1=size_t{0}; id1<layer_sizes_[1]+1; ++id1) {
    for (auto id2=size_t{0}; id2<layer_sizes_[2]; ++id2) {
      auto index = id2*(layer_sizes_[1]+1) + id1;
      if (id1 != 0) { // Don't add regularization for the bias term
        gradient2[index] += lambda * theta2_[index];
      }
      gradient2[index] /= static_cast<T>(m);
    }
  }

  // Sets the new values of theta
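  // (the standard gradient-descent update: theta := theta - alpha * gradient)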
  for (auto id0=size_t{0}; id0<layer_sizes_[0]+1; ++id0) {
    for (auto id1=size_t{0}; id1<layer_sizes_[1]; ++id1) {
      auto index = id1*(layer_sizes_[0]+1) + id0;
      theta1_[index] = theta1_[index] - alpha * gradient1[index];
    }
  }
  for (auto id1=size_t{0}; id1<layer_sizes_[1]+1; ++id1) {
    for (auto id2=size_t{0}; id2<layer_sizes_[2]; ++id2) {
      auto index = id2*(layer_sizes_[1]+1) + id1;
      theta2_[index] = theta2_[index] - alpha * gradient2[index];
    }
  }
}

// =================================================================================================

// Feed-forward function: input layer (with bias unit)
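// (i.e. a0 = [1, x]: the raw input vector with a constant bias unit prepended)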
template <typename T>
std::vector<T> NeuralNetwork<T>::FeedForward0(const std::vector<T> &x) const {
  auto a0 = std::vector<T>(layer_sizes_[0]+1);
  a0[0] = static_cast<T>(1);
  for (auto id0=size_t{0}; id0<layer_sizes_[0]; ++id0) {
    a0[id0 + 1] = x[id0];
  }
  return a0;
}

// Feed-forward function: hidden layer (with bias unit and optionally a sigmoid activation function)
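// (i.e. a1 = [1, f(theta1 * a0)], where f is the sigmoid or the identity depending on 'sigmoid')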
template <typename T>
std::vector<T> NeuralNetwork<T>::FeedForward1(const std::vector<T> &a0, const bool sigmoid) const {
  auto a1 = std::vector<T>(layer_sizes_[1]+1);
  a1[0] = static_cast<T>(1);
  for (auto id1=size_t{0}; id1<layer_sizes_[1]; ++id1) {
    auto z1 = static_cast<T>(0);
    for (auto id0=size_t{0}; id0<layer_sizes_[0]+1; ++id0) {
      z1 += a0[id0] * theta1_[id1*(layer_sizes_[0]+1) + id0];
    }
    a1[id1 + 1] = (sigmoid) ? Sigmoid(z1) : z1;
  }
  return a1;
}

// Feed-forward function: output layer
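// (i.e. a2 = theta2 * a1: a plain linear combination, since the output layer has no activation)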
template <typename T>
std::vector<T> NeuralNetwork<T>::FeedForward2(const std::vector<T> &a1) const {
  auto a2 = std::vector<T>(layer_sizes_[2]);
  for (auto id2=size_t{0}; id2<layer_sizes_[2]; ++id2) {
    auto z2 = static_cast<T>(0);
    for (auto id1=size_t{0}; id1<layer_sizes_[1]+1; ++id1) {
      z2 += a1[id1] * theta2_[id2*(layer_sizes_[1]+1) + id1];
    }
    a2[id2] = z2; // No sigmoid activation function in the output layer
  }
  return a2;
}

// =================================================================================================

// Compiles the class
template class NeuralNetwork<float>;

// =================================================================================================
} // namespace cltune