/*
Copyright (c) by respective owners including Yahoo!, Microsoft, and
individual contributors. All rights reserved.  Released under a BSD (revised)
license as described in the file LICENSE.
 */
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <sstream>

#include "reductions.h"
#include "rand48.h"
#include "gd.h"

using namespace std;
using namespace LEARNER;

const float hidden_min_activation = -3;
const float hidden_max_activation = 3;
const uint32_t nn_constant = 533357803;

struct nn {
  uint32_t k;                  // number of hidden units
  loss_function* squared_loss; // surrogate loss used for hidden-unit updates
  example output_layer;        // synthetic example: hidden activations as features
  example hiddenbias;          // single-feature probe for hidden-unit bias weights
  example outputweight;        // single-feature probe for output-layer weights
  float prediction;            // finalized prediction for the current example
  size_t increment;            // base-learner weight stride between hidden units
  bool dropout;                // drop each hidden unit with probability 1/2
  uint64_t xsubi;              // dropout RNG state
  uint64_t save_xsubi;         // RNG state saved for per-pass reset under BFGS
  bool inpass;                 // pass input features through to the output layer
  bool finished_setup;         // lazy-initialization flag for finish_setup

  vw* all; // global vw state
};

#define cast_uint32_t static_cast<uint32_t>

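  // Fast approximations of 2^p, e^p, and tanh(p).  fastpow2 uses the usual
  // float bit-pattern trick, so these are approximations rather than exact
  // computations; fasttanh is what squashes the hidden-unit pre-activations
  // below.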
  static inline float
  fastpow2 (float p)
  {
    float offset = (p < 0) ? 1.0f : 0.0f;
    float clipp = (p < -126) ? -126.0f : p;
    int w = (int)clipp;
    float z = clipp - w + offset;
    union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z) ) };

    return v.f;
  }

  static inline float
  fastexp (float p)
  {
    return fastpow2 (1.442695040f * p);
  }

  static inline float
  fasttanh (float p)
  {
    return -1.0f + 2.0f / (1.0f + fastexp (-2.0f * p));
  }

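  // Lazily build the synthetic examples used to talk to the base learner:
  // output_layer carries one feature per hidden unit (plus a bias feature
  // when --inpass is off), while hiddenbias and outputweight are
  // single-feature probes used to read and initialize individual weights.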
  void finish_setup (nn& n, vw& all)
  {
    // TODO: output_layer audit

    memset (&n.output_layer, 0, sizeof (n.output_layer));
    n.output_layer.indices.push_back(nn_output_namespace);
    feature output = {1., nn_constant << all.reg.stride_shift};

    for (unsigned int i = 0; i < n.k; ++i)
      {
        n.output_layer.atomics[nn_output_namespace].push_back(output);
        ++n.output_layer.num_features;
        output.weight_index += (uint32_t)n.increment;
      }

    if (! n.inpass)
      {
        n.output_layer.atomics[nn_output_namespace].push_back(output);
        ++n.output_layer.num_features;
      }

    n.output_layer.in_use = true;

    // TODO: not correct if --noconstant
    memset (&n.hiddenbias, 0, sizeof (n.hiddenbias));
    n.hiddenbias.indices.push_back(constant_namespace);
    feature temp = {1,(uint32_t) constant};
    n.hiddenbias.atomics[constant_namespace].push_back(temp);
    n.hiddenbias.total_sum_feat_sq++;
    n.hiddenbias.l.simple.label = FLT_MAX;
    n.hiddenbias.l.simple.weight = 1;
    n.hiddenbias.in_use = true;

    memset (&n.outputweight, 0, sizeof (n.outputweight));
    n.outputweight.indices.push_back(nn_output_namespace);
    n.outputweight.atomics[nn_output_namespace].push_back(n.output_layer.atomics[nn_output_namespace][0]);
    n.outputweight.atomics[nn_output_namespace][0].x = 1;
    n.outputweight.total_sum_feat_sq++;
    n.outputweight.l.simple.label = FLT_MAX;
    n.outputweight.l.simple.weight = 1;
    n.outputweight.in_use = true;

    n.finished_setup = true;
  }

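  // Under BFGS, reset the dropout RNG state at the end of each pass,
  // presumably so that every pass draws the same sequence of dropout masks.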
  void end_pass(nn& n)
  {
    if (n.all->bfgs)
      n.xsubi = n.save_xsubi;
  }

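  // One reduction step: (1) forward pass -- predict each of the k hidden
  // units and squash with fasttanh, optionally dropping units out;
  // (2) output pass -- expose the hidden activations as features, either
  // via the synthetic output_layer example or appended to ec when --inpass
  // is set; (3) when learning, backpropagate the output gradient into the
  // non-dropped hidden units using a squared-loss surrogate.  With dropout,
  // the output/backprop phase runs twice, once per complementary mask (see
  // the CONVERSE label below).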
  template <bool is_learn>
  void predict_or_learn(nn& n, base_learner& base, example& ec)
  {
    bool shouldOutput = n.all->raw_prediction > 0;

    if (! n.finished_setup)
      finish_setup (n, *(n.all));

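    // Work on a private copy of the shared data so the temporary
    // min/max-label adjustments below do not disturb global state;
    // the original pointer is restored at the end.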
    shared_data sd;
    memcpy (&sd, n.all->sd, sizeof(shared_data));
    shared_data* save_sd = n.all->sd;
    n.all->sd = &sd;

    label_data ld = ec.l.simple;
    void (*save_set_minmax) (shared_data*, float) = n.all->set_minmax;
    float save_min_label;
    float save_max_label;
    float dropscale = n.dropout ? 2.0f : 1.0f;
    loss_function* save_loss = n.all->loss;

    float* hidden_units = (float*) alloca (n.k * sizeof (float));
    bool* dropped_out = (bool*) alloca (n.k * sizeof (bool));

    string outputString;
    stringstream outputStringStream(outputString);

    n.all->set_minmax = noop_mm;
    n.all->loss = n.squared_loss;
    save_min_label = n.all->sd->min_label;
    n.all->sd->min_label = hidden_min_activation;
    save_max_label = n.all->sd->max_label;
    n.all->sd->max_label = hidden_max_activation;

    n.hiddenbias.ft_offset = ec.ft_offset;

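    // Forward pass: for each hidden unit, nudge a zero bias weight off the
    // saddle point, record the unit's pre-activation, and flip a coin for
    // dropout.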
    for (unsigned int i = 0; i < n.k; ++i)
      {
        base.predict(n.hiddenbias, i);
        float wf = n.hiddenbias.pred.scalar;

        // avoid saddle point at 0
        if (wf == 0)
          {
            n.hiddenbias.l.simple.label = (float) (frand48 () - 0.5);
            base.learn(n.hiddenbias, i);
            n.hiddenbias.l.simple.label = FLT_MAX;
          }

        base.predict(ec, i);

        hidden_units[i] = ec.pred.scalar;

        dropped_out[i] = (n.dropout && merand48 (n.xsubi) < 0.5);

        if (shouldOutput) {
          if (i > 0) outputStringStream << ' ';
          outputStringStream << i << ':' << ec.partial_prediction << ',' << fasttanh (hidden_units[i]);
        }
      }
    n.all->loss = save_loss;
    n.all->set_minmax = save_set_minmax;
    n.all->sd->min_label = save_min_label;
    n.all->sd->max_label = save_max_label;

    bool converse = false;
    float save_partial_prediction = 0;
    float save_final_prediction = 0;
    float save_ec_loss = 0;

CONVERSE: // That's right, I'm using goto.  So sue me.

    n.output_layer.total_sum_feat_sq = 1;
    n.output_layer.sum_feat_sq[nn_output_namespace] = 1;

    n.outputweight.ft_offset = ec.ft_offset;

    n.all->set_minmax = noop_mm;
    n.all->loss = n.squared_loss;
    save_min_label = n.all->sd->min_label;
    n.all->sd->min_label = -1;
    save_max_label = n.all->sd->max_label;
    n.all->sd->max_label = 1;

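    // Load the (possibly dropped-out, dropout-scaled) tanh activations into
    // the output-layer features and nudge any zero output weight away from
    // the saddle point.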
    for (unsigned int i = 0; i < n.k; ++i)
      {
        float sigmah =
          (dropped_out[i]) ? 0.0f : dropscale * fasttanh (hidden_units[i]);
        n.output_layer.atomics[nn_output_namespace][i].x = sigmah;

        n.output_layer.total_sum_feat_sq += sigmah * sigmah;
        n.output_layer.sum_feat_sq[nn_output_namespace] += sigmah * sigmah;

        n.outputweight.atomics[nn_output_namespace][0].weight_index =
          n.output_layer.atomics[nn_output_namespace][i].weight_index;
        base.predict(n.outputweight, n.k);
        float wf = n.outputweight.pred.scalar;

        // avoid saddle point at 0
        if (wf == 0)
          {
            float sqrtk = sqrt ((float)n.k);
            n.outputweight.l.simple.label = (float) (frand48 () - 0.5) / sqrtk;
            base.learn(n.outputweight, n.k);
            n.outputweight.l.simple.label = FLT_MAX;
          }
      }

    n.all->loss = save_loss;
    n.all->set_minmax = save_set_minmax;
    n.all->sd->min_label = save_min_label;
    n.all->sd->max_label = save_max_label;

    if (n.inpass) {
      // TODO: this is not correct if there is something in the
      // nn_output_namespace but at least it will not leak memory
      // in that case

      ec.indices.push_back (nn_output_namespace);
      v_array<feature> save_nn_output_namespace = ec.atomics[nn_output_namespace];
      ec.atomics[nn_output_namespace] = n.output_layer.atomics[nn_output_namespace];
      ec.sum_feat_sq[nn_output_namespace] = n.output_layer.sum_feat_sq[nn_output_namespace];
      ec.total_sum_feat_sq += n.output_layer.sum_feat_sq[nn_output_namespace];
      if (is_learn)
        base.learn(ec, n.k);
      else
        base.predict(ec, n.k);
      n.output_layer.partial_prediction = ec.partial_prediction;
      n.output_layer.loss = ec.loss;
      ec.total_sum_feat_sq -= n.output_layer.sum_feat_sq[nn_output_namespace];
      ec.sum_feat_sq[nn_output_namespace] = 0;
      ec.atomics[nn_output_namespace] = save_nn_output_namespace;
      ec.indices.pop ();
    }
    else {
      n.output_layer.ft_offset = ec.ft_offset;
      n.output_layer.l = ec.l;
      n.output_layer.partial_prediction = 0;
      n.output_layer.example_t = ec.example_t;
      if (is_learn)
        base.learn(n.output_layer, n.k);
      else
        base.predict(n.output_layer, n.k);
      ec.l = n.output_layer.l;
    }

    n.prediction = GD::finalize_prediction (n.all->sd, n.output_layer.partial_prediction);

    if (shouldOutput) {
      outputStringStream << ' ' << n.output_layer.partial_prediction;
      n.all->print_text(n.all->raw_prediction, outputStringStream.str(), ec.tag);
    }

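    // Backpropagation: turn the output-layer gradient into a synthetic
    // squared-loss regression target (hidden_units[i] - gradhw) for each
    // live hidden unit and update it through the base learner.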
    if (is_learn && n.all->training && ld.label != FLT_MAX) {
      float gradient = n.all->loss->first_derivative(n.all->sd,
                                                     n.prediction,
                                                     ld.label);

      if (fabs (gradient) > 0) {
        n.all->loss = n.squared_loss;
        n.all->set_minmax = noop_mm;
        save_min_label = n.all->sd->min_label;
        n.all->sd->min_label = hidden_min_activation;
        save_max_label = n.all->sd->max_label;
        n.all->sd->max_label = hidden_max_activation;

        for (unsigned int i = 0; i < n.k; ++i) {
          if (! dropped_out[i]) {
            float sigmah =
              n.output_layer.atomics[nn_output_namespace][i].x / dropscale;
            float sigmahprime = dropscale * (1.0f - sigmah * sigmah);
            n.outputweight.atomics[nn_output_namespace][0].weight_index =
              n.output_layer.atomics[nn_output_namespace][i].weight_index;
            base.predict(n.outputweight, n.k);
            float nu = n.outputweight.pred.scalar;
            float gradhw = 0.5f * nu * gradient * sigmahprime;

            ec.l.simple.label = GD::finalize_prediction (n.all->sd, hidden_units[i] - gradhw);
            if (ec.l.simple.label != hidden_units[i])
              base.learn(ec, i);
          }
        }

        n.all->loss = save_loss;
        n.all->set_minmax = save_set_minmax;
        n.all->sd->min_label = save_min_label;
        n.all->sd->max_label = save_max_label;
      }
    }

    ec.l.simple.label = ld.label;

    if (! converse) {
      save_partial_prediction = n.output_layer.partial_prediction;
      save_final_prediction = n.prediction;
      save_ec_loss = n.output_layer.loss;
    }

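    // With dropout, repeat the output/backprop phase once on the
    // complementary mask so the dropped-out units also get updated; the
    // reported prediction and loss come from the first (original) mask.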
    if (n.dropout && ! converse)
      {
        for (unsigned int i = 0; i < n.k; ++i)
          {
            dropped_out[i] = ! dropped_out[i];
          }

        converse = true;
        goto CONVERSE;
      }

    ec.partial_prediction = save_partial_prediction;
    ec.pred.scalar = save_final_prediction;
    ec.loss = save_ec_loss;

    n.all->sd = save_sd;
    n.all->set_minmax (n.all->sd, sd.min_label);
    n.all->set_minmax (n.all->sd, sd.max_label);
  }

  void finish_example(vw& all, nn&, example& ec)
  {
    int save_raw_prediction = all.raw_prediction;
    all.raw_prediction = -1;
    return_simple_example(all, NULL, ec);
    all.raw_prediction = save_raw_prediction;
  }

  void finish(nn& n)
  {
    delete n.squared_loss;
    dealloc_example (NULL, n.output_layer);
    dealloc_example (NULL, n.hiddenbias);
    dealloc_example (NULL, n.outputweight);
  }

  base_learner* nn_setup(vw& all)
  {
    if (missing_option<size_t, true>(all, "nn", "Sigmoidal feedforward network with <k> hidden units"))
      return NULL;
    new_options(all, "Neural Network options")
      ("inpass", "Train or test sigmoidal feedforward network with input passthrough.")
      ("dropout", "Train or test sigmoidal feedforward network using dropout.")
      ("meanfield", "Train or test sigmoidal feedforward network using mean field.");
    add_options(all);
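    // Illustrative invocation using the flags defined above:
    //   vw --nn 10 --dropout --inpass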

    po::variables_map& vm = all.vm;
    nn& n = calloc_or_die<nn>();
    n.all = &all;
    // number of hidden units
    n.k = (uint32_t)vm["nn"].as<size_t>();

    if ( vm.count("dropout") ) {
      n.dropout = true;
      *all.file_options << " --dropout ";
    }

    if ( vm.count("meanfield") ) {
      n.dropout = false;
      if (! all.quiet)
        std::cerr << "using mean field for neural network "
                  << (all.training ? "training" : "testing")
                  << std::endl;
    }

    if (n.dropout)
      if (! all.quiet)
        std::cerr << "using dropout for neural network "
                  << (all.training ? "training" : "testing")
                  << std::endl;

    if (vm.count ("inpass")) {
      n.inpass = true;
      *all.file_options << " --inpass";
    }

    if (n.inpass && ! all.quiet)
      std::cerr << "using input passthrough for neural network "
                << (all.training ? "training" : "testing")
                << std::endl;

    n.finished_setup = false;
    n.squared_loss = getLossFunction (all, "squared", 0);

    n.xsubi = 0;

    if (vm.count("random_seed"))
      n.xsubi = vm["random_seed"].as<size_t>();

    n.save_xsubi = n.xsubi;

    base_learner* base = setup_base(all);
    n.increment = base->increment; // indexing of the output layer is odd
    learner<nn>& l = init_learner(&n, base, predict_or_learn<true>,
                                  predict_or_learn<false>, n.k+1);
    l.set_finish(finish);
    l.set_finish_example(finish_example);
    l.set_end_pass(end_pass);

    return make_base(l);
  }