/*
Copyright (c) by respective owners including Yahoo!, Microsoft, and
individual contributors. All rights reserved.  Released under a BSD (revised)
license as described in the file LICENSE.
*/
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <sstream>

#include "reductions.h"
#include "rand48.h"
#include "gd.h"

using namespace std;
using namespace LEARNER;

const float hidden_min_activation = -3;
const float hidden_max_activation = 3;
const uint32_t nn_constant = 533357803;

struct nn {
  uint32_t k;
  loss_function* squared_loss;
  example output_layer;
  example hiddenbias;
  example outputweight;
  float prediction;
  size_t increment;
  bool dropout;
  uint64_t xsubi;
  uint64_t save_xsubi;
  bool inpass;
  bool finished_setup;

  vw* all; // many things
};

#define cast_uint32_t static_cast<uint32_t>

static inline float fastpow2 (float p)
{
  float offset = (p < 0) ? 1.0f : 0.0f;
  float clipp = (p < -126) ? -126.0f : p;
  int w = (int)clipp;
  float z = clipp - w + offset;
  union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z) ) };

  return v.f;
}

static inline float fastexp (float p)
{
  return fastpow2 (1.442695040f * p);
}

static inline float fasttanh (float p)
{
  return -1.0f + 2.0f / (1.0f + fastexp (-2.0f * p));
}

void finish_setup (nn& n, vw& all)
{
  // TODO: output_layer audit

  memset (&n.output_layer, 0, sizeof (n.output_layer));
  n.output_layer.indices.push_back(nn_output_namespace);
  feature output = {1., nn_constant << all.reg.stride_shift};

  for (unsigned int i = 0; i < n.k; ++i)
    {
      n.output_layer.atomics[nn_output_namespace].push_back(output);
      ++n.output_layer.num_features;
      output.weight_index += (uint32_t)n.increment;
    }

  if (! n.inpass)
    {
      n.output_layer.atomics[nn_output_namespace].push_back(output);
      ++n.output_layer.num_features;
    }

  n.output_layer.in_use = true;

  // TODO: not correct if --noconstant
  memset (&n.hiddenbias, 0, sizeof (n.hiddenbias));
  n.hiddenbias.indices.push_back(constant_namespace);
  feature temp = {1, (uint32_t) constant};
  n.hiddenbias.atomics[constant_namespace].push_back(temp);
  n.hiddenbias.total_sum_feat_sq++;
  n.hiddenbias.l.simple.label = FLT_MAX;
  n.hiddenbias.l.simple.weight = 1;
  n.hiddenbias.in_use = true;

  memset (&n.outputweight, 0, sizeof (n.outputweight));
  n.outputweight.indices.push_back(nn_output_namespace);
  n.outputweight.atomics[nn_output_namespace].push_back(n.output_layer.atomics[nn_output_namespace][0]);
  n.outputweight.atomics[nn_output_namespace][0].x = 1;
  n.outputweight.total_sum_feat_sq++;
  n.outputweight.l.simple.label = FLT_MAX;
  n.outputweight.l.simple.weight = 1;
  n.outputweight.in_use = true;

  n.finished_setup = true;
}

void end_pass(nn& n)
{
  if (n.all->bfgs)
    n.xsubi = n.save_xsubi;
}

template <bool is_learn>
void predict_or_learn(nn& n, base_learner& base, example& ec)
{
  bool shouldOutput = n.all->raw_prediction > 0;
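
  // The synthetic output-layer, hidden-bias, and output-weight examples are
  // built lazily on the first example rather than in nn_setup, presumably so
  // that the base learner's stride and increment are final when the output
  // feature indices are computed.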
  if (! n.finished_setup)
    finish_setup (n, *(n.all));

  shared_data sd;
  memcpy (&sd, n.all->sd, sizeof(shared_data));
  shared_data* save_sd = n.all->sd;
  n.all->sd = &sd;

  label_data ld = ec.l.simple;
  void (*save_set_minmax) (shared_data*, float) = n.all->set_minmax;
  float save_min_label;
  float save_max_label;
  float dropscale = n.dropout ? 2.0f : 1.0f;
  loss_function* save_loss = n.all->loss;

  float* hidden_units = (float*) alloca (n.k * sizeof (float));
  bool* dropped_out = (bool*) alloca (n.k * sizeof (bool));

  string outputString;
  stringstream outputStringStream(outputString);

  n.all->set_minmax = noop_mm;
  n.all->loss = n.squared_loss;
  save_min_label = n.all->sd->min_label;
  n.all->sd->min_label = hidden_min_activation;
  save_max_label = n.all->sd->max_label;
  n.all->sd->max_label = hidden_max_activation;

  n.hiddenbias.ft_offset = ec.ft_offset;

  for (unsigned int i = 0; i < n.k; ++i)
    {
      base.predict(n.hiddenbias, i);
      float wf = n.hiddenbias.pred.scalar;

      // avoid saddle point at 0
      if (wf == 0)
        {
          n.hiddenbias.l.simple.label = (float) (frand48 () - 0.5);
          base.learn(n.hiddenbias, i);
          n.hiddenbias.l.simple.label = FLT_MAX;
        }

      base.predict(ec, i);

      hidden_units[i] = ec.pred.scalar;

      dropped_out[i] = (n.dropout && merand48 (n.xsubi) < 0.5);

      if (shouldOutput) {
        if (i > 0) outputStringStream << ' ';
        outputStringStream << i << ':' << ec.partial_prediction << ',' << fasttanh (hidden_units[i]);
      }
    }
  n.all->loss = save_loss;
  n.all->set_minmax = save_set_minmax;
  n.all->sd->min_label = save_min_label;
  n.all->sd->max_label = save_max_label;

  bool converse = false;
  float save_partial_prediction = 0;
  float save_final_prediction = 0;
  float save_ec_loss = 0;

 CONVERSE: // That's right, I'm using goto. So sue me.

  n.output_layer.total_sum_feat_sq = 1;
  n.output_layer.sum_feat_sq[nn_output_namespace] = 1;

  n.outputweight.ft_offset = ec.ft_offset;

  n.all->set_minmax = noop_mm;
  n.all->loss = n.squared_loss;
  save_min_label = n.all->sd->min_label;
  n.all->sd->min_label = -1;
  save_max_label = n.all->sd->max_label;
  n.all->sd->max_label = 1;
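
  // Each hidden unit's tanh activation becomes one feature of the synthetic
  // output_layer example below. Under dropout, surviving activations are
  // scaled by dropscale (2.0) so that the output unit sees the same input in
  // expectation, while dropped units contribute 0. Output weights that
  // predict exactly 0 get a small random label to move them off the saddle
  // point, mirroring the hidden-bias treatment above.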
  for (unsigned int i = 0; i < n.k; ++i)
    {
      float sigmah =
        (dropped_out[i]) ? 0.0f : dropscale * fasttanh (hidden_units[i]);
      n.output_layer.atomics[nn_output_namespace][i].x = sigmah;

      n.output_layer.total_sum_feat_sq += sigmah * sigmah;
      n.output_layer.sum_feat_sq[nn_output_namespace] += sigmah * sigmah;

      n.outputweight.atomics[nn_output_namespace][0].weight_index =
        n.output_layer.atomics[nn_output_namespace][i].weight_index;
      base.predict(n.outputweight, n.k);
      float wf = n.outputweight.pred.scalar;

      // avoid saddle point at 0
      if (wf == 0)
        {
          float sqrtk = sqrt ((float)n.k);
          n.outputweight.l.simple.label = (float) (frand48 () - 0.5) / sqrtk;
          base.learn(n.outputweight, n.k);
          n.outputweight.l.simple.label = FLT_MAX;
        }
    }

  n.all->loss = save_loss;
  n.all->set_minmax = save_set_minmax;
  n.all->sd->min_label = save_min_label;
  n.all->sd->max_label = save_max_label;

  if (n.inpass) {
    // TODO: this is not correct if there is something in the
    // nn_output_namespace but at least it will not leak memory
    // in that case

    ec.indices.push_back (nn_output_namespace);
    v_array<feature> save_nn_output_namespace = ec.atomics[nn_output_namespace];
    ec.atomics[nn_output_namespace] = n.output_layer.atomics[nn_output_namespace];
    ec.sum_feat_sq[nn_output_namespace] = n.output_layer.sum_feat_sq[nn_output_namespace];
    ec.total_sum_feat_sq += n.output_layer.sum_feat_sq[nn_output_namespace];
    if (is_learn)
      base.learn(ec, n.k);
    else
      base.predict(ec, n.k);
    n.output_layer.partial_prediction = ec.partial_prediction;
    n.output_layer.loss = ec.loss;
    ec.total_sum_feat_sq -= n.output_layer.sum_feat_sq[nn_output_namespace];
    ec.sum_feat_sq[nn_output_namespace] = 0;
    ec.atomics[nn_output_namespace] = save_nn_output_namespace;
    ec.indices.pop ();
  }
  else {
    n.output_layer.ft_offset = ec.ft_offset;
    n.output_layer.l = ec.l;
    n.output_layer.partial_prediction = 0;
    n.output_layer.example_t = ec.example_t;
    if (is_learn)
      base.learn(n.output_layer, n.k);
    else
      base.predict(n.output_layer, n.k);
    ec.l = n.output_layer.l;
  }

  n.prediction = GD::finalize_prediction (n.all->sd, n.output_layer.partial_prediction);

  if (shouldOutput) {
    outputStringStream << ' ' << n.output_layer.partial_prediction;
    n.all->print_text(n.all->raw_prediction, outputStringStream.str(), ec.tag);
  }

  if (is_learn && n.all->training && ld.label != FLT_MAX) {
    float gradient = n.all->loss->first_derivative(n.all->sd,
                                                   n.prediction,
                                                   ld.label);

    if (fabs (gradient) > 0) {
      n.all->loss = n.squared_loss;
      n.all->set_minmax = noop_mm;
      save_min_label = n.all->sd->min_label;
      n.all->sd->min_label = hidden_min_activation;
      save_max_label = n.all->sd->max_label;
      n.all->sd->max_label = hidden_max_activation;
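
      // Manual backprop through the hidden layer: for each surviving hidden
      // unit the chain rule gives (output-loss gradient) * (output weight) *
      // tanh'(h), and the unit is retrained with squared loss toward its
      // activation minus that step. The 0.5 factor below appears to offset
      // the factor of 2 in the squared-loss derivative used for that update.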
      for (unsigned int i = 0; i < n.k; ++i) {
        if (! dropped_out[i]) {
          float sigmah =
            n.output_layer.atomics[nn_output_namespace][i].x / dropscale;
          float sigmahprime = dropscale * (1.0f - sigmah * sigmah);
          n.outputweight.atomics[nn_output_namespace][0].weight_index =
            n.output_layer.atomics[nn_output_namespace][i].weight_index;
          base.predict(n.outputweight, n.k);
          float nu = n.outputweight.pred.scalar;
          float gradhw = 0.5f * nu * gradient * sigmahprime;

          ec.l.simple.label = GD::finalize_prediction (n.all->sd, hidden_units[i] - gradhw);
          if (ec.l.simple.label != hidden_units[i])
            base.learn(ec, i);
        }
      }

      n.all->loss = save_loss;
      n.all->set_minmax = save_set_minmax;
      n.all->sd->min_label = save_min_label;
      n.all->sd->max_label = save_max_label;
    }
  }

  ec.l.simple.label = ld.label;

  if (! converse) {
    save_partial_prediction = n.output_layer.partial_prediction;
    save_final_prediction = n.prediction;
    save_ec_loss = n.output_layer.loss;
  }

  if (n.dropout && ! converse)
    {
      for (unsigned int i = 0; i < n.k; ++i)
        {
          dropped_out[i] = ! dropped_out[i];
        }

      converse = true;
      goto CONVERSE;
    }

  ec.partial_prediction = save_partial_prediction;
  ec.pred.scalar = save_final_prediction;
  ec.loss = save_ec_loss;

  n.all->sd = save_sd;
  n.all->set_minmax (n.all->sd, sd.min_label);
  n.all->set_minmax (n.all->sd, sd.max_label);
}

void finish_example(vw& all, nn&, example& ec)
{
  int save_raw_prediction = all.raw_prediction;
  all.raw_prediction = -1;
  return_simple_example(all, NULL, ec);
  all.raw_prediction = save_raw_prediction;
}

void finish(nn& n)
{
  delete n.squared_loss;
  dealloc_example (NULL, n.output_layer);
  dealloc_example (NULL, n.hiddenbias);
  dealloc_example (NULL, n.outputweight);
}

base_learner* nn_setup(vw& all)
{
  if (missing_option<size_t, true>(all, "nn", "Sigmoidal feedforward network with <k> hidden units"))
    return NULL;
  new_options(all, "Neural Network options")
    ("inpass", "Train or test sigmoidal feedforward network with input passthrough.")
    ("dropout", "Train or test sigmoidal feedforward network using dropout.")
    ("meanfield", "Train or test sigmoidal feedforward network using mean field.");
  add_options(all);

  po::variables_map& vm = all.vm;
  nn& n = calloc_or_die<nn>();
  n.all = &all;
  // first parse for number of hidden units
  n.k = (uint32_t)vm["nn"].as<size_t>();

  if ( vm.count("dropout") ) {
    n.dropout = true;
    *all.file_options << " --dropout ";
  }

  if ( vm.count("meanfield") ) {
    n.dropout = false;
    if (! all.quiet)
      std::cerr << "using mean field for neural network "
                << (all.training ? "training" : "testing")
                << std::endl;
  }

  if (n.dropout)
    if (! all.quiet)
      std::cerr << "using dropout for neural network "
                << (all.training ? "training" : "testing")
                << std::endl;

  if (vm.count ("inpass")) {
    n.inpass = true;
    *all.file_options << " --inpass";
  }
"training" : "testing") 393 << std::endl; 394 395 n.finished_setup = false; 396 n.squared_loss = getLossFunction (all, "squared", 0); 397 398 n.xsubi = 0; 399 400 if (vm.count("random_seed")) 401 n.xsubi = vm["random_seed"].as<size_t>(); 402 403 n.save_xsubi = n.xsubi; 404 405 base_learner* base = setup_base(all); 406 n.increment = base->increment;//Indexing of output layer is odd. 407 learner<nn>& l = init_learner(&n, base, predict_or_learn<true>, 408 predict_or_learn<false>, n.k+1); 409 l.set_finish(finish); 410 l.set_finish_example(finish_example); 411 l.set_end_pass(end_pass); 412 413 return make_base(l); 414 } 415