#include <float.h>
#include <cassert>

#include "gd.h"
#include "accumulate.h"
#include "reductions.h"
#include "vw.h"

//#define MAGIC_ARGUMENT //MAY IT NEVER DIE

using namespace std;
using namespace LEARNER;

  static const uint32_t parent_bit = 1;
  static const uint32_t cycle_bit = 2;
  static const uint32_t tree_atomics = 134;
  static const float tolerance = 1e-9f;
  static const uint32_t indicator_bit = 128;
  static const uint32_t default_depth = 127;
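
  //Layout note (inferred from the accessors below): depthsbits stores two
  //bytes per weight -- an even-indexed "min depth" byte and an odd-indexed
  //flags byte.  parent_bit marks members of the support set (features whose
  //children get expanded), cycle_bit marks nodes visited during the current
  //dfs, and indicator_bit is always set in flags bytes so reduce_min_max()
  //can tell flags bytes and depth bytes apart.  tree_atomics is the
  //namespace holding the synthetic example's features.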

  struct sort_data {
    float wval;
    uint32_t wid;
  };

  struct stagewise_poly
  {
    vw *all; // many uses, unmodular reduction

    float sched_exponent;
    uint32_t batch_sz;
    bool batch_sz_double;

    sort_data *sd;
    uint32_t sd_len;
    uint8_t *depthsbits; //interleaved array storing depth information and parent/cycle bits

    uint64_t sum_sparsity; //of synthetic example
    uint64_t sum_input_sparsity; //of input example
    uint64_t num_examples;
    //following three are for parallel (see end_pass())
    uint64_t sum_sparsity_sync;
    uint64_t sum_input_sparsity_sync;
    uint64_t num_examples_sync;

    example synth_ec;
    //following is bookkeeping in synth_ec creation (dfs)
    feature synth_rec_f;
    example *original_ec;
    uint32_t cur_depth;
    bool training;
    uint64_t last_example_counter;
    size_t numpasses;
    uint32_t next_batch_sz;
    bool update_support;

#ifdef DEBUG
    uint32_t max_depth;
    uint32_t depths[100000];
#endif //DEBUG

#ifdef MAGIC_ARGUMENT
    float magic_argument;
#endif //MAGIC_ARGUMENT
  };


  inline uint32_t stride_shift(const stagewise_poly &poly, uint32_t idx)
  {
    return idx << poly.all->reg.stride_shift;
  }

  inline uint32_t stride_un_shift(const stagewise_poly &poly, uint32_t idx)
  {
    return idx >> poly.all->reg.stride_shift;
  }

  inline uint32_t do_ft_offset(const stagewise_poly &poly, uint32_t idx)
  {
    //cout << poly.synth_ec.ft_offset << "  " << poly.original_ec->ft_offset << endl;
    assert(!poly.original_ec || poly.synth_ec.ft_offset == poly.original_ec->ft_offset);
    return idx + poly.synth_ec.ft_offset;
  }

  inline uint32_t un_ft_offset(const stagewise_poly &poly, uint32_t idx)
  {
    assert(!poly.original_ec || poly.synth_ec.ft_offset == poly.original_ec->ft_offset);
    if (poly.synth_ec.ft_offset == 0)
      return idx;
    else {
      while (idx < poly.synth_ec.ft_offset) {
        idx += (uint32_t)(poly.all->length() << poly.all->reg.stride_shift);
      }
      return idx - poly.synth_ec.ft_offset;
    }
  }
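
  //un_ft_offset above undoes do_ft_offset: weight ids live on a ring of
  //size length() << stride_shift, so idx is walked forward by whole table
  //lengths until subtracting ft_offset cannot underflow.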

  inline uint32_t wid_mask(const stagewise_poly &poly, uint32_t wid)
  {
    return wid & poly.all->reg.weight_mask;
  }

  inline uint32_t wid_mask_un_shifted(const stagewise_poly &poly, uint32_t wid)
  {
    return stride_un_shift(poly, wid & poly.all->reg.weight_mask);
  }

  inline uint32_t constant_feat(const stagewise_poly &poly)
  {
    return stride_shift(poly, constant * poly.all->wpp);
  }

  inline uint32_t constant_feat_masked(const stagewise_poly &poly)
  {
    return wid_mask(poly, constant_feat(poly));
  }


  inline uint32_t depthsbits_sizeof(const stagewise_poly &poly)
  {
    return (uint32_t)(2 * poly.all->length() * sizeof(uint8_t));
  }

  void depthsbits_create(stagewise_poly &poly)
  {
    poly.depthsbits = calloc_or_die<uint8_t>(2 * poly.all->length());
    for (uint32_t i = 0; i < poly.all->length() * 2; i += 2) {
      poly.depthsbits[i] = default_depth;
      poly.depthsbits[i+1] = indicator_bit;
    }
  }
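
  //Freshly created state thus reads: every feature at default_depth, parent
  //and cycle bits clear, and indicator_bit present in every flags byte (see
  //the constants at the top of the file).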

  void depthsbits_destroy(stagewise_poly &poly)
  {
    free(poly.depthsbits);
  }

  inline bool parent_get(const stagewise_poly &poly, uint32_t wid)
  {
    assert(wid % stride_shift(poly, 1) == 0);
    assert(do_ft_offset(poly, wid) % stride_shift(poly, 1) == 0);
    return poly.depthsbits[wid_mask_un_shifted(poly, do_ft_offset(poly, wid)) * 2 + 1] & parent_bit;
  }

  inline void parent_toggle(stagewise_poly &poly, uint32_t wid)
  {
    assert(wid % stride_shift(poly, 1) == 0);
    assert(do_ft_offset(poly, wid) % stride_shift(poly, 1) == 0);
    poly.depthsbits[wid_mask_un_shifted(poly, do_ft_offset(poly, wid)) * 2 + 1] ^= parent_bit;
  }

  inline bool cycle_get(const stagewise_poly &poly, uint32_t wid)
  {
    //note: intentionally leaving out ft_offset.
    assert(wid % stride_shift(poly, 1) == 0);
    return (poly.depthsbits[wid_mask_un_shifted(poly, wid) * 2 + 1] & cycle_bit) > 0;
  }

  inline void cycle_toggle(stagewise_poly &poly, uint32_t wid)
  {
    //note: intentionally leaving out ft_offset.
    assert(wid % stride_shift(poly, 1) == 0);
    poly.depthsbits[wid_mask_un_shifted(poly, wid) * 2 + 1] ^= cycle_bit;
  }

  inline uint8_t min_depths_get(const stagewise_poly &poly, uint32_t wid)
  {
    assert(wid % stride_shift(poly, 1) == 0);
    assert(do_ft_offset(poly, wid) % stride_shift(poly, 1) == 0);
    return poly.depthsbits[stride_un_shift(poly, do_ft_offset(poly, wid)) * 2];
  }

  inline void min_depths_set(stagewise_poly &poly, uint32_t wid, uint8_t depth)
  {
    assert(wid % stride_shift(poly, 1) == 0);
    assert(do_ft_offset(poly, wid) % stride_shift(poly, 1) == 0);
    poly.depthsbits[stride_un_shift(poly, do_ft_offset(poly, wid)) * 2] = depth;
  }

#ifndef NDEBUG
  void sanity_check_state(stagewise_poly &poly)
  {
    for (uint32_t i = 0; i != poly.all->length(); ++i)
    {
      uint32_t wid = stride_shift(poly, i);

      assert( ! cycle_get(poly,wid) );

      assert( ! (min_depths_get(poly, wid) == default_depth && parent_get(poly, wid)) );

      assert( ! (min_depths_get(poly, wid) == default_depth && fabsf(poly.all->reg.weight_vector[wid]) > 0) );
      //assert( min_depths_get(poly, wid) != default_depth && fabsf(poly.all->reg.weight_vector[wid]) < tolerance );

      assert( ! (poly.depthsbits[wid_mask_un_shifted(poly, wid) * 2 + 1] & ~(parent_bit + cycle_bit + indicator_bit)) );
    }
  }
#endif //NDEBUG

  //Note.  OUTPUT & INPUT masked.
  //It is very important that this function is invariant to stride.
  inline uint32_t child_wid(const stagewise_poly &poly, uint32_t wi_atomic, uint32_t wi_general)
  {
    assert(wi_atomic == wid_mask(poly, wi_atomic));
    assert(wi_general == wid_mask(poly, wi_general));
    assert((wi_atomic & (stride_shift(poly, 1) - 1)) == 0);
    assert((wi_general & (stride_shift(poly, 1) - 1)) == 0);

    if (wi_atomic == constant_feat_masked(poly))
      return wi_general;
    else if (wi_general == constant_feat_masked(poly))
      return wi_atomic;
    else {
      //This is basically the "Fowler–Noll–Vo" hash.  Ideally, the hash would be invariant
      //to the monomial, whereas this here is sensitive to the path followed, but whatever.
      //The two main differences from FNV are: (1) the "*constant" case should also involve
      //a big prime (so the default hash is not the identity on small inputs), and (2) the
      //table size should not just be a power of 2, but some big prime.
      return wid_mask(poly, stride_shift(poly, stride_un_shift(poly, wi_atomic)
            ^ (16777619 * stride_un_shift(poly, wi_general))));
    }
  }
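
  //A consequence of the hash above (per its comment): child_wid(poly, a, b)
  //and child_wid(poly, b, a) generally differ, so the same monomial reached
  //along different dfs paths can land on different weights; only products
  //with the constant feature are guaranteed a stable id.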

  void sort_data_create(stagewise_poly &poly)
  {
    poly.sd = NULL;
    poly.sd_len = 0;
  }

  void sort_data_ensure_sz(stagewise_poly &poly, uint32_t len)
  {
    if (poly.sd_len < len) {
      uint32_t len_candidate = 2 * len;
#ifdef DEBUG
      cout << "resizing sort buffer; current size " << poly.sd_len;
#endif //DEBUG
      poly.sd_len = (len_candidate > poly.all->length()) ? (uint32_t)poly.all->length() : len_candidate;
#ifdef DEBUG
      cout << ", new size " << poly.sd_len << endl;
#endif //DEBUG
      free(poly.sd); //okay for null.
      poly.sd = calloc_or_die<sort_data>(poly.sd_len);
    }
    assert(len <= poly.sd_len);
  }

  void sort_data_destroy(stagewise_poly &poly)
  {
    free(poly.sd);
  }

#ifdef DEBUG
  int sort_data_compar(const void *a_v, const void *b_v)
  {
    return 2 * ( ((sort_data *) a_v)->wval < ((sort_data *) b_v)->wval ) - 1;
  }
#endif //DEBUG

  int sort_data_compar_heap(sort_data &a_v, sort_data &b_v)
  {
    return (a_v.wval > b_v.wval);
  }
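
  //With this comparator, make_heap/push_heap maintain a min-heap on wval:
  //poly.sd (the root) always holds the smallest retained score, which is
  //what sort_data_update_support() tests when deciding whether an incoming
  //feature displaces the current top-k minimum.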

  /*
   * Performance note.
   *
   * On my laptop (Intel(R) Core(TM) i7-3520M CPU @ 2.90GHz), with compiler
   * optimizations enabled, this routine takes ~0.001 seconds with -b 18 and
   * ~0.06 seconds with -b 24.  Since it is intended to run ~8 times in 1-pass
   * mode and otherwise once per pass, it is considered adequate.
   *
   * Another choice (implemented in another version) is to never process the
   * whole weight vector (e.g., by updating a hash set of nonzero weights after
   * creating the synthetic example, and only processing that here).  This
   * choice was implemented in a similar algorithm and performed well.
   */
  void sort_data_update_support(stagewise_poly &poly)
  {
    assert(poly.num_examples);

    //ft_offset affects parent_toggle / parent_get.  This state must be reset at end.
    assert(poly.original_ec);
    uint32_t pop_ft_offset = poly.original_ec->ft_offset;
    poly.synth_ec.ft_offset = 0;
    poly.original_ec->ft_offset = 0;

    uint32_t num_new_features = (uint32_t)pow(poly.sum_input_sparsity * 1.0f / poly.num_examples, poly.sched_exponent);
    num_new_features = (num_new_features > poly.all->length()) ? (uint32_t)poly.all->length() : num_new_features;
    sort_data_ensure_sz(poly, num_new_features);

    sort_data *heap_end = poly.sd;
    make_heap(poly.sd, heap_end, sort_data_compar_heap); //redundant
    for (uint32_t i = 0; i != poly.all->length(); ++i) {
      uint32_t wid = stride_shift(poly, i);
      if (!parent_get(poly, wid) && wid != constant_feat_masked(poly)) {
        float wval = (fabsf(poly.all->reg.weight_vector[wid])
            * poly.all->reg.weight_vector[poly.all->normalized_idx + (wid)])
          /*
           * here's some depth penalization code.  It was found to not improve
           * statistical performance, and meanwhile it is verified as giving
           * a nontrivial computational hit, thus commented out.
           *
           * - poly.magic_argument
           * sqrtf(min_depths_get(poly, stride_shift(poly, i)) * 1.0 / poly.num_examples)
           */
          ;
        if (wval > tolerance) {
          assert(heap_end >= poly.sd);
          assert(heap_end <= poly.sd + num_new_features);

          if (heap_end - poly.sd == (int)num_new_features && poly.sd->wval < wval) {
            pop_heap(poly.sd, heap_end, sort_data_compar_heap);
            --heap_end;
          }

          assert(heap_end >= poly.sd);
          assert(heap_end < poly.sd + poly.sd_len);

          if (heap_end - poly.sd < (int)num_new_features) {
            heap_end->wval = wval;
            heap_end->wid = wid;
            ++heap_end;
            push_heap(poly.sd, heap_end, sort_data_compar_heap);
          }
        }
      }
    }
    num_new_features = (uint32_t) (heap_end - poly.sd);

#ifdef DEBUG
    //eyeballing weights a pain if unsorted.
    qsort(poly.sd, num_new_features, sizeof(sort_data), sort_data_compar);
#endif //DEBUG

    for (uint32_t pos = 0; pos < num_new_features && pos < poly.sd_len; ++pos) {
      assert(!parent_get(poly, poly.sd[pos].wid)
          && poly.sd[pos].wval > tolerance
          && poly.sd[pos].wid != constant_feat_masked(poly));
      parent_toggle(poly, poly.sd[pos].wid);
#ifdef DEBUG
      cout
        << "Adding feature " << pos << "/" << num_new_features
        << " || wid " << poly.sd[pos].wid
        << " || sort value " << poly.sd[pos].wval
        << endl;
#endif //DEBUG
    }

#ifdef DEBUG
    cout << "depths:";
    for (uint32_t depth = 0; depth <= poly.max_depth && depth < sizeof(poly.depths) / sizeof(*poly.depths); ++depth)
      cout << "  [" << depth << "] = " << poly.depths[depth];
    cout << endl;

    cout << "Sanity check after sort... " << flush;
    sanity_check_state(poly);
    cout << "done" << endl;
#endif //DEBUG

    //it's okay that these may have been initially unequal; synth_ec value irrelevant so far.
    poly.original_ec->ft_offset = pop_ft_offset;
    poly.synth_ec.ft_offset = pop_ft_offset;
  }

  void synthetic_reset(stagewise_poly &poly, example &ec)
  {
    poly.synth_ec.l = ec.l;
    poly.synth_ec.tag = ec.tag;
    poly.synth_ec.example_counter = ec.example_counter;

    /**
     * Some comments on ft_offset.
     *
     * The plan is to do the feature mapping dfs with weight indices ignoring
     * the ft_offset.  This is because ft_offset is then added at the end,
     * guaranteeing local/strided access on synth_ec.  This might not matter
     * too much in this implementation (where, e.g., --oaa runs one after the
     * other, not interleaved), but who knows.
     *
     * (The other choice is to basically ignore adjusting for ft_offset when
     * doing the traversal, which means synth_ec.ft_offset is 0 here...)
     *
     * Anyway, so here is how ft_offset matters:
     *   - synthetic_create_rec must "normalize it out" of the fed weight value
     *   - parent and min_depths set/get are adjusted for it.
     *   - cycle set/get are not adjusted for it, since it doesn't matter for them.
     *   - operations on the whole weight vector (sorting, save_load, all_reduce)
     *     ignore ft_offset, just treat the thing as a flat vector.
     **/
    poly.synth_ec.ft_offset = ec.ft_offset;

    poly.synth_ec.test_only = ec.test_only;
    poly.synth_ec.end_pass = ec.end_pass;
    poly.synth_ec.sorted = ec.sorted;
    poly.synth_ec.in_use = ec.in_use;

    poly.synth_ec.atomics[tree_atomics].erase();
    poly.synth_ec.audit_features[tree_atomics].erase();
    poly.synth_ec.num_features = 0;
    poly.synth_ec.total_sum_feat_sq = 0;
    poly.synth_ec.sum_feat_sq[tree_atomics] = 0;
    poly.synth_ec.example_t = ec.example_t;

    if (poly.synth_ec.indices.size()==0)
      poly.synth_ec.indices.push_back(tree_atomics);
  }
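
  //note: the synthetic example keeps all of its features in the single
  //tree_atomics namespace, so the erase/zero calls above fully reset it
  //between input examples.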

  void synthetic_decycle(stagewise_poly &poly)
  {
    for (feature *f = poly.synth_ec.atomics[tree_atomics].begin;
        f != poly.synth_ec.atomics[tree_atomics].end; ++f) {
      assert(cycle_get(poly, f->weight_index));
      cycle_toggle(poly, f->weight_index);
    }
  }

  void synthetic_create_rec(stagewise_poly &poly, float v, float &w)
  {
    //Note: need to un_ft_shift since gd::foreach_feature bakes in the offset.
    uint32_t wid_atomic = wid_mask(poly, un_ft_offset(poly, (uint32_t)((&w - poly.all->reg.weight_vector))));
    uint32_t wid_cur = child_wid(poly, wid_atomic, poly.synth_rec_f.weight_index);
    assert(wid_atomic % stride_shift(poly, 1) == 0);

    //Note: only mutate learner state when in training mode.  This is because
    //the average test errors across multiple data sets should be equal to
    //the test error on the merged dataset (which is violated if the code
    //below is run at test time).
    if (poly.cur_depth < min_depths_get(poly, wid_cur) && poly.training) {
      if (parent_get(poly, wid_cur)) {
#ifdef DEBUG
        cout
          << "FOUND A TRANSPLANT!!! moving [" << wid_cur
          << "] from depth " << (uint32_t) min_depths_get(poly, wid_cur)
          << " to depth " << poly.cur_depth << endl;
#endif //DEBUG
        //XXX arguably, should also fear transplants that occurred with
        //a different ft_offset; e.g., need to look out for cross-reduction
        //collisions.  Have not played with this issue yet...
        parent_toggle(poly, wid_cur);
      }
      min_depths_set(poly, wid_cur, poly.cur_depth);
    }

    if ( ! cycle_get(poly, wid_cur)
        && ((poly.cur_depth > default_depth ? default_depth : poly.cur_depth) == min_depths_get(poly, wid_cur))
       ) {
      cycle_toggle(poly, wid_cur);

#ifdef DEBUG
      ++poly.depths[poly.cur_depth];
#endif //DEBUG

      feature new_f = { v * poly.synth_rec_f.x, wid_cur };
      poly.synth_ec.atomics[tree_atomics].push_back(new_f);
      poly.synth_ec.num_features++;
      poly.synth_ec.sum_feat_sq[tree_atomics] += new_f.x * new_f.x;

      if (parent_get(poly, new_f.weight_index)) {
        feature parent_f = poly.synth_rec_f;
        poly.synth_rec_f = new_f;
        ++poly.cur_depth;
#ifdef DEBUG
        poly.max_depth = (poly.max_depth > poly.cur_depth) ? poly.max_depth : poly.cur_depth;
#endif //DEBUG
        GD::foreach_feature<stagewise_poly, synthetic_create_rec>(*(poly.all), *(poly.original_ec), poly);
        --poly.cur_depth;
        poly.synth_rec_f = parent_f;
      }
    }
  }
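
  //Summary of the dfs above: each input feature is multiplied into the
  //current path feature (synth_rec_f), the product is emitted at most once
  //per example (cycle bits suppress duplicates), and the recursion deepens
  //only at weights flagged as parents by sort_data_update_support().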

  void synthetic_create(stagewise_poly &poly, example &ec, bool training)
  {
    synthetic_reset(poly, ec);

    poly.cur_depth = 0;

    poly.synth_rec_f.x = 1.0;
    poly.synth_rec_f.weight_index = constant_feat_masked(poly); //note: not ft_offset'd
    poly.training = training;
    /*
     * Another choice is to mark the constant feature as the single initial
     * parent, and recurse just on that feature (which arguably correctly interprets poly.cur_depth).
     * Problem with this is if there is a collision with the root...
     */
    GD::foreach_feature<stagewise_poly, synthetic_create_rec>(*poly.all, *poly.original_ec, poly);
    synthetic_decycle(poly);
    poly.synth_ec.total_sum_feat_sq = poly.synth_ec.sum_feat_sq[tree_atomics];

    if (training) {
      poly.sum_sparsity += poly.synth_ec.num_features;
      poly.sum_input_sparsity += ec.num_features;
      poly.num_examples += 1;
    }
  }

  void predict(stagewise_poly &poly, base_learner &base, example &ec)
  {
    poly.original_ec = &ec;
    synthetic_create(poly, ec, false);
    base.predict(poly.synth_ec);
    ec.partial_prediction = poly.synth_ec.partial_prediction;
    ec.updated_prediction = poly.synth_ec.updated_prediction;
    ec.pred.scalar = poly.synth_ec.pred.scalar;
  }

  void learn(stagewise_poly &poly, base_learner &base, example &ec)
  {
    bool training = poly.all->training && ec.l.simple.label != FLT_MAX;
    poly.original_ec = &ec;

    if (training) {
      if (poly.update_support) {
        sort_data_update_support(poly);
        poly.update_support = false;
      }

      synthetic_create(poly, ec, training);
      base.learn(poly.synth_ec);
      ec.partial_prediction = poly.synth_ec.partial_prediction;
      ec.updated_prediction = poly.synth_ec.updated_prediction;
      ec.pred.scalar = poly.synth_ec.pred.scalar;

      if (ec.example_counter
          //following line is to avoid repeats when multiple reductions on same example.
          //XXX ideally, would get all "copies" of an example before scheduling the support
          //update, but how do we know?
          && poly.last_example_counter != ec.example_counter
          && poly.batch_sz
          && ( (poly.batch_sz_double && !(ec.example_counter % poly.next_batch_sz))
            || (!poly.batch_sz_double && !(ec.example_counter % poly.batch_sz)))) {
        poly.next_batch_sz *= 2; //no effect when !poly.batch_sz_double
        poly.update_support = (poly.all->span_server == "" || poly.numpasses == 1);
      }
      poly.last_example_counter = ec.example_counter;
    } else
      predict(poly, base, ec);
  }
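
  //Scheduling note: with doubling (the default), support updates fire when
  //the example counter hits batch_sz, 2*batch_sz, 4*batch_sz, ...; with
  //--batch_sz_no_doubling they fire every batch_sz examples.  In parallel
  //multi-pass runs the trigger is suppressed here, and when batch_sz is
  //zero, updates are instead scheduled once per pass in end_pass() below.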


  void reduce_min(uint8_t &v1, const uint8_t &v2)
  {
    if (v1 == default_depth)
      v1 = v2;
    else if (v2 != default_depth)
      v1 = (v1 <= v2) ? v1 : v2;
  }
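
  //allreduce combiner for depth bytes: default_depth means "never seen", so
  //it is treated as missing rather than as a genuine minimum.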

  void reduce_min_max(uint8_t &v1, const uint8_t &v2)
  {
    bool parent_or_depth = (v1 & indicator_bit) != 0;
    bool p_or_d2 = (v2 & indicator_bit) != 0;
    if (parent_or_depth != p_or_d2) {
#ifdef DEBUG
      cout << "Reducing parent with depth!!!!!";
#endif //DEBUG
      return;
    }

    if (parent_or_depth)
      v1 = (v1 >= v2) ? v1 : v2;
    else {
      if (v1 == default_depth)
        v1 = v2;
      else if (v2 != default_depth)
        v1 = (v1 <= v2) ? v1 : v2;
    }
  }
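
  //allreduce combiner for the interleaved depthsbits array: bytes with
  //indicator_bit set are flags bytes and are combined by max (with cycle
  //bits clear between examples, this amounts to unioning the parent bits),
  //while depth bytes get the same missing-aware min as reduce_min above.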

  void end_pass(stagewise_poly &poly)
  {
    if (!!poly.batch_sz || (poly.all->span_server != "" && poly.numpasses > 1))
      return;

    uint64_t sum_sparsity_inc = poly.sum_sparsity - poly.sum_sparsity_sync;
    uint64_t sum_input_sparsity_inc = poly.sum_input_sparsity - poly.sum_input_sparsity_sync;
    uint64_t num_examples_inc = poly.num_examples - poly.num_examples_sync;

#ifdef DEBUG
    cout << "Sanity before allreduce\n";
    sanity_check_state(poly);
#endif //DEBUG

    vw &all = *poly.all;
    if (all.span_server != "") {
      /*
       * The following is inconsistent with the transplant code in
       * synthetic_create_rec(), which clears parent bits on depth mismatches.
       * But it's unclear what the right behavior is in general for either
       * case...
       */
      all_reduce<uint8_t, reduce_min_max>(poly.depthsbits, depthsbits_sizeof(poly),
          all.span_server, all.unique_id, all.total, all.node, all.socks);

      sum_input_sparsity_inc = (uint64_t)accumulate_scalar(all, all.span_server, (float)sum_input_sparsity_inc);
      sum_sparsity_inc = (uint64_t)accumulate_scalar(all, all.span_server, (float)sum_sparsity_inc);
      num_examples_inc = (uint64_t)accumulate_scalar(all, all.span_server, (float)num_examples_inc);
    }

    poly.sum_input_sparsity_sync = poly.sum_input_sparsity_sync + sum_input_sparsity_inc;
    poly.sum_input_sparsity = poly.sum_input_sparsity_sync;
    poly.sum_sparsity_sync = poly.sum_sparsity_sync + sum_sparsity_inc;
    poly.sum_sparsity = poly.sum_sparsity_sync;
    poly.num_examples_sync = poly.num_examples_sync + num_examples_inc;
    poly.num_examples = poly.num_examples_sync;

#ifdef DEBUG
    cout << "Sanity after allreduce\n";
    sanity_check_state(poly);
#endif //DEBUG

    if (poly.numpasses != poly.all->numpasses) {
      poly.update_support = true;
      poly.numpasses++;
    }
  }

  void finish_example(vw &all, stagewise_poly &poly, example &ec)
  {
    size_t temp_num_features = ec.num_features;
    ec.num_features = poly.synth_ec.num_features;
    output_and_account_example(all, ec);
    ec.num_features = temp_num_features;
    VW::finish_example(all, &ec);
  }

  void finish(stagewise_poly &poly)
  {
#ifdef DEBUG
    cout << "total feature number (after poly expansion!) = " << poly.sum_sparsity << endl;
#endif //DEBUG

    poly.synth_ec.atomics[tree_atomics].delete_v();
    sort_data_destroy(poly);
    depthsbits_destroy(poly);
  }


  void save_load(stagewise_poly &poly, io_buf &model_file, bool read, bool text)
  {
    if (model_file.files.size() > 0)
      bin_text_read_write_fixed(model_file, (char *) poly.depthsbits, depthsbits_sizeof(poly), "", read, "", 0, text);

    //unfortunately, the following can't go here, since save_load is called
    //before gd::save_load and thus the weight vector state is uninitialized.
    //#ifdef DEBUG
    //      cout << "Sanity check after save_load... " << flush;
    //      sanity_check_state(poly);
    //      cout << "done" << endl;
    //#endif //DEBUG
  }

  base_learner *stagewise_poly_setup(vw &all)
  {
    if (missing_option(all, true, "stage_poly", "use stagewise polynomial feature learning"))
      return NULL;

    new_options(all, "Stagewise poly options")
      ("sched_exponent", po::value<float>(), "exponent controlling quantity of included features")
      ("batch_sz", po::value<uint32_t>(), "multiplier on batch size before including more features")
      ("batch_sz_no_doubling", "batch_sz does not double")
#ifdef MAGIC_ARGUMENT
      ("magic_argument", po::value<float>(), "magical feature flag")
#endif //MAGIC_ARGUMENT
      ;
    add_options(all);

    po::variables_map &vm = all.vm;
    stagewise_poly &poly = calloc_or_die<stagewise_poly>();
    poly.all = &all;
    depthsbits_create(poly);
    sort_data_create(poly);

    poly.sched_exponent = vm.count("sched_exponent") ? vm["sched_exponent"].as<float>() : 1.f;
    poly.batch_sz = vm.count("batch_sz") ? vm["batch_sz"].as<uint32_t>() : 1000;
    poly.batch_sz_double = vm.count("batch_sz_no_doubling") ? false : true;
#ifdef MAGIC_ARGUMENT
    poly.magic_argument = vm.count("magic_argument") ? vm["magic_argument"].as<float>() : 0.;
#endif //MAGIC_ARGUMENT

    poly.sum_sparsity = 0;
    poly.sum_input_sparsity = 0;
    poly.num_examples = 0;
    poly.sum_sparsity_sync = 0;
    poly.sum_input_sparsity_sync = 0;
    poly.num_examples_sync = 0;
    poly.last_example_counter = -1;
    poly.numpasses = 1;
    poly.update_support = false;
    poly.original_ec = NULL;
    poly.next_batch_sz = poly.batch_sz;

    learner<stagewise_poly> &l = init_learner(&poly, setup_base(all), learn, predict);
    l.set_finish(finish);
    l.set_save_load(save_load);
    l.set_finish_example(finish_example);
    l.set_end_pass(end_pass);

    return make_base(l);
  }