#include <float.h>
#include <cassert>

#include "gd.h"
#include "accumulate.h"
#include "reductions.h"
#include "vw.h"

//#define MAGIC_ARGUMENT //MAY IT NEVER DIE

using namespace std;
using namespace LEARNER;

  static const uint32_t parent_bit = 1;
  static const uint32_t cycle_bit = 2;
  static const uint32_t tree_atomics = 134;
  static const float tolerance = 1e-9f;
  static const uint32_t indicator_bit = 128;
  static const uint32_t default_depth = 127;
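
  //Layout note (inferred from the accessors below): depthsbits stores two
  //bytes per weight -- an even-indexed "min depth" byte and an odd-indexed
  //flags byte.  parent_bit marks members of the support set (features whose
  //children get expanded), cycle_bit marks nodes visited during the current
  //dfs, and indicator_bit is always set in flags bytes so reduce_min_max()
  //can tell flags bytes and depth bytes apart.  tree_atomics is the
  //namespace holding the synthetic example's features.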

  struct sort_data {
    float wval;
    uint32_t wid;
  };

  struct stagewise_poly
  {
    vw *all; // many uses, unmodular reduction

    float sched_exponent;
    uint32_t batch_sz;
    bool batch_sz_double;

    sort_data *sd;
    uint32_t sd_len;
    uint8_t *depthsbits; //interleaved array storing depth information and parent/cycle bits

    uint64_t sum_sparsity; //of synthetic example
    uint64_t sum_input_sparsity; //of input example
    uint64_t num_examples;
    //following three are for parallel (see end_pass())
    uint64_t sum_sparsity_sync;
    uint64_t sum_input_sparsity_sync;
    uint64_t num_examples_sync;

    example synth_ec;
    //following is bookkeeping in synth_ec creation (dfs)
    feature synth_rec_f;
    example *original_ec;
    uint32_t cur_depth;
    bool training;
    uint64_t last_example_counter;
    size_t numpasses;
    uint32_t next_batch_sz;
    bool update_support;

#ifdef DEBUG
    uint32_t max_depth;
    uint32_t depths[100000];
#endif //DEBUG

#ifdef MAGIC_ARGUMENT
    float magic_argument;
#endif //MAGIC_ARGUMENT
  };


  inline uint32_t stride_shift(const stagewise_poly &poly, uint32_t idx)
  {
    return idx << poly.all->reg.stride_shift;
  }

  inline uint32_t stride_un_shift(const stagewise_poly &poly, uint32_t idx)
  {
    return idx >> poly.all->reg.stride_shift;
  }

  inline uint32_t do_ft_offset(const stagewise_poly &poly, uint32_t idx)
  {
    //cout << poly.synth_ec.ft_offset << "  " << poly.original_ec->ft_offset << endl;
    assert(!poly.original_ec || poly.synth_ec.ft_offset == poly.original_ec->ft_offset);
    return idx + poly.synth_ec.ft_offset;
  }

  inline uint32_t un_ft_offset(const stagewise_poly &poly, uint32_t idx)
  {
    assert(!poly.original_ec || poly.synth_ec.ft_offset == poly.original_ec->ft_offset);
    if (poly.synth_ec.ft_offset == 0)
      return idx;
    else {
      while (idx < poly.synth_ec.ft_offset) {
        idx += (uint32_t)(poly.all->length() << poly.all->reg.stride_shift);
      }
      return idx - poly.synth_ec.ft_offset;
    }
  }
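
  //un_ft_offset above undoes do_ft_offset: weight ids live on a ring of
  //size length() << stride_shift, so idx is walked forward by whole table
  //lengths until subtracting ft_offset cannot underflow.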

  inline uint32_t wid_mask(const stagewise_poly &poly, uint32_t wid)
  {
    return wid & poly.all->reg.weight_mask;
  }

  inline uint32_t wid_mask_un_shifted(const stagewise_poly &poly, uint32_t wid)
  {
    return stride_un_shift(poly, wid & poly.all->reg.weight_mask);
  }

  inline uint32_t constant_feat(const stagewise_poly &poly)
  {
    return stride_shift(poly, constant * poly.all->wpp);
  }

  inline uint32_t constant_feat_masked(const stagewise_poly &poly)
  {
    return wid_mask(poly, constant_feat(poly));
  }


  inline uint32_t depthsbits_sizeof(const stagewise_poly &poly)
  {
    return (uint32_t)(2 * poly.all->length() * sizeof(uint8_t));
  }

  void depthsbits_create(stagewise_poly &poly)
  {
    poly.depthsbits = calloc_or_die<uint8_t>(2 * poly.all->length());
    for (uint32_t i = 0; i < poly.all->length() * 2; i += 2) {
      poly.depthsbits[i] = default_depth;
      poly.depthsbits[i+1] = indicator_bit;
    }
  }
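
  //Freshly created state thus reads: every feature at default_depth, parent
  //and cycle bits clear, and indicator_bit present in every flags byte (see
  //the constants at the top of the file).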

  void depthsbits_destroy(stagewise_poly &poly)
  {
    free(poly.depthsbits);
  }

  inline bool parent_get(const stagewise_poly &poly, uint32_t wid)
  {
    assert(wid % stride_shift(poly, 1) == 0);
    assert(do_ft_offset(poly, wid) % stride_shift(poly, 1) == 0);
    return poly.depthsbits[wid_mask_un_shifted(poly, do_ft_offset(poly, wid)) * 2 + 1] & parent_bit;
  }

  inline void parent_toggle(stagewise_poly &poly, uint32_t wid)
  {
    assert(wid % stride_shift(poly, 1) == 0);
    assert(do_ft_offset(poly, wid) % stride_shift(poly, 1) == 0);
    poly.depthsbits[wid_mask_un_shifted(poly, do_ft_offset(poly, wid)) * 2 + 1] ^= parent_bit;
  }

  inline bool cycle_get(const stagewise_poly &poly, uint32_t wid)
  {
    //note: intentionally leaving out ft_offset.
    assert(wid % stride_shift(poly, 1) == 0);
    return (poly.depthsbits[wid_mask_un_shifted(poly, wid) * 2 + 1] & cycle_bit) > 0;
  }

  inline void cycle_toggle(stagewise_poly &poly, uint32_t wid)
  {
    //note: intentionally leaving out ft_offset.
    assert(wid % stride_shift(poly, 1) == 0);
    poly.depthsbits[wid_mask_un_shifted(poly, wid) * 2 + 1] ^= cycle_bit;
  }

  inline uint8_t min_depths_get(const stagewise_poly &poly, uint32_t wid)
  {
    assert(wid % stride_shift(poly, 1) == 0);
    assert(do_ft_offset(poly, wid) % stride_shift(poly, 1) == 0);
    return poly.depthsbits[stride_un_shift(poly, do_ft_offset(poly, wid)) * 2];
  }

  inline void min_depths_set(stagewise_poly &poly, uint32_t wid, uint8_t depth)
  {
    assert(wid % stride_shift(poly, 1) == 0);
    assert(do_ft_offset(poly, wid) % stride_shift(poly, 1) == 0);
    poly.depthsbits[stride_un_shift(poly, do_ft_offset(poly, wid)) * 2] = depth;
  }

#ifndef NDEBUG
  void sanity_check_state(stagewise_poly &poly)
  {
    for (uint32_t i = 0; i != poly.all->length(); ++i)
    {
      uint32_t wid = stride_shift(poly, i);

      assert( ! cycle_get(poly,wid) );

      assert( ! (min_depths_get(poly, wid) == default_depth && parent_get(poly, wid)) );

      assert( ! (min_depths_get(poly, wid) == default_depth && fabsf(poly.all->reg.weight_vector[wid]) > 0) );
      //assert( min_depths_get(poly, wid) != default_depth && fabsf(poly.all->reg.weight_vector[wid]) < tolerance );

      assert( ! (poly.depthsbits[wid_mask_un_shifted(poly, wid) * 2 + 1] & ~(parent_bit + cycle_bit + indicator_bit)) );
    }
  }
#endif //NDEBUG

  //Note.  OUTPUT & INPUT masked.
  //It is very important that this function is invariant to stride.
  inline uint32_t child_wid(const stagewise_poly &poly, uint32_t wi_atomic, uint32_t wi_general)
  {
    assert(wi_atomic == wid_mask(poly, wi_atomic));
    assert(wi_general == wid_mask(poly, wi_general));
    assert((wi_atomic & (stride_shift(poly, 1) - 1)) == 0);
    assert((wi_general & (stride_shift(poly, 1) - 1)) == 0);

    if (wi_atomic == constant_feat_masked(poly))
      return wi_general;
    else if (wi_general == constant_feat_masked(poly))
      return wi_atomic;
    else {
      //This is basically the "Fowler–Noll–Vo" hash.  Ideally, the hash would be invariant
      //to the monomial, whereas this here is sensitive to the path followed, but whatever.
      //The two main differences from FNV are: (1) the "*constant" case should also involve
      //a big prime (so the default hash is not the identity on small inputs), and (2) the
      //table size should not just be a power of 2, but some big prime.
      return wid_mask(poly, stride_shift(poly, stride_un_shift(poly, wi_atomic)
            ^ (16777619 * stride_un_shift(poly, wi_general))));
    }
  }
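
  //A consequence of the hash above (per its comment): child_wid(poly, a, b)
  //and child_wid(poly, b, a) generally differ, so the same monomial reached
  //along different dfs paths can land on different weights; only products
  //with the constant feature are guaranteed a stable id.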

  void sort_data_create(stagewise_poly &poly)
  {
    poly.sd = NULL;
    poly.sd_len = 0;
  }

  void sort_data_ensure_sz(stagewise_poly &poly, uint32_t len)
  {
    if (poly.sd_len < len) {
      uint32_t len_candidate = 2 * len;
#ifdef DEBUG
      cout << "resizing sort buffer; current size " << poly.sd_len;
#endif //DEBUG
      poly.sd_len = (len_candidate > poly.all->length()) ? (uint32_t)poly.all->length() : len_candidate;
#ifdef DEBUG
      cout << ", new size " << poly.sd_len << endl;
#endif //DEBUG
      free(poly.sd); //okay for null.
      poly.sd = calloc_or_die<sort_data>(poly.sd_len);
    }
    assert(len <= poly.sd_len);
  }

  void sort_data_destroy(stagewise_poly &poly)
  {
    free(poly.sd);
  }

#ifdef DEBUG
  int sort_data_compar(const void *a_v, const void *b_v)
  {
    return 2 * ( ((sort_data *) a_v)->wval < ((sort_data *) b_v)->wval ) - 1;
  }
#endif //DEBUG

  int sort_data_compar_heap(sort_data &a_v, sort_data &b_v)
  {
    return (a_v.wval > b_v.wval);
  }
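
  //With this comparator, make_heap/push_heap maintain a min-heap on wval:
  //poly.sd (the root) always holds the smallest retained score, which is
  //what sort_data_update_support() tests when deciding whether an incoming
  //feature displaces the current top-k minimum.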

  /*
   * Performance note.
   *
   * On my laptop (Intel(R) Core(TM) i7-3520M CPU @ 2.90GHz), with compiler
   * optimizations enabled, this routine takes ~0.001 seconds with -b 18 and
   * ~0.06 seconds with -b 24.  Since it is intended to run ~8 times in 1-pass
   * mode and otherwise once per pass, it is considered adequate.
   *
   * Another choice (implemented in another version) is to never process the
   * whole weight vector (e.g., by updating a hash set of nonzero weights after
   * creating the synthetic example, and only processing that here).  This
   * choice was implemented in a similar algorithm and performed well.
   */
  void sort_data_update_support(stagewise_poly &poly)
  {
    assert(poly.num_examples);

    //ft_offset affects parent_toggle / parent_get.  This state must be reset at end.
    assert(poly.original_ec);
    uint32_t pop_ft_offset = poly.original_ec->ft_offset;
    poly.synth_ec.ft_offset = 0;
    poly.original_ec->ft_offset = 0;

    uint32_t num_new_features = (uint32_t)pow(poly.sum_input_sparsity * 1.0f / poly.num_examples, poly.sched_exponent);
    num_new_features = (num_new_features > poly.all->length()) ? (uint32_t)poly.all->length() : num_new_features;
    sort_data_ensure_sz(poly, num_new_features);

    sort_data *heap_end = poly.sd;
    make_heap(poly.sd, heap_end, sort_data_compar_heap); //redundant
    for (uint32_t i = 0; i != poly.all->length(); ++i) {
      uint32_t wid = stride_shift(poly, i);
      if (!parent_get(poly, wid) && wid != constant_feat_masked(poly)) {
        float wval = (fabsf(poly.all->reg.weight_vector[wid])
            * poly.all->reg.weight_vector[poly.all->normalized_idx + (wid)])
          /*
           * here's some depth penalization code.  It was found to not improve
           * statistical performance, and meanwhile it is verified as giving
           * a nontrivial computational hit, thus commented out.
           *
           * - poly.magic_argument
           * sqrtf(min_depths_get(poly, stride_shift(poly, i)) * 1.0 / poly.num_examples)
           */
          ;
        if (wval > tolerance) {
          assert(heap_end >= poly.sd);
          assert(heap_end <= poly.sd + num_new_features);

          if (heap_end - poly.sd == (int)num_new_features && poly.sd->wval < wval) {
            pop_heap(poly.sd, heap_end, sort_data_compar_heap);
            --heap_end;
          }

          assert(heap_end >= poly.sd);
          assert(heap_end < poly.sd + poly.sd_len);

          if (heap_end - poly.sd < (int)num_new_features) {
            heap_end->wval = wval;
            heap_end->wid = wid;
            ++heap_end;
            push_heap(poly.sd, heap_end, sort_data_compar_heap);
          }
        }
      }
    }
    num_new_features = (uint32_t) (heap_end - poly.sd);

#ifdef DEBUG
    //eyeballing weights a pain if unsorted.
    qsort(poly.sd, num_new_features, sizeof(sort_data), sort_data_compar);
#endif //DEBUG

    for (uint32_t pos = 0; pos < num_new_features && pos < poly.sd_len; ++pos) {
      assert(!parent_get(poly, poly.sd[pos].wid)
          && poly.sd[pos].wval > tolerance
          && poly.sd[pos].wid != constant_feat_masked(poly));
      parent_toggle(poly, poly.sd[pos].wid);
#ifdef DEBUG
      cout
        << "Adding feature " << pos << "/" << num_new_features
        << " || wid " << poly.sd[pos].wid
        << " || sort value " << poly.sd[pos].wval
        << endl;
#endif //DEBUG
    }

#ifdef DEBUG
    cout << "depths:";
    for (uint32_t depth = 0; depth <= poly.max_depth && depth < sizeof(poly.depths) / sizeof(*poly.depths); ++depth)
      cout << "  [" << depth << "] = " << poly.depths[depth];
    cout << endl;

    cout << "Sanity check after sort... " << flush;
    sanity_check_state(poly);
    cout << "done" << endl;
#endif //DEBUG

    //it's okay that these may have been initially unequal; synth_ec value irrelevant so far.
    poly.original_ec->ft_offset = pop_ft_offset;
    poly.synth_ec.ft_offset = pop_ft_offset;
  }

  void synthetic_reset(stagewise_poly &poly, example &ec)
  {
    poly.synth_ec.l = ec.l;
    poly.synth_ec.tag = ec.tag;
    poly.synth_ec.example_counter = ec.example_counter;

    /**
     * Some comments on ft_offset.
     *
     * The plan is to do the feature mapping dfs with weight indices ignoring
     * the ft_offset.  This is because ft_offset is then added at the end,
     * guaranteeing local/strided access on synth_ec.  This might not matter
     * too much in this implementation (where, e.g., --oaa runs one after the
     * other, not interleaved), but who knows.
     *
     * (The other choice is to basically ignore adjusting for ft_offset when
     * doing the traversal, which means synth_ec.ft_offset is 0 here...)
     *
     * Anyway, so here is how ft_offset matters:
     *   - synthetic_create_rec must "normalize it out" of the fed weight value
     *   - parent and min_depths set/get are adjusted for it.
     *   - cycle set/get are not adjusted for it, since it doesn't matter for them.
     *   - operations on the whole weight vector (sorting, save_load, all_reduce)
     *     ignore ft_offset, just treat the thing as a flat vector.
     **/
    poly.synth_ec.ft_offset = ec.ft_offset;

    poly.synth_ec.test_only = ec.test_only;
    poly.synth_ec.end_pass = ec.end_pass;
    poly.synth_ec.sorted = ec.sorted;
    poly.synth_ec.in_use = ec.in_use;

    poly.synth_ec.atomics[tree_atomics].erase();
    poly.synth_ec.audit_features[tree_atomics].erase();
    poly.synth_ec.num_features = 0;
    poly.synth_ec.total_sum_feat_sq = 0;
    poly.synth_ec.sum_feat_sq[tree_atomics] = 0;
    poly.synth_ec.example_t = ec.example_t;

    if (poly.synth_ec.indices.size()==0)
      poly.synth_ec.indices.push_back(tree_atomics);
  }
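
  //note: the synthetic example keeps all of its features in the single
  //tree_atomics namespace, so the erase/zero calls above fully reset it
  //between input examples.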

  void synthetic_decycle(stagewise_poly &poly)
  {
    for (feature *f = poly.synth_ec.atomics[tree_atomics].begin;
        f != poly.synth_ec.atomics[tree_atomics].end; ++f) {
      assert(cycle_get(poly, f->weight_index));
      cycle_toggle(poly, f->weight_index);
    }
  }

  void synthetic_create_rec(stagewise_poly &poly, float v, float &w)
  {
    //Note: need to un_ft_shift since gd::foreach_feature bakes in the offset.
    uint32_t wid_atomic = wid_mask(poly, un_ft_offset(poly, (uint32_t)((&w - poly.all->reg.weight_vector))));
    uint32_t wid_cur = child_wid(poly, wid_atomic, poly.synth_rec_f.weight_index);
    assert(wid_atomic % stride_shift(poly, 1) == 0);

    //Note: only mutate learner state when in training mode.  This is because
    //the average test errors across multiple data sets should be equal to
    //the test error on the merged dataset (which is violated if the code
    //below is run at test time).
    if (poly.cur_depth < min_depths_get(poly, wid_cur) && poly.training) {
      if (parent_get(poly, wid_cur)) {
#ifdef DEBUG
        cout
          << "FOUND A TRANSPLANT!!! moving [" << wid_cur
          << "] from depth " << (uint32_t) min_depths_get(poly, wid_cur)
          << " to depth " << poly.cur_depth << endl;
#endif //DEBUG
        //XXX arguably, should also fear transplants that occurred with
        //a different ft_offset; e.g., need to look out for cross-reduction
        //collisions.  Have not played with this issue yet...
        parent_toggle(poly, wid_cur);
      }
      min_depths_set(poly, wid_cur, poly.cur_depth);
    }

    if ( ! cycle_get(poly, wid_cur)
        && ((poly.cur_depth > default_depth ? default_depth : poly.cur_depth) == min_depths_get(poly, wid_cur))
       ) {
      cycle_toggle(poly, wid_cur);

#ifdef DEBUG
      ++poly.depths[poly.cur_depth];
#endif //DEBUG

      feature new_f = { v * poly.synth_rec_f.x, wid_cur };
      poly.synth_ec.atomics[tree_atomics].push_back(new_f);
      poly.synth_ec.num_features++;
      poly.synth_ec.sum_feat_sq[tree_atomics] += new_f.x * new_f.x;

      if (parent_get(poly, new_f.weight_index)) {
        feature parent_f = poly.synth_rec_f;
        poly.synth_rec_f = new_f;
        ++poly.cur_depth;
#ifdef DEBUG
        poly.max_depth = (poly.max_depth > poly.cur_depth) ? poly.max_depth : poly.cur_depth;
#endif //DEBUG
        GD::foreach_feature<stagewise_poly, synthetic_create_rec>(*(poly.all), *(poly.original_ec), poly);
        --poly.cur_depth;
        poly.synth_rec_f = parent_f;
      }
    }
  }
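
  //Summary of the dfs above: each input feature is multiplied into the
  //current path feature (synth_rec_f), the product is emitted at most once
  //per example (cycle bits suppress duplicates), and the recursion deepens
  //only at weights flagged as parents by sort_data_update_support().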

  void synthetic_create(stagewise_poly &poly, example &ec, bool training)
  {
    synthetic_reset(poly, ec);

    poly.cur_depth = 0;

    poly.synth_rec_f.x = 1.0;
    poly.synth_rec_f.weight_index = constant_feat_masked(poly); //note: not ft_offset'd
    poly.training = training;
    /*
     * Another choice is to mark the constant feature as the single initial
     * parent, and recurse just on that feature (which arguably correctly interprets poly.cur_depth).
     * Problem with this is if there is a collision with the root...
     */
    GD::foreach_feature<stagewise_poly, synthetic_create_rec>(*poly.all, *poly.original_ec, poly);
    synthetic_decycle(poly);
    poly.synth_ec.total_sum_feat_sq = poly.synth_ec.sum_feat_sq[tree_atomics];

    if (training) {
      poly.sum_sparsity += poly.synth_ec.num_features;
      poly.sum_input_sparsity += ec.num_features;
      poly.num_examples += 1;
    }
  }

  void predict(stagewise_poly &poly, base_learner &base, example &ec)
  {
    poly.original_ec = &ec;
    synthetic_create(poly, ec, false);
    base.predict(poly.synth_ec);
    ec.partial_prediction = poly.synth_ec.partial_prediction;
    ec.updated_prediction = poly.synth_ec.updated_prediction;
    ec.pred.scalar = poly.synth_ec.pred.scalar;
  }

  void learn(stagewise_poly &poly, base_learner &base, example &ec)
  {
    bool training = poly.all->training && ec.l.simple.label != FLT_MAX;
    poly.original_ec = &ec;

    if (training) {
      if (poly.update_support) {
        sort_data_update_support(poly);
        poly.update_support = false;
      }

      synthetic_create(poly, ec, training);
      base.learn(poly.synth_ec);
      ec.partial_prediction = poly.synth_ec.partial_prediction;
      ec.updated_prediction = poly.synth_ec.updated_prediction;
      ec.pred.scalar = poly.synth_ec.pred.scalar;

      if (ec.example_counter
          //following line is to avoid repeats when multiple reductions on same example.
          //XXX ideally, would get all "copies" of an example before scheduling the support
          //update, but how do we know?
          && poly.last_example_counter != ec.example_counter
          && poly.batch_sz
          && ( (poly.batch_sz_double && !(ec.example_counter % poly.next_batch_sz))
            || (!poly.batch_sz_double && !(ec.example_counter % poly.batch_sz)))) {
        poly.next_batch_sz *= 2; //no effect when !poly.batch_sz_double
        poly.update_support = (poly.all->span_server == "" || poly.numpasses == 1);
      }
      poly.last_example_counter = ec.example_counter;
    } else
      predict(poly, base, ec);
  }
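
  //Scheduling note: with doubling (the default), support updates fire when
  //the example counter hits batch_sz, 2*batch_sz, 4*batch_sz, ...; with
  //--batch_sz_no_doubling they fire every batch_sz examples.  In parallel
  //multi-pass runs the trigger is suppressed here, and when batch_sz is
  //zero, updates are instead scheduled once per pass in end_pass() below.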


  void reduce_min(uint8_t &v1, const uint8_t &v2)
  {
    if (v1 == default_depth)
      v1 = v2;
    else if (v2 != default_depth)
      v1 = (v1 <= v2) ? v1 : v2;
  }
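
  //allreduce combiner for depth bytes: default_depth means "never seen", so
  //it is treated as missing rather than as a genuine minimum.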

  void reduce_min_max(uint8_t &v1, const uint8_t &v2)
  {
    bool parent_or_depth = (v1 & indicator_bit) != 0;
    bool p_or_d2 = (v2 & indicator_bit) != 0;
    if (parent_or_depth != p_or_d2) {
#ifdef DEBUG
      cout << "Reducing parent with depth!!!!!";
#endif //DEBUG
      return;
    }

    if (parent_or_depth)
      v1 = (v1 >= v2) ? v1 : v2;
    else {
      if (v1 == default_depth)
        v1 = v2;
      else if (v2 != default_depth)
        v1 = (v1 <= v2) ? v1 : v2;
    }
  }
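
  //allreduce combiner for the interleaved depthsbits array: bytes with
  //indicator_bit set are flags bytes and are combined by max (with cycle
  //bits clear between examples, this amounts to unioning the parent bits),
  //while depth bytes get the same missing-aware min as reduce_min above.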

  void end_pass(stagewise_poly &poly)
  {
    if (!!poly.batch_sz || (poly.all->span_server != "" && poly.numpasses > 1))
      return;

    uint64_t sum_sparsity_inc = poly.sum_sparsity - poly.sum_sparsity_sync;
    uint64_t sum_input_sparsity_inc = poly.sum_input_sparsity - poly.sum_input_sparsity_sync;
    uint64_t num_examples_inc = poly.num_examples - poly.num_examples_sync;

#ifdef DEBUG
    cout << "Sanity before allreduce\n";
    sanity_check_state(poly);
#endif //DEBUG

    vw &all = *poly.all;
    if (all.span_server != "") {
      /*
       * The following is inconsistent with the transplant code in
       * synthetic_create_rec(), which clears parent bits on depth mismatches.
       * But it's unclear what the right behavior is in general for either
       * case...
       */
      all_reduce<uint8_t, reduce_min_max>(poly.depthsbits, depthsbits_sizeof(poly),
          all.span_server, all.unique_id, all.total, all.node, all.socks);

      sum_input_sparsity_inc = (uint64_t)accumulate_scalar(all, all.span_server, (float)sum_input_sparsity_inc);
      sum_sparsity_inc = (uint64_t)accumulate_scalar(all, all.span_server, (float)sum_sparsity_inc);
      num_examples_inc = (uint64_t)accumulate_scalar(all, all.span_server, (float)num_examples_inc);
    }

    poly.sum_input_sparsity_sync = poly.sum_input_sparsity_sync + sum_input_sparsity_inc;
    poly.sum_input_sparsity = poly.sum_input_sparsity_sync;
    poly.sum_sparsity_sync = poly.sum_sparsity_sync + sum_sparsity_inc;
    poly.sum_sparsity = poly.sum_sparsity_sync;
    poly.num_examples_sync = poly.num_examples_sync + num_examples_inc;
    poly.num_examples = poly.num_examples_sync;

#ifdef DEBUG
    cout << "Sanity after allreduce\n";
    sanity_check_state(poly);
#endif //DEBUG

    if (poly.numpasses != poly.all->numpasses) {
      poly.update_support = true;
      poly.numpasses++;
    }
  }

  void finish_example(vw &all, stagewise_poly &poly, example &ec)
  {
    size_t temp_num_features = ec.num_features;
    ec.num_features = poly.synth_ec.num_features;
    output_and_account_example(all, ec);
    ec.num_features = temp_num_features;
    VW::finish_example(all, &ec);
  }

  void finish(stagewise_poly &poly)
  {
#ifdef DEBUG
    cout << "total feature number (after poly expansion!) = " << poly.sum_sparsity << endl;
#endif //DEBUG

    poly.synth_ec.atomics[tree_atomics].delete_v();
    sort_data_destroy(poly);
    depthsbits_destroy(poly);
  }


  void save_load(stagewise_poly &poly, io_buf &model_file, bool read, bool text)
  {
    if (model_file.files.size() > 0)
      bin_text_read_write_fixed(model_file, (char *) poly.depthsbits, depthsbits_sizeof(poly), "", read, "", 0, text);

    //unfortunately, the following can't go here, since save_load is called
    //before gd::save_load and thus the weight vector state is uninitialized.
    //#ifdef DEBUG
    //      cout << "Sanity check after save_load... " << flush;
    //      sanity_check_state(poly);
    //      cout << "done" << endl;
    //#endif //DEBUG
  }

  base_learner *stagewise_poly_setup(vw &all)
  {
    if (missing_option(all, true, "stage_poly", "use stagewise polynomial feature learning"))
      return NULL;

    new_options(all, "Stagewise poly options")
      ("sched_exponent", po::value<float>(), "exponent controlling quantity of included features")
      ("batch_sz", po::value<uint32_t>(), "multiplier on batch size before including more features")
      ("batch_sz_no_doubling", "batch_sz does not double")
#ifdef MAGIC_ARGUMENT
      ("magic_argument", po::value<float>(), "magical feature flag")
#endif //MAGIC_ARGUMENT
      ;
    add_options(all);

    po::variables_map &vm = all.vm;
    stagewise_poly &poly = calloc_or_die<stagewise_poly>();
    poly.all = &all;
    depthsbits_create(poly);
    sort_data_create(poly);

    poly.sched_exponent = vm.count("sched_exponent") ? vm["sched_exponent"].as<float>() : 1.f;
    poly.batch_sz = vm.count("batch_sz") ? vm["batch_sz"].as<uint32_t>() : 1000;
    poly.batch_sz_double = vm.count("batch_sz_no_doubling") ? false : true;
#ifdef MAGIC_ARGUMENT
    poly.magic_argument = vm.count("magic_argument") ? vm["magic_argument"].as<float>() : 0.;
#endif //MAGIC_ARGUMENT

    poly.sum_sparsity = 0;
    poly.sum_input_sparsity = 0;
    poly.num_examples = 0;
    poly.sum_sparsity_sync = 0;
    poly.sum_input_sparsity_sync = 0;
    poly.num_examples_sync = 0;
    poly.last_example_counter = -1;
    poly.numpasses = 1;
    poly.update_support = false;
    poly.original_ec = NULL;
    poly.next_batch_sz = poly.batch_sz;

    learner<stagewise_poly> &l = init_learner(&poly, setup_base(all), learn, predict);
    l.set_finish(finish);
    l.set_save_load(save_load);
    l.set_finish_example(finish_example);
    l.set_end_pass(end_pass);

    return make_base(l);
  }