1 /*
2 Copyright (c) by respective owners including Yahoo!, Microsoft, and
3 individual contributors. All rights reserved.  Released under a BSD (revised)
4 license as described in the file LICENSE.
5  */
6 #include "cache.h"
7 #include "unique_sort.h"
8 #include "global_data.h"
9 
10 const size_t neg_1 = 1;
11 const size_t general = 2;
12 
run_len_decode(char * p,uint32_t & i)13 char* run_len_decode(char *p, uint32_t& i)
14 {// read an int 7 bits at a time.
15   size_t count = 0;
16   while(*p & 128)\
17     i = i | ((*(p++) & 127) << 7*count++);
18   i = i | (*(p++) << 7*count);
19   return p;
20 }
21 
ZigZagDecode(uint32_t n)22 inline int32_t ZigZagDecode(uint32_t n) { return (n >> 1) ^ -static_cast<int32_t>(n & 1); }
23 
read_cached_tag(io_buf & cache,example * ae)24 size_t read_cached_tag(io_buf& cache, example* ae)
25 {
26   char* c;
27   size_t tag_size;
28   if (buf_read(cache, c, sizeof(tag_size)) < sizeof(tag_size))
29     return 0;
30   tag_size = *(size_t*)c;
31   c += sizeof(tag_size);
32   cache.set(c);
33   if (buf_read(cache, c, tag_size) < tag_size)
34     return 0;
35 
36   ae->tag.erase();
37   push_many(ae->tag, c, tag_size);
38   return tag_size+sizeof(tag_size);
39 }
40 
41 struct one_float { float f; }
42 #ifndef _WIN32
43 __attribute__((packed))
44 #endif
45 	;
46 
read_cached_features(void * in,example * ec)47 int read_cached_features(void* in, example* ec)
48 {
49   vw* all = (vw*)in;
50   example* ae = (example*)ec;
51   ae->sorted = all->p->sorted_cache;
52   io_buf* input = all->p->input;
53 
54   size_t total = all->p->lp.read_cached_label(all->sd, &ae->l, *input);
55   if (total == 0)
56     return 0;
57   if (read_cached_tag(*input,ae) == 0)
58     return 0;
59   char* c;
60   unsigned char num_indices = 0;
61   if (buf_read(*input, c, sizeof(num_indices)) < sizeof(num_indices))
62     return 0;
63   num_indices = *(unsigned char*)c;
64   c += sizeof(num_indices);
65 
66   all->p->input->set(c);
67   for (;num_indices > 0; num_indices--)
68     {
69       size_t temp;
70       unsigned char index = 0;
71       if((temp = buf_read(*input,c,sizeof(index) + sizeof(size_t))) < sizeof(index) + sizeof(size_t)) {
72 	cerr << "truncated example! " << temp << " " << char_size + sizeof(size_t) << endl;
73 	return 0;
74       }
75 
76       index = *(unsigned char*)c;
77       c+= sizeof(index);
78       ae->indices.push_back((size_t)index);
79       v_array<feature>* ours = ae->atomics+index;
80       float* our_sum_feat_sq = ae->sum_feat_sq+index;
81       size_t storage = *(size_t *)c;
82       c += sizeof(size_t);
83       all->p->input->set(c);
84       total += storage;
85      if (buf_read(*input,c,storage) < storage) {
86 	cerr << "truncated example! wanted: " << storage << " bytes" << endl;
87 	return 0;
88       }
89 
90       char *end = c+storage;
91 
92       uint32_t last = 0;
93 
94       for (;c!= end;)
95 	{
96 	  feature f = {1., 0};
97 	  c = run_len_decode(c,f.weight_index);
98 	  if (f.weight_index & neg_1)
99 	    f.x = -1.;
100 	  else if (f.weight_index & general)	    {
101 	      f.x = ((one_float *)c)->f;
102 	      c += sizeof(float);
103 	    }
104 	  *our_sum_feat_sq += f.x*f.x;
105           uint32_t diff = f.weight_index >> 2;
106 
107           int32_t s_diff = ZigZagDecode(diff);
108 	  if (s_diff < 0)
109 	    ae->sorted = false;
110 	  f.weight_index = last + s_diff;
111 	  last = f.weight_index;
112 	  ours->push_back(f);
113 	}
114       all->p->input->set(c);
115     }
116 
117   return (int)total;
118 }
119 
run_len_encode(char * p,size_t i)120 char* run_len_encode(char *p, size_t i)
121 {// store an int 7 bits at a time.
122   while (i >= 128)
123     {
124       *(p++) = (i & 127) | 128;
125       i = i >> 7;
126     }
127   *(p++) = (i & 127);
128   return p;
129 }
130 
ZigZagEncode(int32_t n)131 inline uint32_t ZigZagEncode(int32_t n) {
132   uint32_t ret = (n << 1) ^ (n >> 31);
133   return ret;
134 }
135 
output_byte(io_buf & cache,unsigned char s)136 void output_byte(io_buf& cache, unsigned char s)
137 {
138   char *c;
139 
140   buf_write(cache, c, 1);
141   *(c++) = s;
142   cache.set(c);
143 }
144 
output_features(io_buf & cache,unsigned char index,feature * begin,feature * end,uint32_t mask)145 void output_features(io_buf& cache, unsigned char index, feature* begin, feature* end, uint32_t mask)
146 {
147   char* c;
148   size_t storage = (end-begin) * int_size;
149   for (feature* i = begin; i != end; i++)
150     if (i->x != 1. && i->x != -1.)
151       storage+=sizeof(float);
152   buf_write(cache, c, sizeof(index) + storage + sizeof(size_t));
153   *(unsigned char*)c = index;
154   c += sizeof(index);
155 
156   char *storage_size_loc = c;
157   c += sizeof(size_t);
158 
159   uint32_t last = 0;
160 
161   for (feature* i = begin; i != end; i++)
162     {
163       uint32_t cache_index = (i->weight_index) & mask;
164       int32_t s_diff = (cache_index - last);
165       size_t diff = ZigZagEncode(s_diff) << 2;
166       last = cache_index;
167       if (i->x == 1.)
168 	c = run_len_encode(c, diff);
169       else if (i->x == -1.)
170 	c = run_len_encode(c, diff | neg_1);
171       else {
172 	c = run_len_encode(c, diff | general);
173 	*(float *)c = i->x;
174 	c += sizeof(float);
175       }
176     }
177   cache.set(c);
178   *(size_t*)storage_size_loc = c - storage_size_loc - sizeof(size_t);
179 }
180 
cache_tag(io_buf & cache,v_array<char> tag)181 void cache_tag(io_buf& cache, v_array<char> tag)
182 {
183   char *c;
184   buf_write(cache, c, sizeof(size_t)+tag.size());
185   *(size_t*)c = tag.size();
186   c += sizeof(size_t);
187   memcpy(c, tag.begin, tag.size());
188   c += tag.size();
189   cache.set(c);
190 }
191 
cache_features(io_buf & cache,example * ae,uint32_t mask)192 void cache_features(io_buf& cache, example* ae, uint32_t mask)
193 {
194   cache_tag(cache,ae->tag);
195   output_byte(cache, (unsigned char) ae->indices.size());
196   for (unsigned char* b = ae->indices.begin; b != ae->indices.end; b++)
197     output_features(cache, *b, ae->atomics[*b].begin,ae->atomics[*b].end, mask);
198 }
199