1 /*
2 Copyright (c) by respective owners including Yahoo!, Microsoft, and
3 individual contributors. All rights reserved. Released under a BSD (revised)
4 license as described in the file LICENSE.
5 */
6 #include "cache.h"
7 #include "unique_sort.h"
8 #include "global_data.h"
9
10 const size_t neg_1 = 1;
11 const size_t general = 2;
12
run_len_decode(char * p,uint32_t & i)13 char* run_len_decode(char *p, uint32_t& i)
14 {// read an int 7 bits at a time.
15 size_t count = 0;
16 while(*p & 128)\
17 i = i | ((*(p++) & 127) << 7*count++);
18 i = i | (*(p++) << 7*count);
19 return p;
20 }
21
ZigZagDecode(uint32_t n)22 inline int32_t ZigZagDecode(uint32_t n) { return (n >> 1) ^ -static_cast<int32_t>(n & 1); }
23
read_cached_tag(io_buf & cache,example * ae)24 size_t read_cached_tag(io_buf& cache, example* ae)
25 {
26 char* c;
27 size_t tag_size;
28 if (buf_read(cache, c, sizeof(tag_size)) < sizeof(tag_size))
29 return 0;
30 tag_size = *(size_t*)c;
31 c += sizeof(tag_size);
32 cache.set(c);
33 if (buf_read(cache, c, tag_size) < tag_size)
34 return 0;
35
36 ae->tag.erase();
37 push_many(ae->tag, c, tag_size);
38 return tag_size+sizeof(tag_size);
39 }
40
41 struct one_float { float f; }
42 #ifndef _WIN32
43 __attribute__((packed))
44 #endif
45 ;
46
read_cached_features(void * in,example * ec)47 int read_cached_features(void* in, example* ec)
48 {
49 vw* all = (vw*)in;
50 example* ae = (example*)ec;
51 ae->sorted = all->p->sorted_cache;
52 io_buf* input = all->p->input;
53
54 size_t total = all->p->lp.read_cached_label(all->sd, &ae->l, *input);
55 if (total == 0)
56 return 0;
57 if (read_cached_tag(*input,ae) == 0)
58 return 0;
59 char* c;
60 unsigned char num_indices = 0;
61 if (buf_read(*input, c, sizeof(num_indices)) < sizeof(num_indices))
62 return 0;
63 num_indices = *(unsigned char*)c;
64 c += sizeof(num_indices);
65
66 all->p->input->set(c);
67 for (;num_indices > 0; num_indices--)
68 {
69 size_t temp;
70 unsigned char index = 0;
71 if((temp = buf_read(*input,c,sizeof(index) + sizeof(size_t))) < sizeof(index) + sizeof(size_t)) {
72 cerr << "truncated example! " << temp << " " << char_size + sizeof(size_t) << endl;
73 return 0;
74 }
75
76 index = *(unsigned char*)c;
77 c+= sizeof(index);
78 ae->indices.push_back((size_t)index);
79 v_array<feature>* ours = ae->atomics+index;
80 float* our_sum_feat_sq = ae->sum_feat_sq+index;
81 size_t storage = *(size_t *)c;
82 c += sizeof(size_t);
83 all->p->input->set(c);
84 total += storage;
85 if (buf_read(*input,c,storage) < storage) {
86 cerr << "truncated example! wanted: " << storage << " bytes" << endl;
87 return 0;
88 }
89
90 char *end = c+storage;
91
92 uint32_t last = 0;
93
94 for (;c!= end;)
95 {
96 feature f = {1., 0};
97 c = run_len_decode(c,f.weight_index);
98 if (f.weight_index & neg_1)
99 f.x = -1.;
100 else if (f.weight_index & general) {
101 f.x = ((one_float *)c)->f;
102 c += sizeof(float);
103 }
104 *our_sum_feat_sq += f.x*f.x;
105 uint32_t diff = f.weight_index >> 2;
106
107 int32_t s_diff = ZigZagDecode(diff);
108 if (s_diff < 0)
109 ae->sorted = false;
110 f.weight_index = last + s_diff;
111 last = f.weight_index;
112 ours->push_back(f);
113 }
114 all->p->input->set(c);
115 }
116
117 return (int)total;
118 }
119
run_len_encode(char * p,size_t i)120 char* run_len_encode(char *p, size_t i)
121 {// store an int 7 bits at a time.
122 while (i >= 128)
123 {
124 *(p++) = (i & 127) | 128;
125 i = i >> 7;
126 }
127 *(p++) = (i & 127);
128 return p;
129 }
130
ZigZagEncode(int32_t n)131 inline uint32_t ZigZagEncode(int32_t n) {
132 uint32_t ret = (n << 1) ^ (n >> 31);
133 return ret;
134 }
135
output_byte(io_buf & cache,unsigned char s)136 void output_byte(io_buf& cache, unsigned char s)
137 {
138 char *c;
139
140 buf_write(cache, c, 1);
141 *(c++) = s;
142 cache.set(c);
143 }
144
output_features(io_buf & cache,unsigned char index,feature * begin,feature * end,uint32_t mask)145 void output_features(io_buf& cache, unsigned char index, feature* begin, feature* end, uint32_t mask)
146 {
147 char* c;
148 size_t storage = (end-begin) * int_size;
149 for (feature* i = begin; i != end; i++)
150 if (i->x != 1. && i->x != -1.)
151 storage+=sizeof(float);
152 buf_write(cache, c, sizeof(index) + storage + sizeof(size_t));
153 *(unsigned char*)c = index;
154 c += sizeof(index);
155
156 char *storage_size_loc = c;
157 c += sizeof(size_t);
158
159 uint32_t last = 0;
160
161 for (feature* i = begin; i != end; i++)
162 {
163 uint32_t cache_index = (i->weight_index) & mask;
164 int32_t s_diff = (cache_index - last);
165 size_t diff = ZigZagEncode(s_diff) << 2;
166 last = cache_index;
167 if (i->x == 1.)
168 c = run_len_encode(c, diff);
169 else if (i->x == -1.)
170 c = run_len_encode(c, diff | neg_1);
171 else {
172 c = run_len_encode(c, diff | general);
173 *(float *)c = i->x;
174 c += sizeof(float);
175 }
176 }
177 cache.set(c);
178 *(size_t*)storage_size_loc = c - storage_size_loc - sizeof(size_t);
179 }
180
cache_tag(io_buf & cache,v_array<char> tag)181 void cache_tag(io_buf& cache, v_array<char> tag)
182 {
183 char *c;
184 buf_write(cache, c, sizeof(size_t)+tag.size());
185 *(size_t*)c = tag.size();
186 c += sizeof(size_t);
187 memcpy(c, tag.begin, tag.size());
188 c += tag.size();
189 cache.set(c);
190 }
191
cache_features(io_buf & cache,example * ae,uint32_t mask)192 void cache_features(io_buf& cache, example* ae, uint32_t mask)
193 {
194 cache_tag(cache,ae->tag);
195 output_byte(cache, (unsigned char) ae->indices.size());
196 for (unsigned char* b = ae->indices.begin; b != ae->indices.end; b++)
197 output_features(cache, *b, ae->atomics[*b].begin,ae->atomics[*b].end, mask);
198 }
199