1 /**
2  * @file   dd_compressor.cc
3  *
4  * @section LICENSE
5  *
6  * The MIT License
7  *
8  * @copyright Copyright (c) 2017-2021 TileDB, Inc.
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  *
28  * @section DESCRIPTION
29  *
30  * This file implements the double delta compressor class.
31  */
32 
33 #include "tiledb/sm/compressors/dd_compressor.h"
34 #include "tiledb/common/logger.h"
35 #include "tiledb/sm/buffer/buffer.h"
36 #include "tiledb/sm/enums/datatype.h"
37 
38 /* ****************************** */
39 /*             MACROS             */
40 /* ****************************** */
41 
42 #define ABS(x) ((x) < 0) ? uint64_t(-(x)) : uint64_t(x)
43 
44 using namespace tiledb::common;
45 
46 namespace tiledb {
47 namespace sm {
48 
49 const uint64_t DoubleDelta::OVERHEAD = 17;
50 
51 /* ****************************** */
52 /*               API              */
53 /* ****************************** */
54 
compress(Datatype type,ConstBuffer * input_buffer,Buffer * output_buffer)55 Status DoubleDelta::compress(
56     Datatype type, ConstBuffer* input_buffer, Buffer* output_buffer) {
57   switch (type) {
58     case Datatype::INT8:
59       return DoubleDelta::compress<int8_t>(input_buffer, output_buffer);
60     case Datatype::UINT8:
61       return DoubleDelta::compress<uint8_t>(input_buffer, output_buffer);
62     case Datatype::INT16:
63       return DoubleDelta::compress<int16_t>(input_buffer, output_buffer);
64     case Datatype::UINT16:
65       return DoubleDelta::compress<uint16_t>(input_buffer, output_buffer);
66     case Datatype::INT32:
67       return DoubleDelta::compress<int>(input_buffer, output_buffer);
68     case Datatype::UINT32:
69       return DoubleDelta::compress<uint32_t>(input_buffer, output_buffer);
70     case Datatype::INT64:
71       return DoubleDelta::compress<int64_t>(input_buffer, output_buffer);
72     case Datatype::UINT64:
73       return DoubleDelta::compress<uint64_t>(input_buffer, output_buffer);
74     case Datatype::CHAR:
75       return DoubleDelta::compress<char>(input_buffer, output_buffer);
76     case Datatype::DATETIME_YEAR:
77     case Datatype::DATETIME_MONTH:
78     case Datatype::DATETIME_WEEK:
79     case Datatype::DATETIME_DAY:
80     case Datatype::DATETIME_HR:
81     case Datatype::DATETIME_MIN:
82     case Datatype::DATETIME_SEC:
83     case Datatype::DATETIME_MS:
84     case Datatype::DATETIME_US:
85     case Datatype::DATETIME_NS:
86     case Datatype::DATETIME_PS:
87     case Datatype::DATETIME_FS:
88     case Datatype::DATETIME_AS:
89     case Datatype::TIME_HR:
90     case Datatype::TIME_MIN:
91     case Datatype::TIME_SEC:
92     case Datatype::TIME_MS:
93     case Datatype::TIME_US:
94     case Datatype::TIME_NS:
95     case Datatype::TIME_PS:
96     case Datatype::TIME_FS:
97     case Datatype::TIME_AS:
98       return DoubleDelta::compress<int64_t>(input_buffer, output_buffer);
99     case Datatype::STRING_ASCII:
100     case Datatype::STRING_UTF8:
101     case Datatype::STRING_UTF16:
102     case Datatype::STRING_UTF32:
103     case Datatype::STRING_UCS2:
104     case Datatype::STRING_UCS4:
105     case Datatype::ANY:
106       return DoubleDelta::compress<uint8_t>(input_buffer, output_buffer);
107     case Datatype::FLOAT32:
108     case Datatype::FLOAT64:
109       return LOG_STATUS(Status::CompressionError(
110           "Cannot compress tile with DoubleDelta; Float "
111           "datatypes are not supported"));
112   }
113 
114   assert(false);
115   return LOG_STATUS(Status::CompressionError(
116       "Cannot compress tile with DoubleDelta; Not supported datatype"));
117 }
118 
decompress(Datatype type,ConstBuffer * input_buffer,PreallocatedBuffer * output_buffer)119 Status DoubleDelta::decompress(
120     Datatype type,
121     ConstBuffer* input_buffer,
122     PreallocatedBuffer* output_buffer) {
123   switch (type) {
124     case Datatype::INT8:
125       return DoubleDelta::decompress<int8_t>(input_buffer, output_buffer);
126     case Datatype::UINT8:
127       return DoubleDelta::decompress<uint8_t>(input_buffer, output_buffer);
128     case Datatype::INT16:
129       return DoubleDelta::decompress<int16_t>(input_buffer, output_buffer);
130     case Datatype::UINT16:
131       return DoubleDelta::decompress<uint16_t>(input_buffer, output_buffer);
132     case Datatype::INT32:
133       return DoubleDelta::decompress<int>(input_buffer, output_buffer);
134     case Datatype::UINT32:
135       return DoubleDelta::decompress<uint32_t>(input_buffer, output_buffer);
136     case Datatype::INT64:
137       return DoubleDelta::decompress<int64_t>(input_buffer, output_buffer);
138     case Datatype::UINT64:
139       return DoubleDelta::decompress<uint64_t>(input_buffer, output_buffer);
140     case Datatype::CHAR:
141       return DoubleDelta::decompress<char>(input_buffer, output_buffer);
142     case Datatype::DATETIME_YEAR:
143     case Datatype::DATETIME_MONTH:
144     case Datatype::DATETIME_WEEK:
145     case Datatype::DATETIME_DAY:
146     case Datatype::DATETIME_HR:
147     case Datatype::DATETIME_MIN:
148     case Datatype::DATETIME_SEC:
149     case Datatype::DATETIME_MS:
150     case Datatype::DATETIME_US:
151     case Datatype::DATETIME_NS:
152     case Datatype::DATETIME_PS:
153     case Datatype::DATETIME_FS:
154     case Datatype::DATETIME_AS:
155     case Datatype::TIME_HR:
156     case Datatype::TIME_MIN:
157     case Datatype::TIME_SEC:
158     case Datatype::TIME_MS:
159     case Datatype::TIME_US:
160     case Datatype::TIME_NS:
161     case Datatype::TIME_PS:
162     case Datatype::TIME_FS:
163     case Datatype::TIME_AS:
164       return DoubleDelta::decompress<int64_t>(input_buffer, output_buffer);
165     case Datatype::STRING_ASCII:
166     case Datatype::STRING_UTF8:
167     case Datatype::STRING_UTF16:
168     case Datatype::STRING_UTF32:
169     case Datatype::STRING_UCS2:
170     case Datatype::STRING_UCS4:
171     case Datatype::ANY:
172       return DoubleDelta::decompress<uint8_t>(input_buffer, output_buffer);
173     case Datatype::FLOAT32:
174     case Datatype::FLOAT64:
175       return LOG_STATUS(Status::CompressionError(
176           "Cannot decompress tile with DoubleDelta; Float "
177           "datatypes are not supported"));
178   }
179 
180   assert(false);
181   return LOG_STATUS(Status::CompressionError(
182       "Cannot decompress tile with DoubleDelta; Not supported datatype"));
183 }
184 
overhead(uint64_t nbytes)185 uint64_t DoubleDelta::overhead(uint64_t nbytes) {
186   // DoubleDelta has a fixed size overhead
187   (void)nbytes;
188   return DoubleDelta::OVERHEAD;
189 }
190 
191 /* ****************************** */
192 /*         PRIVATE METHODS        */
193 /* ****************************** */
194 
195 template <class T>
compress(ConstBuffer * input_buffer,Buffer * output_buffer)196 Status DoubleDelta::compress(ConstBuffer* input_buffer, Buffer* output_buffer) {
197   // Calculate number of values and handle trivial case
198   uint64_t value_size = sizeof(T);
199   uint64_t num = input_buffer->size() / value_size;
200   assert(num > 0 && (input_buffer->size() % value_size == 0));
201 
202   // Calculate bitsize (ignoring the sign bit)
203   auto in = (T*)input_buffer->data();
204   unsigned int bitsize;
205   RETURN_NOT_OK(compute_bitsize(in, num, &bitsize));
206   assert(bitsize <= std::numeric_limits<uint8_t>::max());
207   auto bitsize_c = static_cast<uint8_t>(bitsize);
208 
209   // Write bitsize and number of values
210   RETURN_NOT_OK(output_buffer->write(&bitsize_c, sizeof(uint8_t)));
211   RETURN_NOT_OK(output_buffer->write(&num, sizeof(uint64_t)));
212 
213   // Trivial case - no compression
214   if (bitsize >= sizeof(T) * 8 - 1) {
215     RETURN_NOT_OK(output_buffer->write(in, input_buffer->size()));
216     return Status::Ok();
217   }
218 
219   // Write first value
220   RETURN_NOT_OK(output_buffer->write(&in[0], value_size));
221   if (num == 1)
222     return Status::Ok();
223 
224   // Write second value
225   RETURN_NOT_OK(output_buffer->write(&in[1], value_size));
226   if (num == 2)
227     return Status::Ok();
228 
229   // Write double deltas
230   int64_t prev_delta = int64_t(in[1]) - int64_t(in[0]);
231   int bit_in_chunk = 63;  // Leftmost bit (MSB)
232   uint64_t chunk = 0;
233   for (uint64_t i = 2; i < num; ++i) {
234     int64_t cur_delta = int64_t(in[i]) - int64_t(in[i - 1]);
235     int64_t dd = cur_delta - prev_delta;
236     RETURN_NOT_OK(
237         write_double_delta(output_buffer, dd, bitsize, &chunk, &bit_in_chunk));
238     prev_delta = cur_delta;
239   }
240 
241   // Write whatever is left in the chunk
242   if (bit_in_chunk < 63)
243     RETURN_NOT_OK(output_buffer->write(&chunk, sizeof(uint64_t)));
244 
245   return Status::Ok();
246 }
247 
248 template <class T>
compute_bitsize(T * in,uint64_t num,unsigned int * bitsize)249 Status DoubleDelta::compute_bitsize(
250     T* in, uint64_t num, unsigned int* bitsize) {
251   // Find maximum absolute double delta
252   *bitsize = 0;
253   int64_t max = 0;
254   if (num <= 2) {
255     *bitsize = 0;
256     return Status::Ok();
257   }
258   int64_t prev_delta = int64_t(in[1]) - int64_t(in[0]);
259   char delta_out_of_bounds = 0;
260   for (uint64_t i = 2; i < num; ++i) {
261     int64_t cur_delta = int64_t(in[i]) - int64_t(in[i - 1]);
262     int64_t dd = cur_delta - prev_delta;
263     delta_out_of_bounds |= (char)(cur_delta < 0 && prev_delta > 0 && dd > 0);
264     delta_out_of_bounds |= (char)(cur_delta > 0 && prev_delta < 0 && dd < 0);
265     max = std::max(std::abs(dd), max);
266     prev_delta = cur_delta;
267   }
268   // Handle error
269   if (delta_out_of_bounds) {
270     return LOG_STATUS(
271         Status::CompressionError("Cannot compress with DoubleDelta; Some "
272                                  "negative double delta is out of bounds"));
273   }
274   // Calculate bitsize of the maximum absolute double delta
275   do {
276     ++(*bitsize);
277     max >>= 1;
278   } while (max);
279   return Status::Ok();
280 }
281 
282 template <class T>
decompress(ConstBuffer * input_buffer,PreallocatedBuffer * output_buffer)283 Status DoubleDelta::decompress(
284     ConstBuffer* input_buffer, PreallocatedBuffer* output_buffer) {
285   // Read bitsize and number of values
286   uint8_t bitsize_c = 0;
287   uint64_t num = 0;
288   uint64_t value_size = sizeof(T);
289   RETURN_NOT_OK(input_buffer->read(&bitsize_c, sizeof(uint8_t)));
290   RETURN_NOT_OK(input_buffer->read(&num, sizeof(uint64_t)));
291   auto bitsize = static_cast<unsigned int>(bitsize_c);
292   auto out = (T*)output_buffer->cur_data();
293 
294   // Trivial case - no compression
295   if (bitsize >= sizeof(T) * 8 - 1) {
296     RETURN_NOT_OK(output_buffer->write(
297         input_buffer->cur_data(), input_buffer->nbytes_left_to_read()));
298     return Status::Ok();
299   }
300 
301   // Read first value
302   T value;
303   RETURN_NOT_OK(input_buffer->read(&value, value_size));
304   RETURN_NOT_OK(output_buffer->write(&value, value_size));
305   if (num == 1)
306     return Status::Ok();
307 
308   // Read second value
309   RETURN_NOT_OK(input_buffer->read(&value, value_size));
310   RETURN_NOT_OK(output_buffer->write(&value, value_size));
311   if (num == 2)
312     return Status::Ok();
313 
314   // Read first chunk
315   uint64_t chunk;
316   RETURN_NOT_OK(input_buffer->read(&chunk, sizeof(uint64_t)));
317   int bit_in_chunk = 63;  // Leftmost bit (MSB)
318 
319   // Decompress rest of the values
320   int64_t dd;
321   for (uint64_t i = 2; i < num; ++i) {
322     RETURN_NOT_OK(
323         read_double_delta(input_buffer, &dd, bitsize, &chunk, &bit_in_chunk));
324     value = (T)(dd + 2 * (int64_t)out[i - 1] - (int64_t)out[i - 2]);
325     RETURN_NOT_OK(output_buffer->write(&value, value_size));
326   }
327 
328   return Status::Ok();
329 }
330 
read_double_delta(ConstBuffer * buff,int64_t * double_delta,int bitsize,uint64_t * chunk,int * bit_in_chunk)331 Status DoubleDelta::read_double_delta(
332     ConstBuffer* buff,
333     int64_t* double_delta,
334     int bitsize,
335     uint64_t* chunk,
336     int* bit_in_chunk) {
337   // Read sign
338   uint64_t to_and = 1;
339   to_and <<= (*bit_in_chunk);
340   int sign = (((*chunk) & to_and) == 0) ? 1 : -1;
341   --(*bit_in_chunk);
342 
343   // Read chunk and reset
344   if (*bit_in_chunk < 0) {
345     RETURN_NOT_OK(buff->read(chunk, sizeof(uint64_t)));
346     *bit_in_chunk = 63;
347   }
348 
349   // Read double delta
350   int bits_left_to_read = bitsize;
351   int bits_to_read_from_chunk = std::min(*bit_in_chunk + 1, bits_left_to_read);
352   uint64_t tmp_chunk;
353   int bit_in_dd = bitsize - 1;
354   *double_delta = 0;
355   while (bits_left_to_read > 0) {
356     // Read from chunk
357     if (bits_to_read_from_chunk > 0) {
358       tmp_chunk = (((*chunk) << (63 - *bit_in_chunk)) >> (63 - bit_in_dd));
359       (*double_delta) |= tmp_chunk;
360       bit_in_dd -= bits_to_read_from_chunk;
361       (*bit_in_chunk) -= bits_to_read_from_chunk;
362       bits_left_to_read -= bits_to_read_from_chunk;
363     }
364 
365     // Read chunk and reset
366     if (*bit_in_chunk < 0 && buff->offset() != buff->size()) {
367       RETURN_NOT_OK(buff->read(chunk, sizeof(uint64_t)));
368       *bit_in_chunk = 63;
369       bits_to_read_from_chunk = std::min(*bit_in_chunk + 1, bits_left_to_read);
370     }
371   }
372 
373   // Set sign
374   (*double_delta) *= sign;
375 
376   return Status::Ok();
377 }
378 
write_double_delta(Buffer * buff,int64_t double_delta,int bitsize,uint64_t * chunk,int * bit_in_chunk)379 Status DoubleDelta::write_double_delta(
380     Buffer* buff,
381     int64_t double_delta,
382     int bitsize,
383     uint64_t* chunk,
384     int* bit_in_chunk) {
385   // Write sign
386   uint64_t to_or = (double_delta < 0) ? 1 : 0;
387   to_or <<= (*bit_in_chunk);
388   (*chunk) |= to_or;
389   --(*bit_in_chunk);
390 
391   // Write chunk and reset
392   if (*bit_in_chunk < 0) {
393     RETURN_NOT_OK(buff->write(chunk, sizeof(uint64_t)));
394     *bit_in_chunk = 63;
395     *chunk = 0;
396   }
397 
398   // Write rest of bits
399   int bits_left_to_write = bitsize;
400   int bit_in_dd = bitsize - 1;
401   int bits_to_fill_in_chunk = std::min((*bit_in_chunk) + 1, bits_left_to_write);
402   uint64_t abs_dd = ABS(double_delta);
403   uint64_t tmp_abs_dd;
404 
405   while (bits_left_to_write > 0) {
406     if (bits_to_fill_in_chunk > 0) {
407       tmp_abs_dd = (abs_dd << (63 - bit_in_dd));
408       tmp_abs_dd >>= (63 - (*bit_in_chunk));
409       (*chunk) |= tmp_abs_dd;
410       (*bit_in_chunk) -= bits_to_fill_in_chunk;
411       bit_in_dd -= bits_to_fill_in_chunk;
412       bits_left_to_write -= bits_to_fill_in_chunk;
413     }
414 
415     // Write chunk and reset
416     if (*bit_in_chunk < 0) {
417       RETURN_NOT_OK(buff->write(chunk, sizeof(uint64_t)));
418       *bit_in_chunk = 63;
419       *chunk = 0;
420       bits_to_fill_in_chunk = std::min((*bit_in_chunk) + 1, bits_left_to_write);
421     }
422   }
423 
424   return Status::Ok();
425 }
426 
427 // Explicit template instantiations
428 
429 template Status DoubleDelta::compress<char>(
430     ConstBuffer* input_buffer, Buffer* output_buffer);
431 template Status DoubleDelta::compress<int8_t>(
432     ConstBuffer* input_buffer, Buffer* output_buffer);
433 template Status DoubleDelta::compress<uint8_t>(
434     ConstBuffer* input_buffer, Buffer* output_buffer);
435 template Status DoubleDelta::compress<int16_t>(
436     ConstBuffer* input_buffer, Buffer* output_buffer);
437 template Status DoubleDelta::compress<uint16_t>(
438     ConstBuffer* input_buffer, Buffer* output_buffer);
439 template Status DoubleDelta::compress<int>(
440     ConstBuffer* input_buffer, Buffer* output_buffer);
441 template Status DoubleDelta::compress<uint32_t>(
442     ConstBuffer* input_buffer, Buffer* output_buffer);
443 template Status DoubleDelta::compress<int64_t>(
444     ConstBuffer* input_buffer, Buffer* output_buffer);
445 template Status DoubleDelta::compress<uint64_t>(
446     ConstBuffer* input_buffer, Buffer* output_buffer);
447 
448 template Status DoubleDelta::decompress<char>(
449     ConstBuffer* input_buffer, PreallocatedBuffer* output_buffer);
450 template Status DoubleDelta::decompress<int8_t>(
451     ConstBuffer* input_buffer, PreallocatedBuffer* output_buffer);
452 template Status DoubleDelta::decompress<uint8_t>(
453     ConstBuffer* input_buffer, PreallocatedBuffer* output_buffer);
454 template Status DoubleDelta::decompress<int16_t>(
455     ConstBuffer* input_buffer, PreallocatedBuffer* output_buffer);
456 template Status DoubleDelta::decompress<uint16_t>(
457     ConstBuffer* input_buffer, PreallocatedBuffer* output_buffer);
458 template Status DoubleDelta::decompress<int>(
459     ConstBuffer* input_buffer, PreallocatedBuffer* output_buffer);
460 template Status DoubleDelta::decompress<uint32_t>(
461     ConstBuffer* input_buffer, PreallocatedBuffer* output_buffer);
462 template Status DoubleDelta::decompress<int64_t>(
463     ConstBuffer* input_buffer, PreallocatedBuffer* output_buffer);
464 template Status DoubleDelta::decompress<uint64_t>(
465     ConstBuffer* input_buffer, PreallocatedBuffer* output_buffer);
466 
467 };  // namespace sm
468 }  // namespace tiledb
469