1 /*
2  * Copyright (C) 2015 Nicolas Bonnefon and other contributors
3  *
4  * This file is part of glogg.
5  *
6  * glogg is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * glogg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include <cassert>
21 #include <cstdlib>
22 
23 #include "utils.h"
24 
25 #include "data/compressedlinestorage.h"
26 
27 #ifdef HAVE_HTONS
28 #  include <arpa/inet.h>
29 #else
30 #  define htons(a) glogg_htons(a)
31 #endif
32 
33 namespace {
34     // Functions to manipulate blocks
35 
36     // Create a new 32 bits block of the passed size,
37     // initialised at the passed position
block32_new(int block_size,uint32_t initial_position,char ** block_ptr)38     char* block32_new( int block_size, uint32_t initial_position,
39            char** block_ptr )
40     {
41         // malloc a block of the maximum possible size
42         // (where every line is >16384)
43         char* ptr = static_cast<char*>( malloc( 4 + block_size * 6 ) );
44 
45         if ( ptr ) {
46             // Write the initial_position
47             *(reinterpret_cast<uint32_t*>(ptr)) = initial_position;
48             *block_ptr = ptr + 4;
49         }
50 
51         return ptr;
52     }
53 
54     // Create a new 64 bits block of the passed size,
55     // initialised at the passed position
block64_new(int block_size,uint64_t initial_position,char ** block_ptr)56     char* block64_new( int block_size, uint64_t initial_position,
57            char** block_ptr )
58     {
59         // malloc a block of the maximum possible size
60         // (where every line is >16384)
61         char* ptr = static_cast<char*>( malloc( 8 + block_size * 10 ) );
62 
63         if ( ptr ) {
64             // Write the initial_position
65             *(reinterpret_cast<uint64_t*>(ptr)) = initial_position;
66             *block_ptr = ptr + 8;
67         }
68 
69         return ptr;
70     }
71 
72 
73     // Add a one byte relative delta (0-127) and inc pointer
74     // First bit is always 0
block_add_one_byte_relative(char ** ptr,uint8_t value)75     void block_add_one_byte_relative( char** ptr, uint8_t value )
76     {
77         **ptr = value;
78         *ptr += sizeof( value );
79     }
80 
81     // Add a two bytes relative delta (0-16383) and inc pointer
82     // First 2 bits are always 10
block_add_two_bytes_relative(char ** ptr,uint16_t value)83     void block_add_two_bytes_relative( char** ptr, uint16_t value )
84     {
85         // Stored in big endian format in order to recognise the initial pattern:
86         // 10xx xxxx xxxx xxxx
87         //  HO byte | LO byte
88         *(reinterpret_cast<uint16_t*>(*ptr)) = htons( value | (1 << 15) );
89         *ptr += sizeof( value );
90     }
91 
92     // Add a signal and a 4 bytes absolute position and inc pointer
block32_add_absolute(char ** ptr,uint32_t value)93     void block32_add_absolute( char** ptr, uint32_t value )
94     {
95         // 2 bytes marker (actually only the first two bits are tested)
96         *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF;
97         *ptr += sizeof( uint16_t );
98         // Absolute value (machine endian)
99         *(reinterpret_cast<uint32_t*>(*ptr)) = value;
100         *ptr += sizeof( uint32_t );
101     }
102 
103     // Initialise the passed block for reading, returning
104     // the initial position and a pointer to the second entry.
block32_initial_pos(char * block,char ** ptr)105     uint64_t block32_initial_pos( char* block, char** ptr )
106     {
107         *ptr = block + sizeof( uint32_t );
108         return *(reinterpret_cast<uint32_t*>(block));
109     }
110 
111     // Give the next position in the block based on the previous
112     // position, then increase the pointer.
block32_next_pos(char ** ptr,uint64_t previous_pos)113     uint64_t block32_next_pos( char** ptr, uint64_t previous_pos )
114     {
115         uint64_t pos = previous_pos;
116 
117         uint8_t byte = **ptr;
118         ++(*ptr);
119         if ( ! ( byte & 0x80 ) ) {
120             // High order bit is 0
121             pos += byte;
122         }
123         else if ( ( byte & 0xC0 ) == 0x80 ) {
124             // We need to read the low order byte
125             uint8_t lo_byte = **ptr;
126             ++(*ptr);
127             // Remove the starting 10b
128             byte &= ~0xC0;
129             // And form the displacement (stored as big endian)
130             pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte;
131         }
132         else {
133             // Skip the next byte (not used)
134             ++(*ptr);
135             // And read the new absolute pos (machine endian)
136             pos = *(reinterpret_cast<uint32_t*>(*ptr));
137             *ptr += sizeof( uint32_t );
138         }
139 
140         return pos;
141     }
142 
143     // Add a signal and a 8 bytes absolute position and inc pointer
block64_add_absolute(char ** ptr,uint64_t value)144     void block64_add_absolute( char** ptr, uint64_t value )
145     {
146         // This is unaligned, can cause problem on some CPUs
147 
148         // 2 bytes marker (actually only the first two bits are tested)
149         *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF;
150         *ptr += sizeof( uint16_t );
151         // Absolute value (machine endian)
152         *(reinterpret_cast<uint64_t*>(*ptr)) = value;
153         *ptr += sizeof( uint64_t );
154     }
155 
156     // Initialise the passed block for reading, returning
157     // the initial position and a pointer to the second entry.
block64_initial_pos(char * block,char ** ptr)158     uint64_t block64_initial_pos( char* block, char** ptr )
159     {
160         *ptr = block + sizeof( uint64_t );
161         return *(reinterpret_cast<uint64_t*>(block));
162     }
163 
164     // Give the next position in the block based on the previous
165     // position, then increase the pointer.
block64_next_pos(char ** ptr,uint64_t previous_pos)166     uint64_t block64_next_pos( char** ptr, uint64_t previous_pos )
167     {
168         uint64_t pos = previous_pos;
169 
170         uint8_t byte = **ptr;
171         ++(*ptr);
172         if ( ! ( byte & 0x80 ) ) {
173             // High order bit is 0
174             pos += byte;
175         }
176         else if ( ( byte & 0xC0 ) == 0x80 ) {
177             // We need to read the low order byte
178             uint8_t lo_byte = **ptr;
179             ++(*ptr);
180             // Remove the starting 10b
181             byte &= ~0xC0;
182             // And form the displacement (stored as big endian)
183             pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte;
184         }
185         else {
186             // Skip the next byte (not used)
187             ++(*ptr);
188             // And read the new absolute pos (machine endian)
189             pos = *(reinterpret_cast<uint64_t*>(*ptr));
190             *ptr += sizeof( uint64_t );
191         }
192 
193         return pos;
194     }
195 }
196 
move_from(CompressedLinePositionStorage && orig)197 void CompressedLinePositionStorage::move_from(
198         CompressedLinePositionStorage&& orig )
199 {
200     nb_lines_        = orig.nb_lines_;
201     first_long_line_ = orig.first_long_line_;
202     current_pos_     = orig.current_pos_;
203     block_pointer_   = orig.block_pointer_;
204     previous_block_pointer_ = orig.previous_block_pointer_;
205 
206     orig.nb_lines_   = 0;
207 }
208 
209 // Move constructor
CompressedLinePositionStorage(CompressedLinePositionStorage && orig)210 CompressedLinePositionStorage::CompressedLinePositionStorage(
211         CompressedLinePositionStorage&& orig )
212     : block32_index_( std::move( orig.block32_index_ ) ),
213       block64_index_( std::move( orig.block64_index_ ) )
214 {
215     move_from( std::move( orig ) );
216 }
217 
free_blocks()218 void CompressedLinePositionStorage::free_blocks()
219 {
220     for ( char* block : block32_index_ ) {
221         void* p = static_cast<void*>( block );
222         free( p );
223     }
224 
225     for ( char* block : block64_index_ ) {
226         void* p = static_cast<void*>( block );
227         free( p );
228     }
229 }
230 
231 // Move assignement
operator =(CompressedLinePositionStorage && orig)232 CompressedLinePositionStorage& CompressedLinePositionStorage::operator=(
233         CompressedLinePositionStorage&& orig )
234 {
235     free_blocks();
236 
237     block32_index_ = std::move( orig.block32_index_ );
238     block64_index_ = std::move( orig.block64_index_ );
239     move_from( std::move( orig ) );
240 
241     return *this;
242 }
243 
~CompressedLinePositionStorage()244 CompressedLinePositionStorage::~CompressedLinePositionStorage()
245 {
246     free_blocks();
247 }
248 
249 // template<int BLOCK_SIZE>
append(uint64_t pos)250 void CompressedLinePositionStorage::append( uint64_t pos )
251 {
252     // Lines must be stored in order
253     assert( ( pos > current_pos_ ) || ( pos == 0 ) );
254 
255     // Save the pointer in case we need to "pop_back"
256     previous_block_pointer_ = block_pointer_;
257 
258     bool store_in_big = false;
259     if ( pos > UINT32_MAX ) {
260         store_in_big = true;
261         if ( first_long_line_ == UINT32_MAX ) {
262             // First "big" end of line, we will start a new (64) block
263             first_long_line_ = nb_lines_;
264             block_pointer_ = nullptr;
265         }
266     }
267 
268     if ( ! block_pointer_ ) {
269         // We need to start a new block
270         if ( ! store_in_big )
271             block32_index_.push_back(
272                 block32_new( BLOCK_SIZE, pos, &block_pointer_ ) );
273         else
274             block64_index_.push_back(
275                 block64_new( BLOCK_SIZE, pos, &block_pointer_ ) );
276     }
277     else {
278         uint64_t delta = pos - current_pos_;
279         if ( delta < 128 ) {
280             // Code relative on one byte
281             block_add_one_byte_relative( &block_pointer_, delta );
282         }
283         else if ( delta < 16384 ) {
284             // Code relative on two bytes
285             block_add_two_bytes_relative( &block_pointer_, delta );
286         }
287         else {
288             // Code absolute
289             if ( ! store_in_big )
290                 block32_add_absolute( &block_pointer_, pos );
291             else
292                 block64_add_absolute( &block_pointer_, pos );
293         }
294     }
295 
296     current_pos_ = pos;
297     ++nb_lines_;
298 
299     if ( ! store_in_big ) {
300         if ( nb_lines_ % BLOCK_SIZE == 0 ) {
301             // We have finished the block
302 
303             // Let's reduce its size to what is actually used
304             int block_index = nb_lines_ / BLOCK_SIZE - 1;
305             char* block = block32_index_[block_index];
306             // We allocate extra space for the last element in case it
307             // is replaced by an absolute value in the future (following a pop_back)
308             size_t new_size = ( previous_block_pointer_
309                     + sizeof( uint16_t ) + sizeof( uint32_t ) ) - block;
310             void* new_location = realloc( block, new_size );
311             if ( new_location )
312                 block32_index_[block_index] = static_cast<char*>( new_location );
313 
314             block_pointer_ = nullptr;
315         }
316     }
317     else {
318         if ( ( nb_lines_ - first_long_line_ ) % BLOCK_SIZE == 0 ) {
319             // We have finished the block
320 
321             // Let's reduce its size to what is actually used
322             int block_index = ( nb_lines_ - first_long_line_ ) / BLOCK_SIZE - 1;
323             char* block = block64_index_[block_index];
324             // We allocate extra space for the last element in case it
325             // is replaced by an absolute value in the future (following a pop_back)
326             size_t new_size = ( previous_block_pointer_
327                     + sizeof( uint16_t ) + sizeof( uint64_t ) ) - block;
328             void* new_location = realloc( block, new_size );
329             if ( new_location )
330                 block64_index_[block_index] = static_cast<char*>( new_location );
331 
332             block_pointer_ = nullptr;
333         }
334     }
335 }
336 
337 // template<int BLOCK_SIZE>
at(uint32_t index) const338 uint64_t CompressedLinePositionStorage::at( uint32_t index ) const
339 {
340     char* ptr;
341     uint64_t position;
342 
343     Cache* last_read = last_read_.getPtr();
344 
345     if ( index < first_long_line_ ) {
346         if ( ( index == last_read->index + 1 ) && ( index % BLOCK_SIZE != 0 ) ) {
347             position = last_read->position;
348             ptr      = last_read->ptr;
349 
350             position = block32_next_pos( &ptr, position );
351         }
352         else {
353             char* block = block32_index_[ index / BLOCK_SIZE ];
354             position = block32_initial_pos( block, &ptr );
355 
356             for ( uint32_t i = 0; i < index % BLOCK_SIZE; i++ ) {
357                 // Go through all the lines in the block till the one we want
358                 position = block32_next_pos( &ptr, position );
359             }
360         }
361     }
362     else {
363         const uint32_t index_in_64 = index - first_long_line_;
364         if ( ( index == last_read->index + 1 ) && ( index_in_64 % BLOCK_SIZE != 0 ) ) {
365             position = last_read->position;
366             ptr      = last_read->ptr;
367 
368             position = block64_next_pos( &ptr, position );
369         }
370         else {
371             char* block = block64_index_[ index_in_64 / BLOCK_SIZE ];
372             position = block64_initial_pos( block, &ptr );
373 
374             for ( uint32_t i = 0; i < index_in_64 % BLOCK_SIZE; i++ ) {
375                 // Go through all the lines in the block till the one we want
376                 position = block64_next_pos( &ptr, position );
377             }
378         }
379     }
380 
381     // Populate our cache ready for next consecutive read
382     last_read->index    = index;
383     last_read->position = position;
384     last_read->ptr      = ptr;
385 
386     return position;
387 }
388 
append_list(const std::vector<uint64_t> & positions)389 void CompressedLinePositionStorage::append_list(
390         const std::vector<uint64_t>& positions )
391 {
392     // This is not very clever, but caching should make it
393     // reasonably fast.
394     for ( uint32_t i = 0; i < positions.size(); i++ )
395         append( positions.at( i ) );
396 }
397 
398 // template<int BLOCK_SIZE>
pop_back()399 void CompressedLinePositionStorage::pop_back()
400 {
401     // Removing the last entered data, there are two cases
402     if ( previous_block_pointer_ ) {
403         // The last append was a normal entry in an existing block,
404         // so we can just revert the pointer
405         block_pointer_ = previous_block_pointer_;
406         previous_block_pointer_ = nullptr;
407     }
408     else {
409         // A new block has been created for the last entry, we need
410         // to de-alloc it.
411 
412         if ( first_long_line_ == UINT32_MAX ) {
413             // If we try to pop_back() twice, we're dead!
414             assert( ( nb_lines_ - 1 ) % BLOCK_SIZE == 0 );
415 
416             char* block = block32_index_.back();
417             block32_index_.pop_back();
418             free( block );
419         }
420         else {
421             // If we try to pop_back() twice, we're dead!
422             assert( ( nb_lines_ - first_long_line_ - 1 ) % BLOCK_SIZE == 0 );
423 
424             char* block = block64_index_.back();
425             block64_index_.pop_back();
426             free( block );
427         }
428 
429         block_pointer_ = nullptr;
430     }
431 
432     --nb_lines_;
433     current_pos_ = at( nb_lines_ - 1 );
434 }
435