1 /*
2 * Copyright (C) 2015 Nicolas Bonnefon and other contributors
3 *
4 * This file is part of glogg.
5 *
6 * glogg is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * glogg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with glogg. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include <cassert>
21 #include <cstdlib>
22
23 #include "utils.h"
24
25 #include "data/compressedlinestorage.h"
26
27 #ifdef HAVE_HTONS
28 # include <arpa/inet.h>
29 #else
30 # define htons(a) glogg_htons(a)
31 #endif
32
33 namespace {
34 // Functions to manipulate blocks
35
36 // Create a new 32 bits block of the passed size,
37 // initialised at the passed position
block32_new(int block_size,uint32_t initial_position,char ** block_ptr)38 char* block32_new( int block_size, uint32_t initial_position,
39 char** block_ptr )
40 {
41 // malloc a block of the maximum possible size
42 // (where every line is >16384)
43 char* ptr = static_cast<char*>( malloc( 4 + block_size * 6 ) );
44
45 if ( ptr ) {
46 // Write the initial_position
47 *(reinterpret_cast<uint32_t*>(ptr)) = initial_position;
48 *block_ptr = ptr + 4;
49 }
50
51 return ptr;
52 }
53
54 // Create a new 64 bits block of the passed size,
55 // initialised at the passed position
block64_new(int block_size,uint64_t initial_position,char ** block_ptr)56 char* block64_new( int block_size, uint64_t initial_position,
57 char** block_ptr )
58 {
59 // malloc a block of the maximum possible size
60 // (where every line is >16384)
61 char* ptr = static_cast<char*>( malloc( 8 + block_size * 10 ) );
62
63 if ( ptr ) {
64 // Write the initial_position
65 *(reinterpret_cast<uint64_t*>(ptr)) = initial_position;
66 *block_ptr = ptr + 8;
67 }
68
69 return ptr;
70 }
71
72
73 // Add a one byte relative delta (0-127) and inc pointer
74 // First bit is always 0
block_add_one_byte_relative(char ** ptr,uint8_t value)75 void block_add_one_byte_relative( char** ptr, uint8_t value )
76 {
77 **ptr = value;
78 *ptr += sizeof( value );
79 }
80
81 // Add a two bytes relative delta (0-16383) and inc pointer
82 // First 2 bits are always 10
block_add_two_bytes_relative(char ** ptr,uint16_t value)83 void block_add_two_bytes_relative( char** ptr, uint16_t value )
84 {
85 // Stored in big endian format in order to recognise the initial pattern:
86 // 10xx xxxx xxxx xxxx
87 // HO byte | LO byte
88 *(reinterpret_cast<uint16_t*>(*ptr)) = htons( value | (1 << 15) );
89 *ptr += sizeof( value );
90 }
91
92 // Add a signal and a 4 bytes absolute position and inc pointer
block32_add_absolute(char ** ptr,uint32_t value)93 void block32_add_absolute( char** ptr, uint32_t value )
94 {
95 // 2 bytes marker (actually only the first two bits are tested)
96 *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF;
97 *ptr += sizeof( uint16_t );
98 // Absolute value (machine endian)
99 *(reinterpret_cast<uint32_t*>(*ptr)) = value;
100 *ptr += sizeof( uint32_t );
101 }
102
103 // Initialise the passed block for reading, returning
104 // the initial position and a pointer to the second entry.
block32_initial_pos(char * block,char ** ptr)105 uint64_t block32_initial_pos( char* block, char** ptr )
106 {
107 *ptr = block + sizeof( uint32_t );
108 return *(reinterpret_cast<uint32_t*>(block));
109 }
110
111 // Give the next position in the block based on the previous
112 // position, then increase the pointer.
block32_next_pos(char ** ptr,uint64_t previous_pos)113 uint64_t block32_next_pos( char** ptr, uint64_t previous_pos )
114 {
115 uint64_t pos = previous_pos;
116
117 uint8_t byte = **ptr;
118 ++(*ptr);
119 if ( ! ( byte & 0x80 ) ) {
120 // High order bit is 0
121 pos += byte;
122 }
123 else if ( ( byte & 0xC0 ) == 0x80 ) {
124 // We need to read the low order byte
125 uint8_t lo_byte = **ptr;
126 ++(*ptr);
127 // Remove the starting 10b
128 byte &= ~0xC0;
129 // And form the displacement (stored as big endian)
130 pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte;
131 }
132 else {
133 // Skip the next byte (not used)
134 ++(*ptr);
135 // And read the new absolute pos (machine endian)
136 pos = *(reinterpret_cast<uint32_t*>(*ptr));
137 *ptr += sizeof( uint32_t );
138 }
139
140 return pos;
141 }
142
143 // Add a signal and a 8 bytes absolute position and inc pointer
block64_add_absolute(char ** ptr,uint64_t value)144 void block64_add_absolute( char** ptr, uint64_t value )
145 {
146 // This is unaligned, can cause problem on some CPUs
147
148 // 2 bytes marker (actually only the first two bits are tested)
149 *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF;
150 *ptr += sizeof( uint16_t );
151 // Absolute value (machine endian)
152 *(reinterpret_cast<uint64_t*>(*ptr)) = value;
153 *ptr += sizeof( uint64_t );
154 }
155
156 // Initialise the passed block for reading, returning
157 // the initial position and a pointer to the second entry.
block64_initial_pos(char * block,char ** ptr)158 uint64_t block64_initial_pos( char* block, char** ptr )
159 {
160 *ptr = block + sizeof( uint64_t );
161 return *(reinterpret_cast<uint64_t*>(block));
162 }
163
164 // Give the next position in the block based on the previous
165 // position, then increase the pointer.
block64_next_pos(char ** ptr,uint64_t previous_pos)166 uint64_t block64_next_pos( char** ptr, uint64_t previous_pos )
167 {
168 uint64_t pos = previous_pos;
169
170 uint8_t byte = **ptr;
171 ++(*ptr);
172 if ( ! ( byte & 0x80 ) ) {
173 // High order bit is 0
174 pos += byte;
175 }
176 else if ( ( byte & 0xC0 ) == 0x80 ) {
177 // We need to read the low order byte
178 uint8_t lo_byte = **ptr;
179 ++(*ptr);
180 // Remove the starting 10b
181 byte &= ~0xC0;
182 // And form the displacement (stored as big endian)
183 pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte;
184 }
185 else {
186 // Skip the next byte (not used)
187 ++(*ptr);
188 // And read the new absolute pos (machine endian)
189 pos = *(reinterpret_cast<uint64_t*>(*ptr));
190 *ptr += sizeof( uint64_t );
191 }
192
193 return pos;
194 }
195 }
196
move_from(CompressedLinePositionStorage && orig)197 void CompressedLinePositionStorage::move_from(
198 CompressedLinePositionStorage&& orig )
199 {
200 nb_lines_ = orig.nb_lines_;
201 first_long_line_ = orig.first_long_line_;
202 current_pos_ = orig.current_pos_;
203 block_pointer_ = orig.block_pointer_;
204 previous_block_pointer_ = orig.previous_block_pointer_;
205
206 orig.nb_lines_ = 0;
207 }
208
209 // Move constructor
CompressedLinePositionStorage(CompressedLinePositionStorage && orig)210 CompressedLinePositionStorage::CompressedLinePositionStorage(
211 CompressedLinePositionStorage&& orig )
212 : block32_index_( std::move( orig.block32_index_ ) ),
213 block64_index_( std::move( orig.block64_index_ ) )
214 {
215 move_from( std::move( orig ) );
216 }
217
free_blocks()218 void CompressedLinePositionStorage::free_blocks()
219 {
220 for ( char* block : block32_index_ ) {
221 void* p = static_cast<void*>( block );
222 free( p );
223 }
224
225 for ( char* block : block64_index_ ) {
226 void* p = static_cast<void*>( block );
227 free( p );
228 }
229 }
230
231 // Move assignement
operator =(CompressedLinePositionStorage && orig)232 CompressedLinePositionStorage& CompressedLinePositionStorage::operator=(
233 CompressedLinePositionStorage&& orig )
234 {
235 free_blocks();
236
237 block32_index_ = std::move( orig.block32_index_ );
238 block64_index_ = std::move( orig.block64_index_ );
239 move_from( std::move( orig ) );
240
241 return *this;
242 }
243
~CompressedLinePositionStorage()244 CompressedLinePositionStorage::~CompressedLinePositionStorage()
245 {
246 free_blocks();
247 }
248
249 // template<int BLOCK_SIZE>
append(uint64_t pos)250 void CompressedLinePositionStorage::append( uint64_t pos )
251 {
252 // Lines must be stored in order
253 assert( ( pos > current_pos_ ) || ( pos == 0 ) );
254
255 // Save the pointer in case we need to "pop_back"
256 previous_block_pointer_ = block_pointer_;
257
258 bool store_in_big = false;
259 if ( pos > UINT32_MAX ) {
260 store_in_big = true;
261 if ( first_long_line_ == UINT32_MAX ) {
262 // First "big" end of line, we will start a new (64) block
263 first_long_line_ = nb_lines_;
264 block_pointer_ = nullptr;
265 }
266 }
267
268 if ( ! block_pointer_ ) {
269 // We need to start a new block
270 if ( ! store_in_big )
271 block32_index_.push_back(
272 block32_new( BLOCK_SIZE, pos, &block_pointer_ ) );
273 else
274 block64_index_.push_back(
275 block64_new( BLOCK_SIZE, pos, &block_pointer_ ) );
276 }
277 else {
278 uint64_t delta = pos - current_pos_;
279 if ( delta < 128 ) {
280 // Code relative on one byte
281 block_add_one_byte_relative( &block_pointer_, delta );
282 }
283 else if ( delta < 16384 ) {
284 // Code relative on two bytes
285 block_add_two_bytes_relative( &block_pointer_, delta );
286 }
287 else {
288 // Code absolute
289 if ( ! store_in_big )
290 block32_add_absolute( &block_pointer_, pos );
291 else
292 block64_add_absolute( &block_pointer_, pos );
293 }
294 }
295
296 current_pos_ = pos;
297 ++nb_lines_;
298
299 if ( ! store_in_big ) {
300 if ( nb_lines_ % BLOCK_SIZE == 0 ) {
301 // We have finished the block
302
303 // Let's reduce its size to what is actually used
304 int block_index = nb_lines_ / BLOCK_SIZE - 1;
305 char* block = block32_index_[block_index];
306 // We allocate extra space for the last element in case it
307 // is replaced by an absolute value in the future (following a pop_back)
308 size_t new_size = ( previous_block_pointer_
309 + sizeof( uint16_t ) + sizeof( uint32_t ) ) - block;
310 void* new_location = realloc( block, new_size );
311 if ( new_location )
312 block32_index_[block_index] = static_cast<char*>( new_location );
313
314 block_pointer_ = nullptr;
315 }
316 }
317 else {
318 if ( ( nb_lines_ - first_long_line_ ) % BLOCK_SIZE == 0 ) {
319 // We have finished the block
320
321 // Let's reduce its size to what is actually used
322 int block_index = ( nb_lines_ - first_long_line_ ) / BLOCK_SIZE - 1;
323 char* block = block64_index_[block_index];
324 // We allocate extra space for the last element in case it
325 // is replaced by an absolute value in the future (following a pop_back)
326 size_t new_size = ( previous_block_pointer_
327 + sizeof( uint16_t ) + sizeof( uint64_t ) ) - block;
328 void* new_location = realloc( block, new_size );
329 if ( new_location )
330 block64_index_[block_index] = static_cast<char*>( new_location );
331
332 block_pointer_ = nullptr;
333 }
334 }
335 }
336
337 // template<int BLOCK_SIZE>
at(uint32_t index) const338 uint64_t CompressedLinePositionStorage::at( uint32_t index ) const
339 {
340 char* ptr;
341 uint64_t position;
342
343 Cache* last_read = last_read_.getPtr();
344
345 if ( index < first_long_line_ ) {
346 if ( ( index == last_read->index + 1 ) && ( index % BLOCK_SIZE != 0 ) ) {
347 position = last_read->position;
348 ptr = last_read->ptr;
349
350 position = block32_next_pos( &ptr, position );
351 }
352 else {
353 char* block = block32_index_[ index / BLOCK_SIZE ];
354 position = block32_initial_pos( block, &ptr );
355
356 for ( uint32_t i = 0; i < index % BLOCK_SIZE; i++ ) {
357 // Go through all the lines in the block till the one we want
358 position = block32_next_pos( &ptr, position );
359 }
360 }
361 }
362 else {
363 const uint32_t index_in_64 = index - first_long_line_;
364 if ( ( index == last_read->index + 1 ) && ( index_in_64 % BLOCK_SIZE != 0 ) ) {
365 position = last_read->position;
366 ptr = last_read->ptr;
367
368 position = block64_next_pos( &ptr, position );
369 }
370 else {
371 char* block = block64_index_[ index_in_64 / BLOCK_SIZE ];
372 position = block64_initial_pos( block, &ptr );
373
374 for ( uint32_t i = 0; i < index_in_64 % BLOCK_SIZE; i++ ) {
375 // Go through all the lines in the block till the one we want
376 position = block64_next_pos( &ptr, position );
377 }
378 }
379 }
380
381 // Populate our cache ready for next consecutive read
382 last_read->index = index;
383 last_read->position = position;
384 last_read->ptr = ptr;
385
386 return position;
387 }
388
append_list(const std::vector<uint64_t> & positions)389 void CompressedLinePositionStorage::append_list(
390 const std::vector<uint64_t>& positions )
391 {
392 // This is not very clever, but caching should make it
393 // reasonably fast.
394 for ( uint32_t i = 0; i < positions.size(); i++ )
395 append( positions.at( i ) );
396 }
397
398 // template<int BLOCK_SIZE>
pop_back()399 void CompressedLinePositionStorage::pop_back()
400 {
401 // Removing the last entered data, there are two cases
402 if ( previous_block_pointer_ ) {
403 // The last append was a normal entry in an existing block,
404 // so we can just revert the pointer
405 block_pointer_ = previous_block_pointer_;
406 previous_block_pointer_ = nullptr;
407 }
408 else {
409 // A new block has been created for the last entry, we need
410 // to de-alloc it.
411
412 if ( first_long_line_ == UINT32_MAX ) {
413 // If we try to pop_back() twice, we're dead!
414 assert( ( nb_lines_ - 1 ) % BLOCK_SIZE == 0 );
415
416 char* block = block32_index_.back();
417 block32_index_.pop_back();
418 free( block );
419 }
420 else {
421 // If we try to pop_back() twice, we're dead!
422 assert( ( nb_lines_ - first_long_line_ - 1 ) % BLOCK_SIZE == 0 );
423
424 char* block = block64_index_.back();
425 block64_index_.pop_back();
426 free( block );
427 }
428
429 block_pointer_ = nullptr;
430 }
431
432 --nb_lines_;
433 current_pos_ = at( nb_lines_ - 1 );
434 }
435