1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #include "lookup_reader.h"
28 #include "file_printer.h"
29 #include "helper.h"
30
31 #include <klib/printf.h>
32 #include <kfs/file.h>
33 #include <kfs/buffile.h>
34
35 #include <string.h>
36 #include <stdio.h>
37
38 typedef struct lookup_reader
39 {
40 const struct KFile * f;
41 const struct index_reader * index;
42 SBuffer buf;
43 uint64_t pos, f_size, max_key;
44 } lookup_reader;
45
46
release_lookup_reader(struct lookup_reader * self)47 void release_lookup_reader( struct lookup_reader * self )
48 {
49 if ( NULL != self )
50 {
51 if ( NULL != self -> f )
52 {
53 rc_t rc = KFileRelease( self -> f );
54 if ( 0 != rc )
55 {
56 ErrMsg( "release_lookup_reader().KFileRelease() -> %R", rc );
57 }
58 }
59 release_SBuffer( &( self -> buf ) ); /* helper.c */
60 free( ( void * ) self );
61 }
62 }
63
make_lookup_reader_obj(struct lookup_reader ** reader,const struct index_reader * index,const struct KFile * f)64 static rc_t make_lookup_reader_obj( struct lookup_reader ** reader,
65 const struct index_reader * index,
66 const struct KFile * f )
67 {
68 rc_t rc = 0;
69 lookup_reader * r = calloc( 1, sizeof * r );
70 if ( NULL == r )
71 {
72 rc = RC( rcVDB, rcNoTarg, rcConstructing, rcMemory, rcExhausted );
73 ErrMsg( "make_lookup_reader_obj().calloc( %d ) -> %R", ( sizeof * r ), rc );
74 }
75 else
76 {
77 r -> f = f;
78 r -> index = index;
79 rc = KFileSize( f, & r -> f_size );
80 if ( 0 == rc )
81 {
82 rc = make_SBuffer( &( r -> buf ), 4096 ); /* helper.c */
83 }
84 else
85 {
86 ErrMsg( "make_lookup_reader_obj().KFileSize() -> %R", rc );
87 }
88
89 if ( 0 == rc && NULL != index )
90 {
91 rc = get_max_key( index, & r -> max_key ); /* index.c */
92 }
93
94 if ( 0 == rc )
95 {
96 *reader = r;
97 }
98 else
99 {
100 release_lookup_reader( r );
101 }
102 }
103 return rc;
104 }
105
make_lookup_reader(const KDirectory * dir,const struct index_reader * index,struct lookup_reader ** reader,size_t buf_size,const char * fmt,...)106 rc_t make_lookup_reader( const KDirectory *dir, const struct index_reader * index,
107 struct lookup_reader ** reader, size_t buf_size, const char * fmt, ... )
108 {
109 rc_t rc;
110 const struct KFile * f = NULL;
111
112 va_list args;
113 va_start ( args, fmt );
114 rc = KDirectoryVOpenFileRead( dir, &f, fmt, args );
115 va_end ( args );
116
117 if ( 0 != rc )
118 {
119 ErrMsg( "make_lookup_reader().KDirectoryVOpenFileRead( '?' ) -> %R", rc );
120 }
121 else
122 {
123 if ( buf_size > 0 )
124 {
125 const struct KFile * temp_file = NULL;
126 rc = KBufFileMakeRead( &temp_file, f, buf_size );
127 if ( 0 != rc )
128 {
129 ErrMsg( "make_lookup_reader().KBufFileMakeRead() -> %R", rc );
130 }
131 else
132 {
133 rc = KFileRelease( f );
134 if ( 0 != rc )
135 {
136 ErrMsg( "make_lookup_reader().KFileRelease() -> %R", rc );
137 }
138 else
139 {
140 f = temp_file;
141 }
142 }
143 }
144
145 if ( 0 == rc )
146 {
147 rc = make_lookup_reader_obj( reader, index, f );
148 }
149 }
150 return rc;
151 }
152
153
read_key_and_len(struct lookup_reader * self,uint64_t pos,uint64_t * key,size_t * len)154 static rc_t read_key_and_len( struct lookup_reader * self, uint64_t pos, uint64_t *key, size_t *len )
155 {
156 size_t num_read;
157 uint8_t buffer[ 10 ];
158 rc_t rc = KFileReadAll( self -> f, pos, buffer, sizeof buffer, &num_read );
159 if ( rc != 0 )
160 {
161 ErrMsg( "read_key_and_len().KFileReadAll( at %ld, to_read %u ) -> %R", pos, sizeof buffer, rc );
162 }
163 else if ( num_read != sizeof buffer )
164 {
165 if ( 0 == num_read )
166 {
167 rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcId, rcNotFound );
168 }
169 else
170 {
171 rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcFormat, rcInvalid );
172 }
173 }
174 else
175 {
176 uint16_t dna_len;
177 size_t packed_len;
178 memmove( key, buffer, sizeof *key );
179 dna_len = buffer[ 8 ];
180 dna_len <<= 8;
181 dna_len |= buffer[ 9 ];
182 packed_len = ( dna_len & 1 ) ? ( dna_len + 1 ) >> 1 : dna_len >> 1;
183 *len = ( ( sizeof *key ) + ( sizeof dna_len ) + packed_len );
184 }
185 return rc;
186 }
187
188
/* Tells whether key2 satisfies a search for key1.
   That is the case if both are identical, or if key1 is even and
   key2 is its direct successor ( key1 + 1 ). */
static bool keys_equal( uint64_t key1, uint64_t key2 )
{
    return ( key1 == key2 )
        || ( 0 == ( key1 & 0x01 ) && ( key1 + 1 ) == key2 );
}
198
loop_until_key_found(struct lookup_reader * self,uint64_t key_to_find,uint64_t * key_found,uint64_t * offset)199 static rc_t loop_until_key_found( struct lookup_reader * self, uint64_t key_to_find,
200 uint64_t *key_found , uint64_t *offset )
201 {
202 rc_t rc = 0;
203 bool done = false;
204 uint64_t curr = *offset;
205 while ( !done && 0 == rc )
206 {
207 size_t found_len;
208 rc = read_key_and_len( self, curr, key_found, &found_len );
209 if ( keys_equal( key_to_find, *key_found ) )
210 {
211 done = true;
212 *offset = curr;
213 }
214 else if ( key_to_find > *key_found )
215 {
216 curr += found_len;
217 }
218 else
219 {
220 done = true;
221 rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcId, rcNotFound );
222 }
223 }
224 return rc;
225 }
226
227
full_table_seek(struct lookup_reader * self,uint64_t key_to_find,uint64_t * key_found)228 static rc_t full_table_seek( struct lookup_reader * self, uint64_t key_to_find, uint64_t * key_found )
229 {
230 /* we have no index! search the whole thing... */
231 uint64_t offset = 0;
232 rc_t rc = loop_until_key_found( self, key_to_find, key_found, &offset );
233 if ( 0 == rc )
234 {
235 if ( keys_equal( key_to_find, *key_found ) )
236 {
237 self -> pos = offset;
238 }
239 else
240 {
241 rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcId, rcNotFound );
242 ErrMsg( "lookup_reader.c full_table_seek( key: %ld ) -> %R", key_to_find, rc );
243 }
244 }
245 return rc;
246 }
247
248
indexed_seek(struct lookup_reader * self,uint64_t key_to_find,uint64_t * key_found,bool exactly)249 static rc_t indexed_seek( struct lookup_reader * self, uint64_t key_to_find, uint64_t * key_found, bool exactly )
250 {
251 /* we have a index! find set pos to the found offset */
252 rc_t rc;
253 uint64_t offset = 0;
254 if ( self -> max_key > 0 && ( key_to_find > self -> max_key ) )
255 {
256 rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcId, rcTooBig );
257 /* ErrMsg( "lookup_reader.c indexed_seek( key_to_find=%lu, max_key=%lu ) -> %R", key_to_find, self -> max_key, rc ); */
258 }
259 else
260 {
261 rc = get_nearest_offset( self -> index, key_to_find, key_found, &offset ); /* in index.c */
262 if ( 0 == rc )
263 {
264 if ( keys_equal( key_to_find, *key_found ) )
265 {
266 self -> pos = offset;
267 }
268 else
269 {
270 if ( exactly )
271 {
272 rc = loop_until_key_found( self, key_to_find, key_found, &offset );
273 if ( 0 == rc )
274 {
275 if ( keys_equal( key_to_find, *key_found ) )
276 {
277 self -> pos = offset;
278 }
279 else
280 {
281 rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcId, rcNotFound );
282 }
283 }
284 else
285 {
286 rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcId, rcNotFound );
287 }
288 }
289 else
290 {
291 self -> pos = offset;
292 rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcId, rcNotFound );
293 }
294 }
295 }
296 }
297 return rc;
298 }
299
300
seek_lookup_reader(struct lookup_reader * self,uint64_t key_to_find,uint64_t * key_found,bool exactly)301 rc_t seek_lookup_reader( struct lookup_reader * self, uint64_t key_to_find, uint64_t * key_found, bool exactly )
302 {
303 rc_t rc = 0;
304 if ( NULL == self || NULL == key_found )
305 {
306 rc = RC( rcVDB, rcNoTarg, rcReading, rcParam, rcInvalid );
307 ErrMsg( "lookup_reader.c seek_lookup_reader() -> %R", rc );
308 }
309 else
310 {
311 if ( NULL != self -> index )
312 {
313 rc = indexed_seek( self, key_to_find, key_found, exactly );
314 if ( 0 != rc )
315 {
316 rc = full_table_seek( self, key_to_find, key_found );
317 }
318 }
319 else
320 {
321 rc = full_table_seek( self, key_to_find, key_found );
322 }
323 }
324 return rc;
325 }
326
327
lookup_reader_get(struct lookup_reader * self,uint64_t * key,SBuffer * packed_bases)328 rc_t lookup_reader_get( struct lookup_reader * self, uint64_t * key, SBuffer * packed_bases )
329 {
330 rc_t rc = 0;
331 if ( NULL == self || NULL == key || NULL == packed_bases )
332 {
333 rc = RC( rcVDB, rcNoTarg, rcReading, rcParam, rcInvalid );
334 ErrMsg( "lookup_reader_get() #invalid input# -> %R", rc );
335 }
336 else
337 {
338 if ( self -> pos >= ( self -> f_size - 1 ) )
339 {
340 rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcFormat, rcInvalid );
341 }
342 else
343 {
344 size_t num_read;
345 uint8_t buffer1[ 10 ];
346
347 rc = KFileReadAll( self -> f, self -> pos, buffer1, sizeof buffer1, &num_read );
348 if ( 0 != rc )
349 {
350 /* we are not able to read 10 bytes from the file */
351 ErrMsg( "lookup_reader_get().KFileReadAll( at %ld, to_read %u ) -> %R", self -> pos, sizeof buffer1, rc );
352 }
353 else
354 {
355 if ( num_read != sizeof buffer1 )
356 {
357 rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcFormat, rcInvalid );
358 ErrMsg( "lookup_reader_get().KFileReadAll( at %ld, to_read %lu vs %lu )", self -> pos, sizeof buffer1, num_read );
359 }
360 else
361 {
362 uint16_t dna_len;
363 size_t to_read;
364
365 /* we get the key out of the 10 bytes */
366 memmove( key, buffer1, sizeof *key );
367
368 /* we get the dna-len out of the 10 bytes */
369 dna_len = buffer1[ 8 ];
370 dna_len <<= 8;
371 dna_len |= buffer1[ 9 ];
372
373 /* adjust the number of bytes to read to the half of the dna_len */
374 to_read = ( dna_len & 1 ) ? ( dna_len + 1 ) >> 1 : dna_len >> 1;
375 if ( 0 == to_read )
376 {
377 rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcFormat, rcInvalid );
378 ErrMsg( "lookup_reader_get() to_read == 0 at %lu", self -> pos );
379 packed_bases -> S . size = 0;
380 packed_bases -> S . len = 0;
381 self -> pos += ( 10 );
382 }
383 else
384 {
385 /* maybe we have to increase the size of the SBuffer, after seeing the real dna-length */
386 if ( packed_bases -> buffer_size < ( to_read + 2 ) )
387 {
388 rc = increase_SBuffer( packed_bases, ( to_read + 2 ) - packed_bases -> buffer_size );
389 }
390
391 if ( 0 == rc )
392 {
393 uint8_t * dst = ( uint8_t * )( packed_bases -> S . addr );
394
395 /* we write the dna-len into the first 2 bytes of the destination */
396 dst[ 0 ] = buffer1[ 8 ];
397 dst[ 1 ] = buffer1[ 9 ];
398 dst += 2;
399
400 rc = KFileReadAll( self -> f, self -> pos + 10, dst, to_read, &num_read );
401 if ( 0 != rc )
402 {
403 ErrMsg( "lookup_reader_get().KFileReadAll( at %ld, to_read %u ) -> %R", self -> pos + 10, to_read, rc );
404 }
405 else if ( num_read != to_read )
406 {
407 rc = RC( rcVDB, rcNoTarg, rcReading, rcFormat, rcInvalid );
408 ErrMsg( "lookup_reader_get().KFileReadAll( %ld ) %d vs %d -> %R", self -> pos + 10, num_read, to_read, rc );
409 }
410 else
411 {
412 packed_bases -> S . size = num_read + 2;
413 packed_bases -> S . len = ( uint32_t )packed_bases -> S . size;
414 self -> pos += ( num_read + 10 );
415 }
416 }
417 }
418 }
419 }
420 }
421 }
422 return rc;
423 }
424
lookup_bases(struct lookup_reader * self,int64_t row_id,uint32_t read_id,SBuffer * B,bool reverse)425 rc_t lookup_bases( struct lookup_reader * self, int64_t row_id, uint32_t read_id, SBuffer * B, bool reverse )
426 {
427 int64_t found_row_id;
428 uint32_t found_read_id;
429 uint64_t key;
430
431 rc_t rc = lookup_reader_get( self, &key, &self -> buf );
432 if ( 0 == rc )
433 {
434 found_row_id = key >> 1;
435 found_read_id = key & 1 ? 2 : 1;
436
437 if ( found_row_id == row_id && found_read_id == read_id )
438 {
439 rc = unpack_4na( &self -> buf . S, B, reverse ); /* helper.c */
440 }
441 else
442 {
443 /* in case the reader is not pointed to the right position, we try to seek again */
444 rc_t rc1;
445 uint64_t key_found;
446 uint64_t key_to_find = row_id;
447
448 key_to_find <<= 1;
449 if ( 1 == read_id )
450 {
451 key_to_find &= 0xFFFFFFFFFFFFFFFE;
452 }
453 else
454 {
455 key_to_find |= 1;
456 }
457
458 rc1 = seek_lookup_reader( self, key_to_find, &key_found, true );
459 if ( 0 == rc1 )
460 {
461 rc = lookup_reader_get( self, &key, &self -> buf );
462 if ( 0 == rc )
463 {
464 found_row_id = key >> 1;
465 found_read_id = key & 1 ? 2 : 1;
466
467 if ( found_row_id == row_id && found_read_id == read_id )
468 {
469 rc = unpack_4na( &self -> buf . S, B, reverse ); /* helper.c */
470 }
471 else
472 {
473 rc = RC( rcVDB, rcNoTarg, rcConstructing, rcTransfer, rcInvalid );
474 ErrMsg( "lookup_bases #2( %lu.%u ) ---> found %lu.%u (at pos=%lu)",
475 row_id, read_id, found_row_id, found_read_id, self -> pos );
476 }
477 }
478 }
479 else
480 {
481 rc = rc1;
482 ErrMsg( "lookup_bases( %lu.%u ) ---> seek failed ---> %R", row_id, read_id, rc );
483 }
484 }
485 }
486 else
487 {
488 ErrMsg( "lookup_bases( %lu.%u ) failed ---> %R", row_id, read_id, rc );
489 }
490 return rc;
491 }
492
493
lookup_check(struct lookup_reader * self)494 rc_t lookup_check( struct lookup_reader * self )
495 {
496 rc_t rc = 0;
497 int64_t last_key = 0;
498
499 while ( 0 == rc && self -> pos < self -> f_size )
500 {
501 uint64_t key;
502 size_t len;
503 rc = read_key_and_len( self, self -> pos, &key, &len );
504 if ( 0 == rc )
505 {
506 if ( last_key < key )
507 {
508 last_key = key;
509 }
510 else
511 {
512 rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcFormat, rcInvalid );
513 ErrMsg( "lookup_reader.c lookup_check() jump from %lu to %lu at %lu", last_key, key, self -> pos );
514 }
515 self -> pos += len;
516 }
517 }
518 return rc;
519 }
520
521
lookup_check_file(const KDirectory * dir,size_t buf_size,const char * filename)522 rc_t lookup_check_file( const KDirectory *dir, size_t buf_size, const char * filename )
523 {
524 lookup_reader * reader;
525 rc_t rc = make_lookup_reader( dir, NULL, &reader, buf_size, "%s", filename );
526 if ( 0 == rc )
527 {
528 rc = lookup_check( reader );
529 release_lookup_reader( reader );
530 }
531 return rc;
532 }
533
534
lookup_count(struct lookup_reader * self,uint32_t * count)535 rc_t lookup_count( struct lookup_reader * self, uint32_t * count )
536 {
537 rc_t rc = 0;
538 int32_t n = 0;
539
540 while ( 0 == rc && self -> pos < self -> f_size )
541 {
542 uint64_t key;
543 size_t len;
544 rc = read_key_and_len( self, self -> pos, &key, &len );
545 if ( 0 == rc )
546 {
547 n++;
548 self -> pos += len;
549 }
550 }
551
552 *count = ( 0 == rc ) ? n : 0;
553 return rc;
554 }
555
556
lookup_count_file(const KDirectory * dir,size_t buf_size,const char * filename,uint32_t * count)557 rc_t lookup_count_file( const KDirectory *dir, size_t buf_size, const char * filename, uint32_t * count )
558 {
559 lookup_reader * reader;
560 rc_t rc = make_lookup_reader( dir, NULL, &reader, buf_size, "%s", filename );
561 if ( 0 == rc )
562 {
563 rc = lookup_count( reader, count );
564 release_lookup_reader( reader );
565 }
566 return rc;
567 }
568
569
write_out_lookup(const KDirectory * dir,size_t buf_size,const char * lookup_file,const char * output_file)570 rc_t write_out_lookup( const KDirectory *dir, size_t buf_size, const char * lookup_file, const char * output_file )
571 {
572 lookup_reader * reader;
573 rc_t rc = make_lookup_reader( dir, NULL, &reader, buf_size, "%s", lookup_file );
574 if ( 0 == rc )
575 {
576 struct file_printer * printer;
577 rc = make_file_printer_from_filename( dir, &printer, buf_size, 1024, "%s", output_file );
578 if ( 0 == rc )
579 {
580 while ( 0 == rc && reader -> pos < reader -> f_size )
581 {
582 uint64_t key;
583 size_t len;
584 rc = read_key_and_len( reader, reader -> pos, &key, &len );
585 if ( 0 == rc )
586 {
587 rc = file_print( printer, "%lu\n", key );
588 reader -> pos += len;
589 }
590 }
591 destroy_file_printer( printer );
592 }
593 release_lookup_reader( reader );
594 }
595 return rc;
596 }
597