1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 
27 #include "lookup_reader.h"
28 #include "file_printer.h"
29 #include "helper.h"
30 
31 #include <klib/printf.h>
32 #include <kfs/file.h>
33 #include <kfs/buffile.h>
34 
35 #include <string.h>
36 #include <stdio.h>
37 
38 typedef struct lookup_reader
39 {
40     const struct KFile * f;
41     const struct index_reader * index;
42     SBuffer buf;
43     uint64_t pos, f_size, max_key;
44 } lookup_reader;
45 
46 
release_lookup_reader(struct lookup_reader * self)47 void release_lookup_reader( struct lookup_reader * self )
48 {
49     if ( NULL != self )
50     {
51         if ( NULL != self -> f )
52         {
53             rc_t rc = KFileRelease( self -> f );
54             if ( 0 != rc )
55             {
56                 ErrMsg( "release_lookup_reader().KFileRelease() -> %R", rc );
57             }
58         }
59         release_SBuffer( &( self -> buf ) ); /* helper.c */
60         free( ( void * ) self );
61     }
62 }
63 
make_lookup_reader_obj(struct lookup_reader ** reader,const struct index_reader * index,const struct KFile * f)64 static rc_t make_lookup_reader_obj( struct lookup_reader ** reader,
65                                     const struct index_reader * index,
66                                     const struct KFile * f )
67 {
68     rc_t rc = 0;
69     lookup_reader * r = calloc( 1, sizeof * r );
70     if ( NULL == r )
71     {
72         rc = RC( rcVDB, rcNoTarg, rcConstructing, rcMemory, rcExhausted );
73         ErrMsg( "make_lookup_reader_obj().calloc( %d ) -> %R", ( sizeof * r ), rc );
74     }
75     else
76     {
77         r -> f = f;
78         r -> index = index;
79         rc = KFileSize( f, & r -> f_size );
80         if ( 0 == rc )
81         {
82             rc = make_SBuffer( &( r -> buf ), 4096 ); /* helper.c */
83         }
84         else
85         {
86             ErrMsg( "make_lookup_reader_obj().KFileSize() -> %R", rc );
87         }
88 
89         if ( 0 == rc && NULL != index )
90         {
91             rc = get_max_key( index, & r -> max_key ); /* index.c */
92         }
93 
94         if ( 0 == rc )
95         {
96             *reader = r;
97         }
98         else
99         {
100             release_lookup_reader( r );
101         }
102     }
103     return rc;
104 }
105 
make_lookup_reader(const KDirectory * dir,const struct index_reader * index,struct lookup_reader ** reader,size_t buf_size,const char * fmt,...)106 rc_t make_lookup_reader( const KDirectory *dir, const struct index_reader * index,
107                          struct lookup_reader ** reader, size_t buf_size, const char * fmt, ... )
108 {
109     rc_t rc;
110     const struct KFile * f = NULL;
111 
112     va_list args;
113     va_start ( args, fmt );
114     rc = KDirectoryVOpenFileRead( dir, &f, fmt, args );
115     va_end ( args );
116 
117     if ( 0 != rc )
118     {
119         ErrMsg( "make_lookup_reader().KDirectoryVOpenFileRead( '?' ) -> %R",  rc );
120     }
121     else
122     {
123         if ( buf_size > 0 )
124         {
125             const struct KFile * temp_file = NULL;
126             rc = KBufFileMakeRead( &temp_file, f, buf_size );
127             if ( 0 != rc )
128             {
129                 ErrMsg( "make_lookup_reader().KBufFileMakeRead() -> %R", rc );
130             }
131             else
132             {
133                 rc = KFileRelease( f );
134                 if ( 0 != rc )
135                 {
136                     ErrMsg( "make_lookup_reader().KFileRelease() -> %R", rc );
137                 }
138                 else
139                 {
140                     f = temp_file;
141                 }
142             }
143         }
144 
145         if ( 0 == rc )
146         {
147             rc = make_lookup_reader_obj( reader, index, f );
148         }
149     }
150     return rc;
151 }
152 
153 
read_key_and_len(struct lookup_reader * self,uint64_t pos,uint64_t * key,size_t * len)154 static rc_t read_key_and_len( struct lookup_reader * self, uint64_t pos, uint64_t *key, size_t *len )
155 {
156     size_t num_read;
157     uint8_t buffer[ 10 ];
158     rc_t rc = KFileReadAll( self -> f, pos, buffer, sizeof buffer, &num_read );
159     if ( rc != 0 )
160     {
161         ErrMsg( "read_key_and_len().KFileReadAll( at %ld, to_read %u ) -> %R", pos, sizeof buffer, rc );
162     }
163     else if ( num_read != sizeof buffer )
164     {
165         if ( 0 == num_read )
166         {
167             rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcId, rcNotFound );
168         }
169         else
170         {
171             rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcFormat, rcInvalid );
172         }
173     }
174     else
175     {
176         uint16_t dna_len;
177         size_t packed_len;
178         memmove( key, buffer, sizeof *key );
179         dna_len = buffer[ 8 ];
180         dna_len <<= 8;
181         dna_len |= buffer[ 9 ];
182         packed_len = ( dna_len & 1 ) ? ( dna_len + 1 ) >> 1 : dna_len >> 1;
183         *len = ( ( sizeof *key ) + ( sizeof dna_len ) + packed_len );
184     }
185     return rc;
186 }
187 
188 
/* Decides whether key2 is accepted as a hit for key1:
   either both are identical, or key1 is even and key2 is its odd successor. */
static bool keys_equal( uint64_t key1, uint64_t key2 )
{
    if ( key1 == key2 )
    {
        return true;
    }
    return ( 0 == ( key1 & 0x01 ) ) && ( key2 == ( key1 + 1 ) );
}
198 
loop_until_key_found(struct lookup_reader * self,uint64_t key_to_find,uint64_t * key_found,uint64_t * offset)199 static rc_t loop_until_key_found( struct lookup_reader * self, uint64_t key_to_find,
200         uint64_t *key_found , uint64_t *offset )
201 {
202     rc_t rc = 0;
203     bool done = false;
204     uint64_t curr = *offset;
205     while ( !done && 0 == rc )
206     {
207         size_t found_len;
208         rc = read_key_and_len( self, curr, key_found, &found_len );
209         if ( keys_equal( key_to_find, *key_found ) )
210         {
211             done = true;
212             *offset = curr;
213         }
214         else if ( key_to_find > *key_found )
215         {
216             curr += found_len;
217         }
218         else
219         {
220             done = true;
221             rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcId, rcNotFound );
222         }
223     }
224     return rc;
225 }
226 
227 
full_table_seek(struct lookup_reader * self,uint64_t key_to_find,uint64_t * key_found)228 static rc_t full_table_seek( struct lookup_reader * self, uint64_t key_to_find, uint64_t * key_found )
229 {
230     /* we have no index! search the whole thing... */
231     uint64_t offset = 0;
232     rc_t rc = loop_until_key_found( self, key_to_find, key_found, &offset );
233     if ( 0 == rc )
234     {
235         if ( keys_equal( key_to_find, *key_found ) )
236         {
237             self -> pos = offset;
238         }
239         else
240         {
241             rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcId, rcNotFound );
242             ErrMsg( "lookup_reader.c full_table_seek( key: %ld ) -> %R", key_to_find, rc );
243         }
244     }
245     return rc;
246 }
247 
248 
indexed_seek(struct lookup_reader * self,uint64_t key_to_find,uint64_t * key_found,bool exactly)249 static rc_t indexed_seek( struct lookup_reader * self, uint64_t key_to_find, uint64_t * key_found, bool exactly )
250 {
251     /* we have a index! find set pos to the found offset */
252     rc_t rc;
253     uint64_t offset = 0;
254     if ( self -> max_key > 0 && ( key_to_find > self -> max_key ) )
255     {
256         rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcId, rcTooBig );
257         /* ErrMsg( "lookup_reader.c indexed_seek( key_to_find=%lu, max_key=%lu ) -> %R", key_to_find, self -> max_key, rc ); */
258     }
259     else
260     {
261         rc = get_nearest_offset( self -> index, key_to_find, key_found, &offset ); /* in index.c */
262         if ( 0 == rc )
263         {
264             if ( keys_equal( key_to_find, *key_found ) )
265             {
266                 self -> pos = offset;
267             }
268             else
269             {
270                 if ( exactly )
271                 {
272                     rc = loop_until_key_found( self, key_to_find, key_found, &offset );
273                     if ( 0 == rc )
274                     {
275                         if ( keys_equal( key_to_find, *key_found ) )
276                         {
277                             self -> pos = offset;
278                         }
279                         else
280                         {
281                             rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcId, rcNotFound );
282                         }
283                     }
284                     else
285                     {
286                         rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcId, rcNotFound );
287                     }
288                 }
289                 else
290                 {
291                     self -> pos = offset;
292                     rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcId, rcNotFound );
293                 }
294             }
295         }
296     }
297     return rc;
298 }
299 
300 
seek_lookup_reader(struct lookup_reader * self,uint64_t key_to_find,uint64_t * key_found,bool exactly)301 rc_t seek_lookup_reader( struct lookup_reader * self, uint64_t key_to_find, uint64_t * key_found, bool exactly )
302 {
303     rc_t rc = 0;
304     if ( NULL == self || NULL == key_found )
305     {
306         rc = RC( rcVDB, rcNoTarg, rcReading, rcParam, rcInvalid );
307         ErrMsg( "lookup_reader.c seek_lookup_reader() -> %R", rc );
308     }
309     else
310     {
311         if ( NULL != self -> index )
312         {
313             rc = indexed_seek( self, key_to_find, key_found, exactly );
314             if ( 0 != rc )
315             {
316                 rc = full_table_seek( self, key_to_find, key_found );
317             }
318         }
319         else
320         {
321             rc = full_table_seek( self, key_to_find, key_found );
322         }
323     }
324     return rc;
325 }
326 
327 
lookup_reader_get(struct lookup_reader * self,uint64_t * key,SBuffer * packed_bases)328 rc_t lookup_reader_get( struct lookup_reader * self, uint64_t * key, SBuffer * packed_bases )
329 {
330     rc_t rc = 0;
331     if ( NULL == self || NULL == key || NULL == packed_bases )
332     {
333         rc = RC( rcVDB, rcNoTarg, rcReading, rcParam, rcInvalid );
334         ErrMsg( "lookup_reader_get() #invalid input# -> %R",  rc );
335     }
336     else
337     {
338         if ( self -> pos >= ( self -> f_size - 1 ) )
339         {
340             rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcFormat, rcInvalid );
341         }
342         else
343         {
344             size_t num_read;
345             uint8_t buffer1[ 10 ];
346 
347             rc = KFileReadAll( self -> f, self -> pos, buffer1, sizeof buffer1, &num_read );
348             if ( 0 != rc )
349             {
350                 /* we are not able to read 10 bytes from the file */
351                 ErrMsg( "lookup_reader_get().KFileReadAll( at %ld, to_read %u ) -> %R", self -> pos, sizeof buffer1, rc );
352             }
353             else
354             {
355                 if ( num_read != sizeof buffer1 )
356                 {
357                     rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcFormat, rcInvalid );
358                     ErrMsg( "lookup_reader_get().KFileReadAll( at %ld, to_read %lu vs %lu )", self -> pos, sizeof buffer1, num_read );
359                 }
360                 else
361                 {
362                     uint16_t dna_len;
363                     size_t to_read;
364 
365                     /* we get the key out of the 10 bytes */
366                     memmove( key, buffer1, sizeof *key );
367 
368                     /* we get the dna-len out of the 10 bytes */
369                     dna_len = buffer1[ 8 ];
370                     dna_len <<= 8;
371                     dna_len |= buffer1[ 9 ];
372 
373                     /* adjust the number of bytes to read to the half of the dna_len */
374                     to_read = ( dna_len & 1 ) ? ( dna_len + 1 ) >> 1 : dna_len >> 1;
375                     if ( 0 == to_read )
376                     {
377                         rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcFormat, rcInvalid );
378                         ErrMsg( "lookup_reader_get() to_read == 0 at %lu", self -> pos );
379                         packed_bases -> S . size = 0;
380                         packed_bases -> S . len = 0;
381                         self -> pos += ( 10 );
382                     }
383                     else
384                     {
385                         /* maybe we have to increase the size of the SBuffer, after seeing the real dna-length */
386                         if ( packed_bases -> buffer_size < ( to_read + 2 ) )
387                         {
388                             rc = increase_SBuffer( packed_bases, ( to_read + 2 ) - packed_bases -> buffer_size );
389                         }
390 
391                         if ( 0 == rc )
392                         {
393                             uint8_t * dst = ( uint8_t * )( packed_bases -> S . addr );
394 
395                             /* we write the dna-len into the first 2 bytes of the destination */
396                             dst[ 0 ] = buffer1[ 8 ];
397                             dst[ 1 ] = buffer1[ 9 ];
398                             dst += 2;
399 
400                             rc = KFileReadAll( self -> f, self -> pos + 10, dst, to_read, &num_read );
401                             if ( 0 != rc )
402                             {
403                                 ErrMsg( "lookup_reader_get().KFileReadAll( at %ld, to_read %u ) -> %R", self -> pos + 10, to_read, rc );
404                             }
405                             else if ( num_read != to_read )
406                             {
407                                 rc = RC( rcVDB, rcNoTarg, rcReading, rcFormat, rcInvalid );
408                                 ErrMsg( "lookup_reader_get().KFileReadAll( %ld ) %d vs %d -> %R", self -> pos + 10, num_read, to_read, rc );
409                             }
410                             else
411                             {
412                                 packed_bases -> S . size = num_read + 2;
413                                 packed_bases -> S . len = ( uint32_t )packed_bases -> S . size;
414                                 self -> pos += ( num_read + 10 );
415                             }
416                         }
417                     }
418                 }
419             }
420         }
421     }
422     return rc;
423 }
424 
lookup_bases(struct lookup_reader * self,int64_t row_id,uint32_t read_id,SBuffer * B,bool reverse)425 rc_t lookup_bases( struct lookup_reader * self, int64_t row_id, uint32_t read_id, SBuffer * B, bool reverse )
426 {
427     int64_t found_row_id;
428     uint32_t found_read_id;
429     uint64_t key;
430 
431     rc_t rc = lookup_reader_get( self, &key, &self -> buf );
432     if ( 0 == rc )
433     {
434         found_row_id = key >> 1;
435         found_read_id = key & 1 ? 2 : 1;
436 
437         if ( found_row_id == row_id && found_read_id == read_id )
438         {
439             rc = unpack_4na( &self -> buf . S, B, reverse ); /* helper.c */
440         }
441         else
442         {
443             /* in case the reader is not pointed to the right position, we try to seek again */
444             rc_t rc1;
445             uint64_t key_found;
446             uint64_t key_to_find = row_id;
447 
448             key_to_find <<= 1;
449             if ( 1 == read_id )
450             {
451                 key_to_find &= 0xFFFFFFFFFFFFFFFE;
452             }
453             else
454             {
455                 key_to_find |= 1;
456             }
457 
458             rc1 = seek_lookup_reader( self, key_to_find, &key_found, true );
459             if ( 0 == rc1 )
460             {
461                 rc = lookup_reader_get( self, &key, &self -> buf );
462                 if ( 0 == rc )
463                 {
464                     found_row_id = key >> 1;
465                     found_read_id = key & 1 ? 2 : 1;
466 
467                     if ( found_row_id == row_id && found_read_id == read_id )
468                     {
469                         rc = unpack_4na( &self -> buf . S, B, reverse ); /* helper.c */
470                     }
471                     else
472                     {
473                         rc = RC( rcVDB, rcNoTarg, rcConstructing, rcTransfer, rcInvalid );
474                         ErrMsg( "lookup_bases #2( %lu.%u ) ---> found %lu.%u (at pos=%lu)",
475                                     row_id, read_id, found_row_id, found_read_id, self -> pos );
476                     }
477                 }
478             }
479             else
480             {
481                 rc = rc1;
482                 ErrMsg( "lookup_bases( %lu.%u ) ---> seek failed ---> %R", row_id, read_id, rc );
483             }
484         }
485     }
486     else
487     {
488         ErrMsg( "lookup_bases( %lu.%u ) failed ---> %R", row_id, read_id, rc );
489     }
490     return rc;
491 }
492 
493 
lookup_check(struct lookup_reader * self)494 rc_t lookup_check( struct lookup_reader * self )
495 {
496     rc_t rc = 0;
497     int64_t last_key = 0;
498 
499     while ( 0 == rc && self -> pos < self -> f_size )
500     {
501         uint64_t key;
502         size_t len;
503         rc = read_key_and_len( self, self -> pos, &key, &len );
504         if ( 0 == rc )
505         {
506             if ( last_key < key )
507             {
508                 last_key = key;
509             }
510             else
511             {
512                 rc = SILENT_RC( rcVDB, rcNoTarg, rcReading, rcFormat, rcInvalid );
513                 ErrMsg( "lookup_reader.c lookup_check() jump from %lu to %lu at %lu", last_key, key, self -> pos );
514             }
515             self -> pos += len;
516         }
517     }
518     return rc;
519 }
520 
521 
lookup_check_file(const KDirectory * dir,size_t buf_size,const char * filename)522 rc_t lookup_check_file( const KDirectory *dir, size_t buf_size, const char * filename )
523 {
524     lookup_reader * reader;
525     rc_t rc = make_lookup_reader( dir, NULL, &reader, buf_size, "%s", filename );
526     if ( 0 == rc )
527     {
528         rc = lookup_check( reader );
529         release_lookup_reader( reader );
530     }
531     return rc;
532 }
533 
534 
lookup_count(struct lookup_reader * self,uint32_t * count)535 rc_t lookup_count( struct lookup_reader * self, uint32_t * count )
536 {
537     rc_t rc = 0;
538     int32_t n = 0;
539 
540     while ( 0 == rc && self -> pos < self -> f_size )
541     {
542         uint64_t key;
543         size_t len;
544         rc = read_key_and_len( self, self -> pos, &key, &len );
545         if ( 0 == rc )
546         {
547             n++;
548             self -> pos += len;
549         }
550     }
551 
552     *count = ( 0 == rc ) ? n : 0;
553     return rc;
554 }
555 
556 
lookup_count_file(const KDirectory * dir,size_t buf_size,const char * filename,uint32_t * count)557 rc_t lookup_count_file( const KDirectory *dir, size_t buf_size, const char * filename, uint32_t * count )
558 {
559     lookup_reader * reader;
560     rc_t rc = make_lookup_reader( dir, NULL, &reader, buf_size, "%s", filename );
561     if ( 0 == rc )
562     {
563         rc = lookup_count( reader, count );
564         release_lookup_reader( reader );
565     }
566     return rc;
567 }
568 
569 
write_out_lookup(const KDirectory * dir,size_t buf_size,const char * lookup_file,const char * output_file)570 rc_t write_out_lookup( const KDirectory *dir, size_t buf_size, const char * lookup_file, const char * output_file )
571 {
572     lookup_reader * reader;
573     rc_t rc = make_lookup_reader( dir, NULL, &reader, buf_size, "%s", lookup_file );
574     if ( 0 == rc )
575     {
576         struct file_printer * printer;
577         rc = make_file_printer_from_filename( dir, &printer, buf_size, 1024, "%s", output_file );
578         if ( 0 == rc )
579         {
580             while ( 0 == rc && reader -> pos < reader -> f_size )
581             {
582                 uint64_t key;
583                 size_t len;
584                 rc = read_key_and_len( reader, reader -> pos, &key, &len );
585                 if ( 0 == rc )
586                 {
587                     rc = file_print( printer, "%lu\n", key );
588                     reader -> pos += len;
589                 }
590             }
591             destroy_file_printer( printer );
592         }
593         release_lookup_reader( reader );
594     }
595     return rc;
596 }
597