1 /*===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 
27 #include "map-file.h"
28 #include "idx-mapping.h"
29 #include "ctx.h"
30 #include "caps.h"
31 #include "status.h"
32 #include "mem.h"
33 #include "sra-sort.h"
34 
35 #include <kfs/directory.h>
36 #include <kfs/file.h>
37 #include <kfs/buffile.h>
38 #include <klib/refcount.h>
39 #include <klib/sort.h>
40 #include <klib/rc.h>
41 
42 #include <string.h>
43 #include <endian.h>
44 #include <byteswap.h>
45 
46 #include "except.h"
47 
48 FILE_ENTRY ( map-file );
49 
50 
51 /*--------------------------------------------------------------------------
52  * MapFile
53  *  a file for storing id mappings
54  */
55 struct MapFile
56 {
57     int64_t first_id;
58     uint64_t num_ids;
59     uint64_t num_mapped_ids;
60     int64_t max_new_id;
61     KFile *f_old, *f_new, *f_pos;
62     size_t id_size;
63     KRefcount refcount;
64 };
65 
66 
67 /* Whack
68  */
69 static
MapFileWhack(MapFile * self,const ctx_t * ctx)70 void MapFileWhack ( MapFile *self, const ctx_t *ctx )
71 {
72     FUNC_ENTRY ( ctx );
73     rc_t rc = KFileRelease ( self -> f_old );
74     if ( rc != 0 )
75         SYSTEM_ERROR ( rc, "KFileRelease failed on old=>new" );
76     else
77     {
78         rc = KFileRelease ( self -> f_new );
79         if ( rc != 0 )
80             ABORT ( rc, "KFileRelease failed on new=>old" );
81         else if ( self -> f_pos != NULL )
82         {
83             rc = KFileRelease ( self -> f_pos );
84             if ( rc != 0 )
85                 ABORT ( rc, "KFileRelease failed on global poslen temp column" );
86         }
87 
88         MemFree ( ctx, self, sizeof * self );
89     }
90 }
91 
92 
93 /* Make
94  *  creates an id map
95  */
96 static
MapFileMakeFork(KFile ** fp,const ctx_t * ctx,const char * name,KDirectory * wd,const char * tmpdir,int pid,size_t bsize,const char * fork)97 void MapFileMakeFork ( KFile **fp, const ctx_t *ctx, const char *name,
98     KDirectory *wd, const char *tmpdir, int pid, size_t bsize, const char *fork )
99 {
100     FUNC_ENTRY ( ctx );
101 
102     /* create temporary KFile */
103     rc_t rc;
104     KFile *backing;
105 
106     rc = KDirectoryCreateFile ( wd, & backing, true,
107         0600, kcmInit | kcmParents, "%s/sra-sort-%s.%s.%d", tmpdir, name, fork, pid );
108     if ( rc != 0 )
109         SYSTEM_ERROR ( rc, "failed to create %s id map file '%s'", fork, name );
110     else
111     {
112 #if ! WINDOWS
113         /* never try to remove files on Windows */
114         if ( ctx -> caps -> tool -> unlink_idx_files )
115         {
116             /* unlink KFile */
117             rc = KDirectoryRemove ( wd, false, "%s/sra-sort-%s.%s.%d", tmpdir, name, fork, pid );
118             if ( rc != 0 )
119                 WARN ( "failed to unlink %s id map file '%s'", fork, name );
120         }
121 #endif
122 
123         /* create a read/write buffer file */
124         rc = KBufFileMakeWrite ( fp, backing, true, bsize );
125         if ( rc != 0 )
126             INTERNAL_ERROR ( rc, "failed to create buffer for %s id map file '%s'", fork, name );
127 
128         KFileRelease ( backing );
129     }
130 }
131 
132 static
MapFileMakeInt(const ctx_t * ctx,const char * name,bool random,bool for_poslen)133 MapFile *MapFileMakeInt ( const ctx_t *ctx, const char *name, bool random, bool for_poslen )
134 {
135     MapFile *mf;
136     TRY ( mf = MemAlloc ( ctx, sizeof * mf, true ) )
137     {
138         /* create KDirectory */
139         KDirectory *wd;
140         rc_t rc = KDirectoryNativeDir ( & wd );
141         if ( rc != 0 )
142             SYSTEM_ERROR ( rc, "failed to create native directory" );
143         else
144         {
145             const Tool *tp = ctx -> caps -> tool;
146             size_t bsize = random ? tp -> map_file_random_bsize : tp -> map_file_bsize;
147 
148             /* create old=>new id file */
149             TRY ( MapFileMakeFork ( & mf -> f_old, ctx, name, wd, tp -> tmpdir, tp -> pid, bsize, "old" ) )
150             {
151                 TRY ( MapFileMakeFork ( & mf -> f_new, ctx, name, wd, tp -> tmpdir, tp -> pid, 32 * 1024, "new" ) )
152                 {
153                     if ( for_poslen )
154                         MapFileMakeFork ( & mf -> f_pos, ctx, name, wd, tp -> tmpdir, tp -> pid, 32 * 1024, "pos" );
155 
156                     KDirectoryRelease ( wd );
157 
158                     if ( ! FAILED () )
159                     {
160                         /* this is our guy */
161                         KRefcountInit ( & mf -> refcount, 1, "MapFile", "make", name );
162 
163                         return mf;
164                     }
165 
166                     KFileRelease ( mf -> f_new );
167                 }
168 
169                 KFileRelease ( mf -> f_old );
170             }
171 
172             KDirectoryRelease ( wd );
173         }
174 
175         MemFree ( ctx, mf, sizeof * mf );
176     }
177 
178     return NULL;
179 }
180 
MapFileMake(const ctx_t * ctx,const char * name,bool random)181 MapFile *MapFileMake ( const ctx_t *ctx, const char *name, bool random )
182 {
183     FUNC_ENTRY ( ctx );
184     return MapFileMakeInt ( ctx, name, random, false );
185 }
186 
187 /* MakeForPoslen
188  *  creates an id map with an additional poslen file
189  *  this is specially for PRIMARY_ALIGNMENT_IDS and SECONDARY_ALIGNMENT_IDS
190  *  that use *_ALIGNMENT global position and length as a sorting key
191  *  we drop it to a file after its generation so that it can be
192  *  picked up later when copying the corresponding alignment table.
193  */
MapFileMakeForPoslen(const ctx_t * ctx,const char * name)194 MapFile *MapFileMakeForPoslen ( const ctx_t *ctx, const char *name )
195 {
196     FUNC_ENTRY ( ctx );
197     return MapFileMakeInt ( ctx, name, false, true );
198 }
199 
200 
201 /* Release
202  */
MapFileRelease(const MapFile * self,const ctx_t * ctx)203 void MapFileRelease ( const MapFile *self, const ctx_t *ctx )
204 {
205     rc_t rc;
206     FUNC_ENTRY ( ctx );
207 
208     if ( self != NULL )
209     {
210         switch ( KRefcountDrop ( & self -> refcount, "MapFile" ) )
211         {
212         case krefOkay:
213             break;
214         case krefWhack:
215             MapFileWhack ( ( MapFile* ) self, ctx );
216             break;
217         case krefZero:
218         case krefLimit:
219         case krefNegative:
220             rc = RC ( rcExe, rcFile, rcDestroying, rcRefcount, rcInvalid );
221             INTERNAL_ERROR ( rc, "failed to release MapFile" );
222         }
223     }
224 }
225 
226 /* Duplicate
227  */
MapFileDuplicate(const MapFile * self,const ctx_t * ctx)228 MapFile *MapFileDuplicate ( const MapFile *self, const ctx_t *ctx )
229 {
230     rc_t rc;
231     FUNC_ENTRY ( ctx );
232 
233     if ( self != NULL )
234     {
235         switch ( KRefcountAdd ( & self -> refcount, "MapFile" ) )
236         {
237         case krefOkay:
238         case krefWhack:
239         case krefZero:
240             break;
241         case krefLimit:
242         case krefNegative:
243             rc = RC ( rcExe, rcFile, rcAttaching, rcRefcount, rcInvalid );
244             INTERNAL_ERROR ( rc, "failed to duplicate MapFile" );
245             return NULL;
246         }
247     }
248 
249     return ( MapFile* ) self;
250 }
251 
252 
253 /* SsetIdRange
254  *  required second-stage initialization
255  *  must be called before any writes occur
256  */
MapFileSetIdRange(MapFile * self,const ctx_t * ctx,int64_t first_id,uint64_t num_ids)257 void MapFileSetIdRange ( MapFile *self, const ctx_t *ctx,
258     int64_t first_id, uint64_t num_ids )
259 {
260     rc_t rc;
261     FUNC_ENTRY ( ctx );
262 
263     if ( self == NULL )
264     {
265         rc = RC ( rcExe, rcFile, rcUpdating, rcSelf, rcNull );
266         INTERNAL_ERROR ( rc, "bad self" );
267     }
268     else if ( self -> max_new_id != 0 )
269     {
270         rc = RC ( rcExe, rcFile, rcUpdating, rcConstraint, rcViolated );
271         INTERNAL_ERROR ( rc, "cannot change id range after writing has begun" );
272     }
273     else
274     {
275         /* record new first id and count */
276         self -> first_id = first_id;
277         self -> num_ids = num_ids;
278 
279         /* determine id size for all ids PLUS 0 for NULL */
280         for ( ++ num_ids, self -> id_size = 1; self -> id_size < 8; ++ self -> id_size )
281         {
282             if ( num_ids <= ( ( uint64_t ) 1 ) << ( self -> id_size * 8 ) )
283                 break;
284         }
285     }
286 }
287 
288 
289 /* First
290  *  return first mapped id
291  */
MapFileFirst(const MapFile * self,const ctx_t * ctx)292 int64_t MapFileFirst ( const MapFile *self, const ctx_t *ctx )
293 {
294     return self -> first_id;
295 }
296 
297 
298 /* Count
299  *  return num_ids
300  */
MapFileCount(const MapFile * self,const ctx_t * ctx)301 uint64_t MapFileCount ( const MapFile *self, const ctx_t *ctx )
302 {
303     return self -> num_ids;
304 }
305 
306 
307 /* SetOldToNew
308  *  write old=>new id mappings
309  */
MapFileSetOldToNew(MapFile * self,const ctx_t * ctx,const IdxMapping * ids,size_t count)310 void MapFileSetOldToNew ( MapFile *self, const ctx_t *ctx, const IdxMapping *ids, size_t count )
311 {
312     rc_t rc;
313     FUNC_ENTRY ( ctx );
314 
315     if ( self == NULL )
316     {
317         rc = RC ( rcExe, rcFile, rcWriting, rcSelf, rcNull );
318         INTERNAL_ERROR ( rc, "bad self reference" );
319     }
320     else if ( self -> num_ids == 0 )
321     {
322         rc = RC ( rcExe, rcFile, rcWriting, rcRange, rcUndefined );
323         INTERNAL_ERROR ( rc, "SetIdRange must be called with a non-empty range" );
324     }
325     else
326     {
327         size_t i;
328         for ( i = 0; i < count; ++ i )
329         {
330             size_t num_writ;
331             /* zero-based index */
332             int64_t old_id = ids [ i ] . old_id - self -> first_id;
333             /* position in bytes */
334             uint64_t pos = old_id * self -> id_size;
335             /* 1-based translated new-id */
336             int64_t new_id = ids [ i ] . new_id - self -> first_id + 1;
337             assert ( old_id >= 0 );
338             assert ( new_id >= 0 );
339 #if __BYTE_ORDER == __BIG_ENDIAN
340             new_id = bswap_64 ( new_id );
341 #endif
342             rc = KFileWriteAll ( self -> f_old, pos, & new_id, self -> id_size, & num_writ );
343             if ( rc != 0 )
344             {
345                 SYSTEM_ERROR ( rc, "failed to write old=>new id mapping" );
346                 break;
347             }
348 
349             if ( num_writ != self -> id_size )
350             {
351                 rc = RC ( rcExe, rcFile, rcWriting, rcTransfer, rcIncomplete );
352                 SYSTEM_ERROR ( rc, "failed to write old=>new id mapping" );
353                 break;
354             }
355         }
356     }
357 }
358 
359 
360 /* SetNewToOld
361  *  write new=>old id mappings
362  */
MapFileSetNewToOld(MapFile * self,const ctx_t * ctx,const IdxMapping * ids,size_t count)363 void MapFileSetNewToOld ( MapFile *self, const ctx_t *ctx, const IdxMapping *ids, size_t count )
364 {
365     rc_t rc;
366     FUNC_ENTRY ( ctx );
367 
368     if ( self == NULL )
369     {
370         rc = RC ( rcExe, rcFile, rcWriting, rcSelf, rcNull );
371         INTERNAL_ERROR ( rc, "bad self reference" );
372     }
373     else if ( self -> num_ids == 0 )
374     {
375         rc = RC ( rcExe, rcFile, rcWriting, rcRange, rcUndefined );
376         INTERNAL_ERROR ( rc, "SetIdRange must be called with a non-empty range" );
377     }
378     else
379     {
380         if ( ! ctx -> caps -> tool -> write_new_to_old )
381             self -> max_new_id = self -> first_id + count - 1;
382         else
383         {
384             size_t i;
385             for ( i = 0; i < count; ++ i )
386             {
387                 size_t num_writ;
388                 /* zero based index */
389                 int64_t new_id = ids [ i ] . new_id - self -> first_id;
390                 /* zero based byte offset */
391                 uint64_t pos = new_id * self -> id_size;
392                 /* 1-based translated old-id */
393                 int64_t old_id = ids [ i ] . old_id - self -> first_id + 1;
394                 assert ( new_id >= 0 );
395                 assert ( old_id >= 0 );
396 #if __BYTE_ORDER == __BIG_ENDIAN
397                 old_id = bswap_64 ( old_id );
398 #endif
399                 rc = KFileWriteAll ( self -> f_new, pos, & old_id, self -> id_size, & num_writ );
400                 if ( rc != 0 )
401                 {
402                     SYSTEM_ERROR ( rc, "failed to write new=>old id mapping" );
403                     break;
404                 }
405 
406                 if ( num_writ != self -> id_size )
407                 {
408                     rc = RC ( rcExe, rcFile, rcWriting, rcTransfer, rcIncomplete );
409                     SYSTEM_ERROR ( rc, "failed to write new=>old id mapping" );
410                     break;
411                 }
412 
413                 self -> max_new_id = new_id + self -> first_id;
414             }
415         }
416     }
417 }
418 
419 
420 /* SetPoslen
421  *  write global position/length in new-id order
422  */
MapFileSetPoslen(MapFile * self,const ctx_t * ctx,const IdxMapping * ids,size_t count)423 void MapFileSetPoslen ( MapFile *self, const ctx_t *ctx, const IdxMapping *ids, size_t count )
424 {
425     rc_t rc;
426     FUNC_ENTRY ( ctx );
427 
428     if ( self == NULL )
429     {
430         rc = RC ( rcExe, rcFile, rcWriting, rcSelf, rcNull );
431         INTERNAL_ERROR ( rc, "bad self reference" );
432     }
433     else if ( self -> f_pos == NULL )
434     {
435         rc = RC ( rcExe, rcFile, rcWriting, rcFile, rcIncorrect );
436         INTERNAL_ERROR ( rc, "MapFile must be created with MapFileMakeForPoslen" );
437     }
438     else if ( self -> num_ids == 0 )
439     {
440         rc = RC ( rcExe, rcFile, rcWriting, rcRange, rcUndefined );
441         INTERNAL_ERROR ( rc, "SetIdRange must be called with a non-empty range" );
442     }
443     else
444     {
445         /* start writing after the last new id recorded */
446         uint64_t pos = ( self -> max_new_id - self -> first_id + 1 ) * sizeof ids -> new_id;
447 
448         size_t i;
449         for ( i = 0; i < count; pos += sizeof ids -> new_id, ++ i )
450         {
451             size_t num_writ;
452             int64_t poslen = ids [ i ] . new_id;
453 #if __BYTE_ORDER == __BIG_ENDIAN
454             poslen = bswap_64 ( poslen );
455 #endif
456             rc = KFileWriteAll ( self -> f_pos, pos, & poslen, sizeof ids -> new_id, & num_writ );
457             if ( rc != 0 )
458             {
459                 SYSTEM_ERROR ( rc, "failed to write poslen temporary column" );
460                 break;
461             }
462 
463             if ( num_writ != sizeof ids -> new_id )
464             {
465                 rc = RC ( rcExe, rcFile, rcWriting, rcTransfer, rcIncomplete );
466                 SYSTEM_ERROR ( rc, "failed to write poslen temporary column" );
467                 break;
468             }
469         }
470     }
471 }
472 
473 
474 /* ReadPoslen
475  *  read global position/length in new-id order
476  *  starts reading at the indicated row
477  *  returns the number read
478  */
MapFileReadPoslen(const MapFile * self,const ctx_t * ctx,int64_t start_id,uint64_t * poslen,size_t max_count)479 size_t MapFileReadPoslen ( const MapFile *self, const ctx_t *ctx,
480     int64_t start_id, uint64_t *poslen, size_t max_count )
481 {
482     FUNC_ENTRY ( ctx );
483 
484     rc_t rc;
485     size_t total = 0;
486 
487     if ( self == NULL )
488     {
489         rc = RC ( rcExe, rcFile, rcReading, rcSelf, rcNull );
490         INTERNAL_ERROR ( rc, "bad self reference" );
491     }
492     else if ( self -> f_pos == NULL )
493     {
494         rc = RC ( rcExe, rcFile, rcReading, rcFile, rcIncorrect );
495         INTERNAL_ERROR ( rc, "MapFile must be created with MapFileMakeForPoslen" );
496     }
497     /* the column is read until end, but
498        this is not an error. just return no-more-data */
499     else if ( start_id == self -> first_id + self -> num_ids )
500         return 0;
501     else if ( start_id < self -> first_id || start_id > self -> first_id + self -> num_ids )
502     {
503         rc = RC ( rcExe, rcFile, rcReading, rcParam, rcInvalid );
504         INTERNAL_ERROR ( rc, "start_id ( %ld ) is not within column range ( %ld .. %ld )",
505             start_id, self -> first_id, self -> first_id + self -> num_ids - 1 );
506     }
507     else if ( max_count != 0 )
508     {
509         /* limit read to number of ids in index */
510         if ( start_id + max_count > self -> first_id + self -> num_ids )
511             max_count = ( size_t ) ( self -> first_id + self -> num_ids - start_id );
512 
513         if ( poslen == NULL )
514         {
515             rc = RC ( rcExe, rcFile, rcReading, rcBuffer, rcNull );
516             INTERNAL_ERROR ( rc, "bad buffer parameter" );
517         }
518         else
519         {
520             /* read as many bytes of id as possible */
521             size_t num_read, to_read = max_count * sizeof * poslen;
522             uint64_t pos = ( start_id - self -> first_id ) * sizeof * poslen;
523             rc = KFileReadAll ( self -> f_pos, pos, poslen, to_read, & num_read );
524             if ( rc != 0 )
525                 SYSTEM_ERROR ( rc, "failed to read new=>old map" );
526             else
527             {
528                 total = ( size_t ) ( num_read / sizeof * poslen );
529 
530 #if __BYTE_ORDER == __BIG_ENDIAN
531                 {
532                     /* revert byte-order */
533                     size_t i;
534                     for ( i = 0; i < total; ++ i )
535                         poslen [ i ] = bswap_64 ( poslen [ i ] );
536                 }
537 #endif
538             }
539         }
540     }
541 
542     return total;
543 }
544 
545 
546 /* AllocMissingNewIds
547  *  it is possible to have written fewer new=>old mappings
548  *  than are supposed to be there, e.g. SEQUENCE that gets
549  *  initially written by align tables. unaligned sequences
550  *  need to fill in the remainder.
551  */
552 static
make_missing_entry(MapFile * self,const ctx_t * ctx,IdxMapping * missing,size_t i,size_t max_missing_ids,int64_t old_id,int64_t new_id)553 size_t make_missing_entry ( MapFile *self, const ctx_t *ctx,
554     IdxMapping *missing, size_t i, size_t max_missing_ids,
555     int64_t old_id, int64_t new_id )
556 {
557     if ( i == max_missing_ids )
558     {
559         ON_FAIL ( MapFileSetNewToOld ( self, ctx, missing, i ) )
560             return i;
561 
562         i = 0;
563     }
564 
565     /* insert into missing id map */
566     missing [ i ] . old_id = old_id;
567     missing [ i ] . new_id = new_id;
568 
569     /* update old=>new mapping */
570     MapFileSetOldToNew ( self, ctx, & missing [ i ], 1 );
571 
572     return i + 1;
573 }
574 
MapFileAllocMissingNewIds(MapFile * self,const ctx_t * ctx)575 int64_t MapFileAllocMissingNewIds ( MapFile *self, const ctx_t *ctx )
576 {
577     FUNC_ENTRY ( ctx );
578 
579     rc_t rc;
580     int64_t first_allocated = 0;
581 
582     if ( self == NULL )
583     {
584         rc = RC ( rcExe, rcFile, rcUpdating, rcSelf, rcNull );
585         INTERNAL_ERROR ( rc, "bad self reference" );
586     }
587     else if ( self -> num_ids == 0 )
588     {
589         rc = RC ( rcExe, rcFile, rcWriting, rcRange, rcUndefined );
590         INTERNAL_ERROR ( rc, "SetIdRange must be called with a non-empty range" );
591     }
592     else
593     {
594         uint64_t num_missing_ids = self -> num_ids - ( self -> max_new_id - self -> first_id + 1 );
595         if ( num_missing_ids != 0 )
596         {
597             IdxMapping *missing;
598 
599             /* determine maximum missing id mappings */
600             size_t max_missing_ids = ctx -> caps -> tool -> max_missing_ids;
601             if ( ( uint64_t ) max_missing_ids > num_missing_ids )
602                 max_missing_ids = ( size_t ) num_missing_ids;
603 
604             /* allocate a buffer of IdxMapping */
605             TRY ( missing = MemAlloc ( ctx, sizeof * missing * max_missing_ids, false ) )
606             {
607                 /* use a scan buffer somewhere around 32K,
608                    to keep page-file cache focused on a few pages */
609                 void *scan_buffer;
610                 const size_t scan_buffer_size = 32 * 1024;
611                 TRY ( scan_buffer = MemAlloc ( ctx, scan_buffer_size, false ) )
612                 {
613                     size_t i, num_read;
614                     int64_t new_id, old_id = self -> first_id;
615                     uint64_t eof = self -> num_ids * self -> id_size;
616 
617                     /* this guy will be used to assign new ids */
618                     int64_t max_new_id = self -> max_new_id;
619                     int64_t entry_max_new_id = self -> max_new_id;
620 
621 
622                     /* in a loop, read as many ids as possible into scan buffer
623                        the number is dependent upon self->id_size */
624                     for ( i = 0; ; old_id += num_read )
625                     {
626                         size_t j, to_read;
627 
628                         /* determine position from old_id */
629                         uint64_t pos = ( old_id - self -> first_id ) * self -> id_size;
630                         if ( pos == eof )
631                             break;
632 
633                         /* determine bytes to read - this must be limited
634                            or we risk tricking page-buffer into resizing */
635                         to_read = scan_buffer_size;
636                         if ( pos + to_read > eof )
637                             to_read = ( size_t ) ( eof - pos );
638 
639                         /* read as many bytes as we can */
640                         rc = KFileReadAll ( self -> f_old, pos, scan_buffer, to_read, & num_read );
641                         if ( rc != 0 )
642                         {
643                             SYSTEM_ERROR ( rc, "failed to read old=>new map" );
644                             break;
645                         }
646 
647                         /* convert to count */
648                         num_read /= self -> id_size;
649                         if ( num_read == 0 )
650                         {
651                             rc = RC ( rcExe, rcFile, rcReading, rcTransfer, rcIncomplete );
652                             SYSTEM_ERROR ( rc, "failed to read old=>new map" );
653                             break;
654                         }
655 
656                         /* scan for zeros, and for every zero found,
657                            make entry into IdxMapping table, writing
658                            new-id back to old=>new map */
659                         switch ( self -> id_size )
660                         {
661                         case 1:
662                             for ( j = 0; j < num_read; ++ j )
663                             {
664                                 if ( ( ( const uint8_t* ) scan_buffer ) [ j ] == 0 )
665                                 {
666                                     ON_FAIL ( i = make_missing_entry ( self, ctx, missing,
667                                               i, max_missing_ids, old_id + j, ++ max_new_id ) )
668                                         break;
669                                 }
670                             }
671                             break;
672                         case 2:
673                             for ( j = 0; j < num_read; ++ j )
674                             {
675                                 if ( ( ( const uint16_t* ) scan_buffer ) [ j ] == 0 )
676                                 {
677                                     ON_FAIL ( i = make_missing_entry ( self, ctx, missing,
678                                               i, max_missing_ids, old_id + j, ++ max_new_id ) )
679                                         break;
680                                 }
681                             }
682                             break;
683                         case 3:
684                             for ( num_read *= 3, j = 0; j < num_read; j += 3 )
685                             {
686                                 if ( ( ( const uint8_t* ) scan_buffer ) [ j + 0 ] == 0 &&
687                                      ( ( const uint8_t* ) scan_buffer ) [ j + 1 ] == 0 &&
688                                      ( ( const uint8_t* ) scan_buffer ) [ j + 2 ] == 0 )
689                                 {
690                                     ON_FAIL ( i = make_missing_entry ( self, ctx, missing,
691                                               i, max_missing_ids, old_id + j / 3, ++ max_new_id ) )
692                                         break;
693                                 }
694                             }
695                             num_read /= 3;
696                             break;
697                         case 4:
698                             for ( j = 0; j < num_read; ++ j )
699                             {
700                                 if ( ( ( const uint32_t* ) scan_buffer ) [ j ] == 0 )
701                                 {
702                                     ON_FAIL ( i = make_missing_entry ( self, ctx, missing,
703                                               i, max_missing_ids, old_id + j, ++ max_new_id ) )
704                                         break;
705                                 }
706                             }
707                             break;
708                         case 8:
709                             for ( j = 0; j < num_read; ++ j )
710                             {
711                                 if ( ( ( const uint64_t* ) scan_buffer ) [ j ] == 0 )
712                                 {
713                                     ON_FAIL ( i = make_missing_entry ( self, ctx, missing,
714                                               i, max_missing_ids, old_id + j, ++ max_new_id ) )
715                                         break;
716                                 }
717                             }
718                             break;
719                         default:
720                             for ( num_read *= self -> id_size, j = 0; j < num_read; j += self -> id_size )
721                             {
722                                 new_id = 0;
723                                 memmove ( & new_id, & ( ( const uint8_t* ) scan_buffer ) [ j ], self -> id_size );
724                                 if ( new_id == 0 )
725                                 {
726                                     ON_FAIL ( i = make_missing_entry ( self, ctx, missing,
727                                               i, max_missing_ids, old_id + j / self -> id_size, ++ max_new_id ) )
728                                         break;
729                                 }
730                             }
731                             num_read /= self -> id_size;
732                         }
733                     }
734 
735                     if ( ! FAILED () && i != 0 )
736                         MapFileSetNewToOld ( self, ctx, missing, i );
737 
738                     assert ( FAILED () || self -> max_new_id == max_new_id );
739 
740                     if ( max_new_id != entry_max_new_id )
741                         first_allocated = entry_max_new_id + 1;
742 
743                     MemFree ( ctx, scan_buffer, scan_buffer_size );
744                 }
745 
746                 MemFree ( ctx, missing, sizeof * missing * max_missing_ids );
747             }
748         }
749     }
750 
751     return first_allocated;
752 }
753 
754 
755 /* ReadNewToOld
756  *  reads a bunch of new=>old mappings
757  */
758 static
MapFileReadNewToOld(const MapFile * self,const ctx_t * ctx,int64_t start_id,IdxMapping * ids,size_t max_count)759 size_t MapFileReadNewToOld ( const MapFile *self, const ctx_t *ctx,
760     int64_t start_id, IdxMapping *ids, size_t max_count )
761 {
762     FUNC_ENTRY ( ctx );
763 
764     rc_t rc;
765     size_t total = 0;
766 
767     if ( self == NULL )
768     {
769         rc = RC ( rcExe, rcFile, rcReading, rcSelf, rcNull );
770         INTERNAL_ERROR ( rc, "bad self reference" );
771     }
772     /* the new=>old mapping is read until end, but
773        this is not an error. just return no-more-data */
774     else if ( start_id == self -> first_id + self -> num_ids )
775         return 0;
776     else if ( start_id < self -> first_id || start_id > self -> first_id + self -> num_ids )
777     {
778         rc = RC ( rcExe, rcFile, rcReading, rcParam, rcInvalid );
779         INTERNAL_ERROR ( rc, "start_id ( %ld ) is not within map range ( %ld .. %ld )",
780             start_id, self -> first_id, self -> first_id + self -> num_ids - 1 );
781     }
782     else if ( max_count != 0 )
783     {
784         /* limit read to number of ids in index */
785         if ( start_id + max_count > self -> first_id + self -> num_ids )
786             max_count = ( size_t ) ( self -> first_id + self -> num_ids - start_id );
787 
788         if ( ids == NULL )
789         {
790             rc = RC ( rcExe, rcFile, rcReading, rcBuffer, rcNull );
791             INTERNAL_ERROR ( rc, "bad buffer parameter" );
792         }
793         else
794         {
795             /* read as many bytes of id as possible */
796             size_t num_read, to_read = max_count * self -> id_size;
797             uint64_t pos = ( start_id - self -> first_id ) * self -> id_size;
798             rc = KFileReadAll ( self -> f_new, pos, ids, to_read, & num_read );
799             if ( rc != 0 )
800                 SYSTEM_ERROR ( rc, "failed to read new=>old map" );
801             else
802             {
803                 size_t i;
804                 const char *packed = ( const char* ) ids;
805                 total = ( size_t ) ( num_read /self -> id_size );
806 
807                 /* read each packed id as little-endian integer onto
808                    pre-zeroed 64-bit integer, swap if needed */
809                 for ( num_read = total * self -> id_size, i = total; i > 0; )
810                 {
811                     int64_t unpacked = 0;
812 
813                     num_read -= self -> id_size;
814                     -- i;
815 
816                     memmove ( & unpacked, & packed [ num_read ], self -> id_size );
817 #if __BYTE_ORDER == __BIG_ENDIAN
818                     unpacked = bswap_64 ( unpacked );
819 #endif
820                     if ( unpacked != 0 )
821                         unpacked += self -> first_id - 1;
822 
823                     ids [ i ] . old_id = unpacked;
824                     ids [ i ] . new_id = start_id + i;
825                 }
826             }
827         }
828     }
829 
830     return total;
831 }
832 
833 
834 /* SelectOldToNewPairs
835  *  specify the range of new ids to select from
836  *  read them in old=>new order
837  */
MapFileSelectOldToNewPairs(const MapFile * self,const ctx_t * ctx,int64_t start_id,IdxMapping * ids,size_t max_count)838 size_t MapFileSelectOldToNewPairs ( const MapFile *self, const ctx_t *ctx,
839     int64_t start_id, IdxMapping *ids, size_t max_count )
840 {
841     FUNC_ENTRY ( ctx );
842 
843     uint64_t eof;
844     int64_t end_excl;
845     size_t i, j, total;
846 
847     /* limit read to number of ids in index */
848     if ( start_id + max_count > self -> first_id + self -> num_ids )
849         max_count = ( size_t ) ( self -> first_id + self -> num_ids - start_id );
850 
851     /* range is start_id to end_excl */
852     end_excl = start_id + max_count;
853 
854     /* eof for f_old */
855     eof = self -> num_ids * self -> id_size;
856 
857     for ( total = i = 0; i < max_count; total += j )
858     {
859         rc_t rc;
860         size_t off, num_read;
861         char buff [ 32 * 1024 ];
862 
863         /* read some data from file */
864         uint64_t pos = total * self -> id_size;
865         size_t to_read = sizeof buff;
866 	if ( pos == eof )
867 	    break;
868         if ( pos + to_read > eof )
869             to_read = ( size_t ) ( eof - pos );
870         rc = KFileReadAll ( self -> f_old, pos, buff, to_read, & num_read );
871         if ( rc != 0 )
872         {
873             SYSTEM_ERROR ( rc, "failed to read old=>new map" );
874             break;
875         }
876 
877         /* reached end - requested more ids than we have */
878         if ( num_read == 0 )
879             break;
880 
881         /* the number of whole ids actually read */
882         if ( ( num_read /= self -> id_size ) == 0 )
883         {
884             rc = RC ( rcExe, rcFile, rcReading, rcTransfer, rcIncomplete );
885             SYSTEM_ERROR ( rc, "failed to read old=>new map - file incomplete" );
886             break;
887         }
888 
889         /* select new-ids from ids read */
890         for ( off = j = 0; j < num_read; off += self -> id_size, ++ j )
891         {
892             int64_t unpacked = 0;
893             memmove ( & unpacked, & buff [ off ], self -> id_size );
894 #if __BYTE_ORDER == __BIG_ENDIAN
895             unpacked = bswap_64 ( unpacked );
896 #endif
897             /* only offset non-zero (NULL) ids */
898             if ( unpacked != 0 )
899                 unpacked += self -> first_id - 1;
900 
901             /* if id meets criteria */
902             if ( unpacked >= start_id && unpacked < end_excl )
903             {
904                 ids [ i ] . old_id = self -> first_id + total + j;
905                 ids [ i ] . new_id = unpacked;
906                 if ( ++ i == max_count )
907                     break;
908             }
909         }
910     }
911 
912     return i;
913 }
914 
915 
916 /* SelectOldToNewSingle
917  *  specify the range of new ids to select from
918  *  read them in old=>new order
919  */
MapFileSelectOldToNewSingle(const MapFile * self,const ctx_t * ctx,int64_t start_id,int64_t * ids,uint32_t * opt_ord,size_t max_count)920 size_t MapFileSelectOldToNewSingle ( const MapFile *self, const ctx_t *ctx,
921     int64_t start_id, int64_t *ids, uint32_t *opt_ord, size_t max_count )
922 {
923     FUNC_ENTRY ( ctx );
924 
925     uint64_t eof;
926     int64_t end_excl;
927     size_t i, j, total;
928 
929     /* limit read to number of ids in index */
930     if ( start_id + max_count > self -> first_id + self -> num_ids )
931         max_count = ( size_t ) ( self -> first_id + self -> num_ids - start_id );
932 
933     /* range is start_id to end_excl */
934     end_excl = start_id + max_count;
935 
936     /* eof for f_old */
937     eof = self -> num_ids * self -> id_size;
938 
939     for ( total = i = 0; i < max_count; total += j )
940     {
941         rc_t rc;
942         size_t off, num_read;
943         char buff [ 32 * 1024 ];
944 
945         /* read some data from file */
946         uint64_t pos = total * self -> id_size;
947         size_t to_read = sizeof buff;
948 	if ( pos == eof )
949             break;
950         if ( pos + to_read > eof )
951             to_read = ( size_t ) ( eof - pos );
952         rc = KFileReadAll ( self -> f_old, pos, buff, to_read, & num_read );
953         if ( rc != 0 )
954         {
955             SYSTEM_ERROR ( rc, "failed to read old=>new map" );
956             break;
957         }
958 
959         /* reached end - requested more ids than we have */
960         if ( num_read == 0 )
961             break;
962 
963         /* the number of whole ids actually read */
964         if ( ( num_read /= self -> id_size ) == 0 )
965         {
966             rc = RC ( rcExe, rcFile, rcReading, rcTransfer, rcIncomplete );
967             SYSTEM_ERROR ( rc, "failed to read old=>new map - file incomplete" );
968             break;
969         }
970 
971         /* select new-ids from ids read */
972         for ( off = j = 0; j < num_read; off += self -> id_size, ++ j )
973         {
974             int64_t unpacked = 0;
975             memmove ( & unpacked, & buff [ off ], self -> id_size );
976 #if __BYTE_ORDER == __BIG_ENDIAN
977             unpacked = bswap_64 ( unpacked );
978 #endif
979             /* only offset non-zero (NULL) ids */
980             if ( unpacked != 0 )
981                 unpacked += self -> first_id - 1;
982 
983             /* if id meets criteria */
984             if ( unpacked >= start_id && unpacked < end_excl )
985             {
986                 ids [ i ] = self -> first_id + total + j;
987                 assert ( i <= 0xFFFFFFFF );
988                 if ( opt_ord != NULL )
989                     opt_ord [ unpacked - start_id ] = ( uint32_t ) i;
990                 if ( ++ i == max_count )
991                     break;
992             }
993         }
994     }
995 
996     return i;
997 }
998 
999 
1000 /* MapSingleOldToNew
1001  *  reads a single old=>new mapping
1002  *  returns new id or 0 if not found
1003  *  optionally allocates a new id if "insert" is true
1004  */
MapFileMapSingleOldToNew(MapFile * self,const ctx_t * ctx,int64_t old_id,bool insert)1005 int64_t MapFileMapSingleOldToNew ( MapFile *self, const ctx_t *ctx,
1006     int64_t old_id, bool insert )
1007 {
1008     FUNC_ENTRY ( ctx );
1009 
1010     rc_t rc;
1011     int64_t new_id = 0;
1012 
1013     if ( self == NULL )
1014     {
1015         rc = RC ( rcExe, rcFile, rcReading, rcSelf, rcNull );
1016         INTERNAL_ERROR ( rc, "bad self reference" );
1017     }
1018     else if ( old_id < self -> first_id || old_id >= self -> first_id + self -> num_ids )
1019     {
1020         rc = RC ( rcExe, rcFile, rcReading, rcParam, rcInvalid );
1021         INTERNAL_ERROR ( rc, "old_id ( %ld ) is not within map range ( %ld .. %ld )",
1022             old_id, self -> first_id, self -> first_id + self -> num_ids - 1 );
1023     }
1024     else
1025     {
1026         size_t num_read, to_read = self -> id_size;
1027         uint64_t pos = ( old_id - self -> first_id ) * self -> id_size;
1028         rc = KFileReadAll ( self -> f_old, pos, & new_id, to_read, & num_read );
1029         if ( rc != 0 )
1030             SYSTEM_ERROR ( rc, "failed to read old=>new map" );
1031         else
1032         {
1033 #if __BYTE_ORDER == __BIG_ENDIAN
1034             new_id = bswap_64 ( new_id );
1035 #endif
1036             if ( new_id != 0 )
1037                 new_id += self -> first_id - 1;
1038             else if ( insert )
1039             {
1040                 /* create a mapping using the last known
1041                    new id plus one as the id to assign on insert */
1042                 IdxMapping mapping;
1043                 mapping . old_id = old_id;
1044                 mapping . new_id = self -> max_new_id + 1;
1045 
1046                 TRY ( MapFileSetOldToNew ( self, ctx, & mapping, 1 ) )
1047                 {
1048                     TRY ( MapFileSetNewToOld ( self, ctx, & mapping, 1 ) )
1049                     {
1050                         new_id = mapping . new_id;
1051 
1052                         if ( self -> num_ids >= 100000 )
1053                         {
1054                             uint64_t scaled = ++ self -> num_mapped_ids * 100;
1055                             uint64_t prior = scaled - 100;
1056                             if ( ( prior / self -> num_ids ) != ( scaled /= self -> num_ids ) )
1057                                 STATUS ( 2, "have mapped %lu%% ids", scaled );
1058                         }
1059                     }
1060                 }
1061             }
1062         }
1063     }
1064 
1065     return new_id;
1066 }
1067 
1068 
1069 /* ConsistencyCheck
1070  */
1071 static
MapFileCrossCheck(const MapFile * self,const ctx_t * ctx,IdxMapping * ids,size_t max_ids)1072 void MapFileCrossCheck ( const MapFile *self, const ctx_t *ctx, IdxMapping *ids, size_t max_ids )
1073 {
1074     FUNC_ENTRY ( ctx );
1075 
1076     rc_t rc;
1077     uint64_t total;
1078     size_t num_read;
1079     int64_t last_old, last_new;
1080 
1081     for ( last_old = last_new = -1, total = 0; total < self -> num_ids; total += num_read )
1082     {
1083         int64_t id;
1084         size_t i;
1085 
1086         STATUS ( 2, "reading new=>old ids" );
1087 
1088         ON_FAIL ( num_read = MapFileReadNewToOld ( self, ctx, self -> first_id + total, ids, max_ids ) )
1089         {
1090             ANNOTATE ( "failed to read new=>old ids" );
1091             return;
1092         }
1093 
1094         STATUS ( 2, "read %,u pairs", num_read );
1095 
1096         STATUS ( 2, "checking for repeats in new-id space" );
1097         for ( i = 0; i < num_read; last_new = id, ++ i )
1098         {
1099             id = ids [ i ] . new_id;
1100             if ( id == last_new )
1101             {
1102                 rc = RC ( rcExe, rcIndex, rcValidating, rcId, rcDuplicate );
1103                 ERROR ( rc, "new id %,ld repeats", id );
1104                 return;
1105             }
1106         }
1107 
1108         STATUS ( 2, "sorting on old id" );
1109 
1110 #if USE_OLD_KSORT
1111         ksort ( ids, num_read, sizeof ids [ 0 ], IdxMappingCmpOld, ( void* ) ctx );
1112 #else
1113         IdxMappingSortOld ( ids, ctx, num_read );
1114 #endif
1115 
1116         STATUS ( 2, "checking for repeats in old-id space" );
1117         for ( i = 0; i < num_read; last_old = id, ++ i )
1118         {
1119             id = ids [ i ] . old_id;
1120             if ( id == last_old )
1121             {
1122                 rc = RC ( rcExe, rcIndex, rcValidating, rcId, rcDuplicate );
1123                 ERROR ( rc, "old id %,ld repeats", id );
1124                 return;
1125             }
1126         }
1127 
1128         STATUS ( 2, "cross-checking against old=>new index" );
1129 
1130         for ( i = 0; i < num_read; ++ i )
1131         {
1132             int64_t new_id;
1133 
1134             /* cast away const on "self" for prototype,
1135                but function is only non-const if "insert"
1136                were true - we use "false" */
1137             ON_FAIL ( new_id = MapFileMapSingleOldToNew ( ( MapFile* ) self, ctx, ids [ i ] . old_id, false ) )
1138                 return;
1139 
1140             if ( new_id != ids [ i ] . new_id )
1141             {
1142                 rc = RC ( rcExe, rcIndex, rcValidating, rcId, rcUnequal );
1143                 ERROR ( rc, "mismatch of new ids %,ld != %,ld", new_id, ids [ i ] . new_id );
1144                 return;
1145             }
1146         }
1147     }
1148 }
1149 
MapFileConsistencyCheck(const MapFile * self,const ctx_t * ctx)1150 void MapFileConsistencyCheck ( const MapFile *self, const ctx_t *ctx )
1151 {
1152     FUNC_ENTRY ( ctx );
1153 
1154     rc_t rc;
1155     IdxMapping *ids;
1156     const size_t max_ids = 1024 * 1024;
1157 
1158     STATUS ( 2, "running consistency check on index files" );
1159 
1160     /* "self" is valid */
1161     if ( self == NULL )
1162     {
1163         ANNOTATE ( "checking the validity of a NULL MapFile" );
1164         return;
1165     }
1166 
1167     STATUS ( 2, "checking id_size" );
1168 
1169     /* number of ids requires some number of bits */
1170     if ( ( ( uint64_t ) 1 << ( self -> id_size * 8 ) ) < self -> num_ids )
1171     {
1172         rc = RC ( rcExe, rcIndex, rcValidating, rcSize, rcIncorrect );
1173         ERROR ( rc, "calculated id size ( %zu bytes ) is insufficient to represent count of %,lu ids",
1174                 self -> id_size, self -> num_ids );
1175         return;
1176     }
1177 
1178     STATUS ( 2, "allocating id buffer" );
1179 
1180     TRY ( ids = MemAlloc ( ctx, sizeof * ids * max_ids, false ) )
1181     {
1182         MapFileCrossCheck ( self, ctx, ids, max_ids );
1183         MemFree ( ctx, ids, sizeof * ids * max_ids );
1184     }
1185     CATCH_ALL ()
1186     {
1187         ANNOTATE ( "failed to allocate memory ( %zu pairs ) for consistency check", max_ids );
1188         return;
1189     }
1190 
1191     STATUS ( 2, "finished idx consistency check" );
1192 }
1193