1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #include "map-file.h"
28 #include "idx-mapping.h"
29 #include "ctx.h"
30 #include "caps.h"
31 #include "status.h"
32 #include "mem.h"
33 #include "sra-sort.h"
34
35 #include <kfs/directory.h>
36 #include <kfs/file.h>
37 #include <kfs/buffile.h>
38 #include <klib/refcount.h>
39 #include <klib/sort.h>
40 #include <klib/rc.h>
41
42 #include <string.h>
43 #include <endian.h>
44 #include <byteswap.h>
45
46 #include "except.h"
47
48 FILE_ENTRY ( map-file );
49
50
51 /*--------------------------------------------------------------------------
52 * MapFile
53 * a file for storing id mappings
54 */
struct MapFile
{
    /* first id in the mapped range; set by MapFileSetIdRange */
    int64_t first_id;
    /* total number of ids in the range */
    uint64_t num_ids;
    /* ids mapped so far via MapFileMapSingleOldToNew - progress reporting */
    uint64_t num_mapped_ids;
    /* highest new id recorded so far; 0 means writing has not begun */
    int64_t max_new_id;
    /* temp files: old=>new map, new=>old map, optional poslen column */
    KFile *f_old, *f_new, *f_pos;
    /* bytes per packed id ( 1 .. 8 ), derived from num_ids + 1 */
    size_t id_size;
    KRefcount refcount;
};
65
66
/* Whack
 *  releases all temporary files and frees the object
 */
static
void MapFileWhack ( MapFile *self, const ctx_t *ctx )
{
    FUNC_ENTRY ( ctx );
    rc_t rc = KFileRelease ( self -> f_old );
    if ( rc != 0 )
        SYSTEM_ERROR ( rc, "KFileRelease failed on old=>new" );
    else
    {
        rc = KFileRelease ( self -> f_new );
        if ( rc != 0 )
            ABORT ( rc, "KFileRelease failed on new=>old" );
        else if ( self -> f_pos != NULL )
        {
            /* poslen column exists only when made with MapFileMakeForPoslen */
            rc = KFileRelease ( self -> f_pos );
            if ( rc != 0 )
                ABORT ( rc, "KFileRelease failed on global poslen temp column" );
        }

        /* NOTE(review): if releasing f_old fails above, "self" is never
           freed - confirm this leak is acceptable during error teardown */
        MemFree ( ctx, self, sizeof * self );
    }
}
91
92
/* Make
 *  creates an id map
 */

/* MakeFork
 *  creates one buffered temporary file ( a "fork" of the map )
 *
 *  "fp" [ OUT ] - receives the buffered KFile on success
 *  "bsize" [ IN ] - size of the read/write buffer wrapped around the backing file
 *  "fork" [ IN ] - name component distinguishing the file: "old", "new" or "pos"
 *
 *  when configured ( and not on Windows ), the backing file is unlinked
 *  immediately after creation so it vanishes automatically once closed
 */
static
void MapFileMakeFork ( KFile **fp, const ctx_t *ctx, const char *name,
    KDirectory *wd, const char *tmpdir, int pid, size_t bsize, const char *fork )
{
    FUNC_ENTRY ( ctx );

    /* create temporary KFile */
    rc_t rc;
    KFile *backing;

    rc = KDirectoryCreateFile ( wd, & backing, true,
        0600, kcmInit | kcmParents, "%s/sra-sort-%s.%s.%d", tmpdir, name, fork, pid );
    if ( rc != 0 )
        SYSTEM_ERROR ( rc, "failed to create %s id map file '%s'", fork, name );
    else
    {
#if ! WINDOWS
        /* never try to remove files on Windows */
        if ( ctx -> caps -> tool -> unlink_idx_files )
        {
            /* unlink KFile */
            rc = KDirectoryRemove ( wd, false, "%s/sra-sort-%s.%s.%d", tmpdir, name, fork, pid );
            if ( rc != 0 )
                WARN ( "failed to unlink %s id map file '%s'", fork, name );
        }
#endif

        /* create a read/write buffer file */
        rc = KBufFileMakeWrite ( fp, backing, true, bsize );
        if ( rc != 0 )
            INTERNAL_ERROR ( rc, "failed to create buffer for %s id map file '%s'", fork, name );

        /* the buffer file holds its own reference to "backing" */
        KFileRelease ( backing );
    }
}
131
132 static
MapFileMakeInt(const ctx_t * ctx,const char * name,bool random,bool for_poslen)133 MapFile *MapFileMakeInt ( const ctx_t *ctx, const char *name, bool random, bool for_poslen )
134 {
135 MapFile *mf;
136 TRY ( mf = MemAlloc ( ctx, sizeof * mf, true ) )
137 {
138 /* create KDirectory */
139 KDirectory *wd;
140 rc_t rc = KDirectoryNativeDir ( & wd );
141 if ( rc != 0 )
142 SYSTEM_ERROR ( rc, "failed to create native directory" );
143 else
144 {
145 const Tool *tp = ctx -> caps -> tool;
146 size_t bsize = random ? tp -> map_file_random_bsize : tp -> map_file_bsize;
147
148 /* create old=>new id file */
149 TRY ( MapFileMakeFork ( & mf -> f_old, ctx, name, wd, tp -> tmpdir, tp -> pid, bsize, "old" ) )
150 {
151 TRY ( MapFileMakeFork ( & mf -> f_new, ctx, name, wd, tp -> tmpdir, tp -> pid, 32 * 1024, "new" ) )
152 {
153 if ( for_poslen )
154 MapFileMakeFork ( & mf -> f_pos, ctx, name, wd, tp -> tmpdir, tp -> pid, 32 * 1024, "pos" );
155
156 KDirectoryRelease ( wd );
157
158 if ( ! FAILED () )
159 {
160 /* this is our guy */
161 KRefcountInit ( & mf -> refcount, 1, "MapFile", "make", name );
162
163 return mf;
164 }
165
166 KFileRelease ( mf -> f_new );
167 }
168
169 KFileRelease ( mf -> f_old );
170 }
171
172 KDirectoryRelease ( wd );
173 }
174
175 MemFree ( ctx, mf, sizeof * mf );
176 }
177
178 return NULL;
179 }
180
MapFileMake(const ctx_t * ctx,const char * name,bool random)181 MapFile *MapFileMake ( const ctx_t *ctx, const char *name, bool random )
182 {
183 FUNC_ENTRY ( ctx );
184 return MapFileMakeInt ( ctx, name, random, false );
185 }
186
187 /* MakeForPoslen
188 * creates an id map with an additional poslen file
189 * this is specially for PRIMARY_ALIGNMENT_IDS and SECONDARY_ALIGNMENT_IDS
190 * that use *_ALIGNMENT global position and length as a sorting key
191 * we drop it to a file after its generation so that it can be
192 * picked up later when copying the corresponding alignment table.
193 */
MapFileMakeForPoslen(const ctx_t * ctx,const char * name)194 MapFile *MapFileMakeForPoslen ( const ctx_t *ctx, const char *name )
195 {
196 FUNC_ENTRY ( ctx );
197 return MapFileMakeInt ( ctx, name, false, true );
198 }
199
200
201 /* Release
202 */
MapFileRelease(const MapFile * self,const ctx_t * ctx)203 void MapFileRelease ( const MapFile *self, const ctx_t *ctx )
204 {
205 rc_t rc;
206 FUNC_ENTRY ( ctx );
207
208 if ( self != NULL )
209 {
210 switch ( KRefcountDrop ( & self -> refcount, "MapFile" ) )
211 {
212 case krefOkay:
213 break;
214 case krefWhack:
215 MapFileWhack ( ( MapFile* ) self, ctx );
216 break;
217 case krefZero:
218 case krefLimit:
219 case krefNegative:
220 rc = RC ( rcExe, rcFile, rcDestroying, rcRefcount, rcInvalid );
221 INTERNAL_ERROR ( rc, "failed to release MapFile" );
222 }
223 }
224 }
225
226 /* Duplicate
227 */
MapFileDuplicate(const MapFile * self,const ctx_t * ctx)228 MapFile *MapFileDuplicate ( const MapFile *self, const ctx_t *ctx )
229 {
230 rc_t rc;
231 FUNC_ENTRY ( ctx );
232
233 if ( self != NULL )
234 {
235 switch ( KRefcountAdd ( & self -> refcount, "MapFile" ) )
236 {
237 case krefOkay:
238 case krefWhack:
239 case krefZero:
240 break;
241 case krefLimit:
242 case krefNegative:
243 rc = RC ( rcExe, rcFile, rcAttaching, rcRefcount, rcInvalid );
244 INTERNAL_ERROR ( rc, "failed to duplicate MapFile" );
245 return NULL;
246 }
247 }
248
249 return ( MapFile* ) self;
250 }
251
252
253 /* SsetIdRange
254 * required second-stage initialization
255 * must be called before any writes occur
256 */
MapFileSetIdRange(MapFile * self,const ctx_t * ctx,int64_t first_id,uint64_t num_ids)257 void MapFileSetIdRange ( MapFile *self, const ctx_t *ctx,
258 int64_t first_id, uint64_t num_ids )
259 {
260 rc_t rc;
261 FUNC_ENTRY ( ctx );
262
263 if ( self == NULL )
264 {
265 rc = RC ( rcExe, rcFile, rcUpdating, rcSelf, rcNull );
266 INTERNAL_ERROR ( rc, "bad self" );
267 }
268 else if ( self -> max_new_id != 0 )
269 {
270 rc = RC ( rcExe, rcFile, rcUpdating, rcConstraint, rcViolated );
271 INTERNAL_ERROR ( rc, "cannot change id range after writing has begun" );
272 }
273 else
274 {
275 /* record new first id and count */
276 self -> first_id = first_id;
277 self -> num_ids = num_ids;
278
279 /* determine id size for all ids PLUS 0 for NULL */
280 for ( ++ num_ids, self -> id_size = 1; self -> id_size < 8; ++ self -> id_size )
281 {
282 if ( num_ids <= ( ( uint64_t ) 1 ) << ( self -> id_size * 8 ) )
283 break;
284 }
285 }
286 }
287
288
/* First
 *  return first mapped id
 *  NB: no NULL check - "self" must be valid; "ctx" is unused
 */
int64_t MapFileFirst ( const MapFile *self, const ctx_t *ctx )
{
    return self -> first_id;
}
296
297
/* Count
 *  return num_ids
 *  NB: no NULL check - "self" must be valid; "ctx" is unused
 */
uint64_t MapFileCount ( const MapFile *self, const ctx_t *ctx )
{
    return self -> num_ids;
}
305
306
/* SetOldToNew
 *  write old=>new id mappings
 *  each mapping is stored as an "id_size"-byte little-endian integer,
 *  1-based ( 0 is reserved to mean NULL ), at the file slot addressed
 *  by the zero-based old id
 */
void MapFileSetOldToNew ( MapFile *self, const ctx_t *ctx, const IdxMapping *ids, size_t count )
{
    rc_t rc;
    FUNC_ENTRY ( ctx );

    if ( self == NULL )
    {
        rc = RC ( rcExe, rcFile, rcWriting, rcSelf, rcNull );
        INTERNAL_ERROR ( rc, "bad self reference" );
    }
    else if ( self -> num_ids == 0 )
    {
        rc = RC ( rcExe, rcFile, rcWriting, rcRange, rcUndefined );
        INTERNAL_ERROR ( rc, "SetIdRange must be called with a non-empty range" );
    }
    else
    {
        size_t i;
        for ( i = 0; i < count; ++ i )
        {
            size_t num_writ;
            /* zero-based index */
            int64_t old_id = ids [ i ] . old_id - self -> first_id;
            /* position in bytes */
            uint64_t pos = old_id * self -> id_size;
            /* 1-based translated new-id */
            int64_t new_id = ids [ i ] . new_id - self -> first_id + 1;
            assert ( old_id >= 0 );
            assert ( new_id >= 0 );
#if __BYTE_ORDER == __BIG_ENDIAN
            /* swap so the first "id_size" bytes written below are
               the little-endian packing of the value */
            new_id = bswap_64 ( new_id );
#endif
            rc = KFileWriteAll ( self -> f_old, pos, & new_id, self -> id_size, & num_writ );
            if ( rc != 0 )
            {
                SYSTEM_ERROR ( rc, "failed to write old=>new id mapping" );
                break;
            }

            if ( num_writ != self -> id_size )
            {
                rc = RC ( rcExe, rcFile, rcWriting, rcTransfer, rcIncomplete );
                SYSTEM_ERROR ( rc, "failed to write old=>new id mapping" );
                break;
            }
        }
    }
}
358
359
/* SetNewToOld
 *  write new=>old id mappings
 *  layout mirrors SetOldToNew with roles swapped: an "id_size"-byte
 *  little-endian, 1-based old id stored at the slot of the new id.
 *  always advances "max_new_id", the high-water mark of written new ids.
 */
void MapFileSetNewToOld ( MapFile *self, const ctx_t *ctx, const IdxMapping *ids, size_t count )
{
    rc_t rc;
    FUNC_ENTRY ( ctx );

    if ( self == NULL )
    {
        rc = RC ( rcExe, rcFile, rcWriting, rcSelf, rcNull );
        INTERNAL_ERROR ( rc, "bad self reference" );
    }
    else if ( self -> num_ids == 0 )
    {
        rc = RC ( rcExe, rcFile, rcWriting, rcRange, rcUndefined );
        INTERNAL_ERROR ( rc, "SetIdRange must be called with a non-empty range" );
    }
    else
    {
        /* when configured to skip the new=>old file, only track the
           high-water mark. NOTE(review): assumes this batch covers new
           ids first_id .. first_id + count - 1 - confirm with callers */
        if ( ! ctx -> caps -> tool -> write_new_to_old )
            self -> max_new_id = self -> first_id + count - 1;
        else
        {
            size_t i;
            for ( i = 0; i < count; ++ i )
            {
                size_t num_writ;
                /* zero based index */
                int64_t new_id = ids [ i ] . new_id - self -> first_id;
                /* zero based byte offset */
                uint64_t pos = new_id * self -> id_size;
                /* 1-based translated old-id */
                int64_t old_id = ids [ i ] . old_id - self -> first_id + 1;
                assert ( new_id >= 0 );
                assert ( old_id >= 0 );
#if __BYTE_ORDER == __BIG_ENDIAN
                /* store little-endian on disk */
                old_id = bswap_64 ( old_id );
#endif
                rc = KFileWriteAll ( self -> f_new, pos, & old_id, self -> id_size, & num_writ );
                if ( rc != 0 )
                {
                    SYSTEM_ERROR ( rc, "failed to write new=>old id mapping" );
                    break;
                }

                if ( num_writ != self -> id_size )
                {
                    rc = RC ( rcExe, rcFile, rcWriting, rcTransfer, rcIncomplete );
                    SYSTEM_ERROR ( rc, "failed to write new=>old id mapping" );
                    break;
                }

                /* record the most recent new id actually written */
                self -> max_new_id = new_id + self -> first_id;
            }
        }
    }
}
418
419
/* SetPoslen
 *  write global position/length in new-id order
 *  here each IdxMapping "new_id" field carries a packed poslen value;
 *  entries are appended as full 8-byte values ( independent of id_size )
 *  starting right after the last new id recorded in "max_new_id"
 */
void MapFileSetPoslen ( MapFile *self, const ctx_t *ctx, const IdxMapping *ids, size_t count )
{
    rc_t rc;
    FUNC_ENTRY ( ctx );

    if ( self == NULL )
    {
        rc = RC ( rcExe, rcFile, rcWriting, rcSelf, rcNull );
        INTERNAL_ERROR ( rc, "bad self reference" );
    }
    else if ( self -> f_pos == NULL )
    {
        rc = RC ( rcExe, rcFile, rcWriting, rcFile, rcIncorrect );
        INTERNAL_ERROR ( rc, "MapFile must be created with MapFileMakeForPoslen" );
    }
    else if ( self -> num_ids == 0 )
    {
        rc = RC ( rcExe, rcFile, rcWriting, rcRange, rcUndefined );
        INTERNAL_ERROR ( rc, "SetIdRange must be called with a non-empty range" );
    }
    else
    {
        /* start writing after the last new id recorded */
        uint64_t pos = ( self -> max_new_id - self -> first_id + 1 ) * sizeof ids -> new_id;

        size_t i;
        for ( i = 0; i < count; pos += sizeof ids -> new_id, ++ i )
        {
            size_t num_writ;
            int64_t poslen = ids [ i ] . new_id;
#if __BYTE_ORDER == __BIG_ENDIAN
            /* store little-endian on disk */
            poslen = bswap_64 ( poslen );
#endif
            rc = KFileWriteAll ( self -> f_pos, pos, & poslen, sizeof ids -> new_id, & num_writ );
            if ( rc != 0 )
            {
                SYSTEM_ERROR ( rc, "failed to write poslen temporary column" );
                break;
            }

            if ( num_writ != sizeof ids -> new_id )
            {
                rc = RC ( rcExe, rcFile, rcWriting, rcTransfer, rcIncomplete );
                SYSTEM_ERROR ( rc, "failed to write poslen temporary column" );
                break;
            }
        }
    }
}
472
473
474 /* ReadPoslen
475 * read global position/length in new-id order
476 * starts reading at the indicated row
477 * returns the number read
478 */
MapFileReadPoslen(const MapFile * self,const ctx_t * ctx,int64_t start_id,uint64_t * poslen,size_t max_count)479 size_t MapFileReadPoslen ( const MapFile *self, const ctx_t *ctx,
480 int64_t start_id, uint64_t *poslen, size_t max_count )
481 {
482 FUNC_ENTRY ( ctx );
483
484 rc_t rc;
485 size_t total = 0;
486
487 if ( self == NULL )
488 {
489 rc = RC ( rcExe, rcFile, rcReading, rcSelf, rcNull );
490 INTERNAL_ERROR ( rc, "bad self reference" );
491 }
492 else if ( self -> f_pos == NULL )
493 {
494 rc = RC ( rcExe, rcFile, rcReading, rcFile, rcIncorrect );
495 INTERNAL_ERROR ( rc, "MapFile must be created with MapFileMakeForPoslen" );
496 }
497 /* the column is read until end, but
498 this is not an error. just return no-more-data */
499 else if ( start_id == self -> first_id + self -> num_ids )
500 return 0;
501 else if ( start_id < self -> first_id || start_id > self -> first_id + self -> num_ids )
502 {
503 rc = RC ( rcExe, rcFile, rcReading, rcParam, rcInvalid );
504 INTERNAL_ERROR ( rc, "start_id ( %ld ) is not within column range ( %ld .. %ld )",
505 start_id, self -> first_id, self -> first_id + self -> num_ids - 1 );
506 }
507 else if ( max_count != 0 )
508 {
509 /* limit read to number of ids in index */
510 if ( start_id + max_count > self -> first_id + self -> num_ids )
511 max_count = ( size_t ) ( self -> first_id + self -> num_ids - start_id );
512
513 if ( poslen == NULL )
514 {
515 rc = RC ( rcExe, rcFile, rcReading, rcBuffer, rcNull );
516 INTERNAL_ERROR ( rc, "bad buffer parameter" );
517 }
518 else
519 {
520 /* read as many bytes of id as possible */
521 size_t num_read, to_read = max_count * sizeof * poslen;
522 uint64_t pos = ( start_id - self -> first_id ) * sizeof * poslen;
523 rc = KFileReadAll ( self -> f_pos, pos, poslen, to_read, & num_read );
524 if ( rc != 0 )
525 SYSTEM_ERROR ( rc, "failed to read new=>old map" );
526 else
527 {
528 total = ( size_t ) ( num_read / sizeof * poslen );
529
530 #if __BYTE_ORDER == __BIG_ENDIAN
531 {
532 /* revert byte-order */
533 size_t i;
534 for ( i = 0; i < total; ++ i )
535 poslen [ i ] = bswap_64 ( poslen [ i ] );
536 }
537 #endif
538 }
539 }
540 }
541
542 return total;
543 }
544
545
546 /* AllocMissingNewIds
547 * it is possible to have written fewer new=>old mappings
548 * than are supposed to be there, e.g. SEQUENCE that gets
549 * initially written by align tables. unaligned sequences
550 * need to fill in the remainder.
551 */
/* make_missing_entry
 *  append one ( old_id, new_id ) pair to the "missing" batch buffer,
 *  flushing the batch to the new=>old map whenever it becomes full,
 *  and record the old=>new side immediately
 *
 *  "i" [ IN ] - current number of entries in "missing"
 *
 *  returns the new entry count; if the flush fails, returns the
 *  unflushed count with the error recorded in "ctx" ( the caller
 *  detects that via ON_FAIL )
 */
static
size_t make_missing_entry ( MapFile *self, const ctx_t *ctx,
    IdxMapping *missing, size_t i, size_t max_missing_ids,
    int64_t old_id, int64_t new_id )
{
    /* batch buffer full - flush it to the new=>old map first */
    if ( i == max_missing_ids )
    {
        ON_FAIL ( MapFileSetNewToOld ( self, ctx, missing, i ) )
            return i;

        i = 0;
    }

    /* insert into missing id map */
    missing [ i ] . old_id = old_id;
    missing [ i ] . new_id = new_id;

    /* update old=>new mapping */
    MapFileSetOldToNew ( self, ctx, & missing [ i ], 1 );

    return i + 1;
}
574
/* AllocMissingNewIds
 *  scans the old=>new map for zero ( NULL ) slots and assigns each a
 *  fresh new id just past the current "max_new_id", updating both
 *  the old=>new and new=>old maps
 *
 *  returns the first newly allocated id, or 0 if none were needed
 */
int64_t MapFileAllocMissingNewIds ( MapFile *self, const ctx_t *ctx )
{
    FUNC_ENTRY ( ctx );

    rc_t rc;
    int64_t first_allocated = 0;

    if ( self == NULL )
    {
        rc = RC ( rcExe, rcFile, rcUpdating, rcSelf, rcNull );
        INTERNAL_ERROR ( rc, "bad self reference" );
    }
    else if ( self -> num_ids == 0 )
    {
        rc = RC ( rcExe, rcFile, rcWriting, rcRange, rcUndefined );
        INTERNAL_ERROR ( rc, "SetIdRange must be called with a non-empty range" );
    }
    else
    {
        /* ids not yet assigned = total minus those already mapped */
        uint64_t num_missing_ids = self -> num_ids - ( self -> max_new_id - self -> first_id + 1 );
        if ( num_missing_ids != 0 )
        {
            IdxMapping *missing;

            /* determine maximum missing id mappings */
            size_t max_missing_ids = ctx -> caps -> tool -> max_missing_ids;
            if ( ( uint64_t ) max_missing_ids > num_missing_ids )
                max_missing_ids = ( size_t ) num_missing_ids;

            /* allocate a buffer of IdxMapping */
            TRY ( missing = MemAlloc ( ctx, sizeof * missing * max_missing_ids, false ) )
            {
                /* use a scan buffer somewhere around 32K,
                   to keep page-file cache focused on a few pages */
                void *scan_buffer;
                const size_t scan_buffer_size = 32 * 1024;
                TRY ( scan_buffer = MemAlloc ( ctx, scan_buffer_size, false ) )
                {
                    size_t i, num_read;
                    int64_t new_id, old_id = self -> first_id;
                    uint64_t eof = self -> num_ids * self -> id_size;

                    /* this guy will be used to assign new ids */
                    int64_t max_new_id = self -> max_new_id;
                    int64_t entry_max_new_id = self -> max_new_id;


                    /* in a loop, read as many ids as possible into scan buffer
                       the number is dependent upon self->id_size */
                    for ( i = 0; ; old_id += num_read )
                    {
                        size_t j, to_read;

                        /* determine position from old_id */
                        uint64_t pos = ( old_id - self -> first_id ) * self -> id_size;
                        if ( pos == eof )
                            break;

                        /* determine bytes to read - this must be limited
                           or we risk tricking page-buffer into resizing */
                        to_read = scan_buffer_size;
                        if ( pos + to_read > eof )
                            to_read = ( size_t ) ( eof - pos );

                        /* read as many bytes as we can */
                        rc = KFileReadAll ( self -> f_old, pos, scan_buffer, to_read, & num_read );
                        if ( rc != 0 )
                        {
                            SYSTEM_ERROR ( rc, "failed to read old=>new map" );
                            break;
                        }

                        /* convert to count */
                        num_read /= self -> id_size;
                        if ( num_read == 0 )
                        {
                            rc = RC ( rcExe, rcFile, rcReading, rcTransfer, rcIncomplete );
                            SYSTEM_ERROR ( rc, "failed to read old=>new map" );
                            break;
                        }

                        /* scan for zeros, and for every zero found,
                           make entry into IdxMapping table, writing
                           new-id back to old=>new map
                           specialized loops per id_size avoid a
                           byte-assembly step for the common widths */
                        switch ( self -> id_size )
                        {
                        case 1:
                            for ( j = 0; j < num_read; ++ j )
                            {
                                if ( ( ( const uint8_t* ) scan_buffer ) [ j ] == 0 )
                                {
                                    ON_FAIL ( i = make_missing_entry ( self, ctx, missing,
                                                  i, max_missing_ids, old_id + j, ++ max_new_id ) )
                                        break;
                                }
                            }
                            break;
                        case 2:
                            for ( j = 0; j < num_read; ++ j )
                            {
                                if ( ( ( const uint16_t* ) scan_buffer ) [ j ] == 0 )
                                {
                                    ON_FAIL ( i = make_missing_entry ( self, ctx, missing,
                                                  i, max_missing_ids, old_id + j, ++ max_new_id ) )
                                        break;
                                }
                            }
                            break;
                        case 3:
                            /* 3-byte ids: compare bytes individually */
                            for ( num_read *= 3, j = 0; j < num_read; j += 3 )
                            {
                                if ( ( ( const uint8_t* ) scan_buffer ) [ j + 0 ] == 0 &&
                                     ( ( const uint8_t* ) scan_buffer ) [ j + 1 ] == 0 &&
                                     ( ( const uint8_t* ) scan_buffer ) [ j + 2 ] == 0 )
                                {
                                    ON_FAIL ( i = make_missing_entry ( self, ctx, missing,
                                                  i, max_missing_ids, old_id + j / 3, ++ max_new_id ) )
                                        break;
                                }
                            }
                            num_read /= 3;
                            break;
                        case 4:
                            for ( j = 0; j < num_read; ++ j )
                            {
                                if ( ( ( const uint32_t* ) scan_buffer ) [ j ] == 0 )
                                {
                                    ON_FAIL ( i = make_missing_entry ( self, ctx, missing,
                                                  i, max_missing_ids, old_id + j, ++ max_new_id ) )
                                        break;
                                }
                            }
                            break;
                        case 8:
                            for ( j = 0; j < num_read; ++ j )
                            {
                                if ( ( ( const uint64_t* ) scan_buffer ) [ j ] == 0 )
                                {
                                    ON_FAIL ( i = make_missing_entry ( self, ctx, missing,
                                                  i, max_missing_ids, old_id + j, ++ max_new_id ) )
                                        break;
                                }
                            }
                            break;
                        default:
                            /* odd widths ( 5, 6, 7 ): assemble each id into
                               a zeroed 64-bit value before testing */
                            for ( num_read *= self -> id_size, j = 0; j < num_read; j += self -> id_size )
                            {
                                new_id = 0;
                                memmove ( & new_id, & ( ( const uint8_t* ) scan_buffer ) [ j ], self -> id_size );
                                if ( new_id == 0 )
                                {
                                    ON_FAIL ( i = make_missing_entry ( self, ctx, missing,
                                                  i, max_missing_ids, old_id + j / self -> id_size, ++ max_new_id ) )
                                        break;
                                }
                            }
                            num_read /= self -> id_size;
                        }
                    }

                    /* flush the final partial batch */
                    if ( ! FAILED () && i != 0 )
                        MapFileSetNewToOld ( self, ctx, missing, i );

                    assert ( FAILED () || self -> max_new_id == max_new_id );

                    /* anything allocated? first new id is one past the entry value */
                    if ( max_new_id != entry_max_new_id )
                        first_allocated = entry_max_new_id + 1;

                    MemFree ( ctx, scan_buffer, scan_buffer_size );
                }

                MemFree ( ctx, missing, sizeof * missing * max_missing_ids );
            }
        }
    }

    return first_allocated;
}
753
754
/* ReadNewToOld
 *  reads a bunch of new=>old mappings
 *  fills "ids" with ( old_id, new_id ) pairs for rows starting at
 *  "start_id" and returns the number of pairs produced
 */
static
size_t MapFileReadNewToOld ( const MapFile *self, const ctx_t *ctx,
    int64_t start_id, IdxMapping *ids, size_t max_count )
{
    FUNC_ENTRY ( ctx );

    rc_t rc;
    size_t total = 0;

    if ( self == NULL )
    {
        rc = RC ( rcExe, rcFile, rcReading, rcSelf, rcNull );
        INTERNAL_ERROR ( rc, "bad self reference" );
    }
    /* the new=>old mapping is read until end, but
       this is not an error. just return no-more-data */
    else if ( start_id == self -> first_id + self -> num_ids )
        return 0;
    else if ( start_id < self -> first_id || start_id > self -> first_id + self -> num_ids )
    {
        rc = RC ( rcExe, rcFile, rcReading, rcParam, rcInvalid );
        INTERNAL_ERROR ( rc, "start_id ( %ld ) is not within map range ( %ld .. %ld )",
                         start_id, self -> first_id, self -> first_id + self -> num_ids - 1 );
    }
    else if ( max_count != 0 )
    {
        /* limit read to number of ids in index */
        if ( start_id + max_count > self -> first_id + self -> num_ids )
            max_count = ( size_t ) ( self -> first_id + self -> num_ids - start_id );

        if ( ids == NULL )
        {
            rc = RC ( rcExe, rcFile, rcReading, rcBuffer, rcNull );
            INTERNAL_ERROR ( rc, "bad buffer parameter" );
        }
        else
        {
            /* read as many bytes of id as possible
               the packed ids are read directly into "ids", then
               expanded in place below */
            size_t num_read, to_read = max_count * self -> id_size;
            uint64_t pos = ( start_id - self -> first_id ) * self -> id_size;
            rc = KFileReadAll ( self -> f_new, pos, ids, to_read, & num_read );
            if ( rc != 0 )
                SYSTEM_ERROR ( rc, "failed to read new=>old map" );
            else
            {
                size_t i;
                const char *packed = ( const char* ) ids;
                total = ( size_t ) ( num_read /self -> id_size );

                /* read each packed id as little-endian integer onto
                   pre-zeroed 64-bit integer, swap if needed
                   walk backwards so the expanded IdxMapping entries
                   never overwrite packed bytes not yet consumed */
                for ( num_read = total * self -> id_size, i = total; i > 0; )
                {
                    int64_t unpacked = 0;

                    num_read -= self -> id_size;
                    -- i;

                    memmove ( & unpacked, & packed [ num_read ], self -> id_size );
#if __BYTE_ORDER == __BIG_ENDIAN
                    unpacked = bswap_64 ( unpacked );
#endif
                    /* 0 means NULL; otherwise convert from 1-based to actual id */
                    if ( unpacked != 0 )
                        unpacked += self -> first_id - 1;

                    ids [ i ] . old_id = unpacked;
                    ids [ i ] . new_id = start_id + i;
                }
            }
        }
    }

    return total;
}
832
833
/* SelectOldToNewPairs
 *  specify the range of new ids to select from
 *  read them in old=>new order
 *
 *  scans the entire old=>new map, emitting ( old_id, new_id ) pairs
 *  whose new id falls within [ start_id, start_id + max_count );
 *  returns the number of pairs selected.
 *  NOTE(review): "self" is not NULL-checked here, unlike siblings -
 *  callers are presumed to pass a valid MapFile; confirm
 */
size_t MapFileSelectOldToNewPairs ( const MapFile *self, const ctx_t *ctx,
    int64_t start_id, IdxMapping *ids, size_t max_count )
{
    FUNC_ENTRY ( ctx );

    uint64_t eof;
    int64_t end_excl;
    size_t i, j, total;

    /* limit read to number of ids in index */
    if ( start_id + max_count > self -> first_id + self -> num_ids )
        max_count = ( size_t ) ( self -> first_id + self -> num_ids - start_id );

    /* range is start_id to end_excl */
    end_excl = start_id + max_count;

    /* eof for f_old */
    eof = self -> num_ids * self -> id_size;

    for ( total = i = 0; i < max_count; total += j )
    {
        rc_t rc;
        size_t off, num_read;
        char buff [ 32 * 1024 ];

        /* read some data from file */
        uint64_t pos = total * self -> id_size;
        size_t to_read = sizeof buff;
        if ( pos == eof )
            break;
        if ( pos + to_read > eof )
            to_read = ( size_t ) ( eof - pos );
        rc = KFileReadAll ( self -> f_old, pos, buff, to_read, & num_read );
        if ( rc != 0 )
        {
            SYSTEM_ERROR ( rc, "failed to read old=>new map" );
            break;
        }

        /* reached end - requested more ids than we have */
        if ( num_read == 0 )
            break;

        /* the number of whole ids actually read */
        if ( ( num_read /= self -> id_size ) == 0 )
        {
            rc = RC ( rcExe, rcFile, rcReading, rcTransfer, rcIncomplete );
            SYSTEM_ERROR ( rc, "failed to read old=>new map - file incomplete" );
            break;
        }

        /* select new-ids from ids read */
        for ( off = j = 0; j < num_read; off += self -> id_size, ++ j )
        {
            /* assemble packed little-endian id onto zeroed 64-bit value */
            int64_t unpacked = 0;
            memmove ( & unpacked, & buff [ off ], self -> id_size );
#if __BYTE_ORDER == __BIG_ENDIAN
            unpacked = bswap_64 ( unpacked );
#endif
            /* only offset non-zero (NULL) ids */
            if ( unpacked != 0 )
                unpacked += self -> first_id - 1;

            /* if id meets criteria */
            if ( unpacked >= start_id && unpacked < end_excl )
            {
                /* old id reconstructed from scan position */
                ids [ i ] . old_id = self -> first_id + total + j;
                ids [ i ] . new_id = unpacked;
                if ( ++ i == max_count )
                    break;
            }
        }
    }

    return i;
}
914
915
/* SelectOldToNewSingle
 *  specify the range of new ids to select from
 *  read them in old=>new order
 *
 *  like SelectOldToNewPairs, but stores only the old ids into "ids";
 *  when "opt_ord" is given, it receives at index ( new_id - start_id )
 *  the position within "ids" holding the corresponding old id.
 *  NOTE(review): "self" is not NULL-checked here, unlike siblings -
 *  callers are presumed to pass a valid MapFile; confirm
 */
size_t MapFileSelectOldToNewSingle ( const MapFile *self, const ctx_t *ctx,
    int64_t start_id, int64_t *ids, uint32_t *opt_ord, size_t max_count )
{
    FUNC_ENTRY ( ctx );

    uint64_t eof;
    int64_t end_excl;
    size_t i, j, total;

    /* limit read to number of ids in index */
    if ( start_id + max_count > self -> first_id + self -> num_ids )
        max_count = ( size_t ) ( self -> first_id + self -> num_ids - start_id );

    /* range is start_id to end_excl */
    end_excl = start_id + max_count;

    /* eof for f_old */
    eof = self -> num_ids * self -> id_size;

    for ( total = i = 0; i < max_count; total += j )
    {
        rc_t rc;
        size_t off, num_read;
        char buff [ 32 * 1024 ];

        /* read some data from file */
        uint64_t pos = total * self -> id_size;
        size_t to_read = sizeof buff;
        if ( pos == eof )
            break;
        if ( pos + to_read > eof )
            to_read = ( size_t ) ( eof - pos );
        rc = KFileReadAll ( self -> f_old, pos, buff, to_read, & num_read );
        if ( rc != 0 )
        {
            SYSTEM_ERROR ( rc, "failed to read old=>new map" );
            break;
        }

        /* reached end - requested more ids than we have */
        if ( num_read == 0 )
            break;

        /* the number of whole ids actually read */
        if ( ( num_read /= self -> id_size ) == 0 )
        {
            rc = RC ( rcExe, rcFile, rcReading, rcTransfer, rcIncomplete );
            SYSTEM_ERROR ( rc, "failed to read old=>new map - file incomplete" );
            break;
        }

        /* select new-ids from ids read */
        for ( off = j = 0; j < num_read; off += self -> id_size, ++ j )
        {
            /* assemble packed little-endian id onto zeroed 64-bit value */
            int64_t unpacked = 0;
            memmove ( & unpacked, & buff [ off ], self -> id_size );
#if __BYTE_ORDER == __BIG_ENDIAN
            unpacked = bswap_64 ( unpacked );
#endif
            /* only offset non-zero (NULL) ids */
            if ( unpacked != 0 )
                unpacked += self -> first_id - 1;

            /* if id meets criteria */
            if ( unpacked >= start_id && unpacked < end_excl )
            {
                /* old id reconstructed from scan position */
                ids [ i ] = self -> first_id + total + j;
                assert ( i <= 0xFFFFFFFF );
                if ( opt_ord != NULL )
                    opt_ord [ unpacked - start_id ] = ( uint32_t ) i;
                if ( ++ i == max_count )
                    break;
            }
        }
    }

    return i;
}
998
999
/* MapSingleOldToNew
 *  reads a single old=>new mapping
 *  returns new id or 0 if not found
 *  optionally allocates a new id if "insert" is true
 *  ( the allocated id is max_new_id + 1, and both maps are updated )
 */
int64_t MapFileMapSingleOldToNew ( MapFile *self, const ctx_t *ctx,
    int64_t old_id, bool insert )
{
    FUNC_ENTRY ( ctx );

    rc_t rc;
    int64_t new_id = 0;

    if ( self == NULL )
    {
        rc = RC ( rcExe, rcFile, rcReading, rcSelf, rcNull );
        INTERNAL_ERROR ( rc, "bad self reference" );
    }
    else if ( old_id < self -> first_id || old_id >= self -> first_id + self -> num_ids )
    {
        rc = RC ( rcExe, rcFile, rcReading, rcParam, rcInvalid );
        INTERNAL_ERROR ( rc, "old_id ( %ld ) is not within map range ( %ld .. %ld )",
                         old_id, self -> first_id, self -> first_id + self -> num_ids - 1 );
    }
    else
    {
        /* read the packed id into the low bytes of the
           pre-zeroed 64-bit "new_id" */
        size_t num_read, to_read = self -> id_size;
        uint64_t pos = ( old_id - self -> first_id ) * self -> id_size;
        rc = KFileReadAll ( self -> f_old, pos, & new_id, to_read, & num_read );
        if ( rc != 0 )
            SYSTEM_ERROR ( rc, "failed to read old=>new map" );
        else
        {
#if __BYTE_ORDER == __BIG_ENDIAN
            new_id = bswap_64 ( new_id );
#endif
            /* 0 means NULL; otherwise convert from 1-based to actual id */
            if ( new_id != 0 )
                new_id += self -> first_id - 1;
            else if ( insert )
            {
                /* create a mapping using the last known
                   new id plus one as the id to assign on insert */
                IdxMapping mapping;
                mapping . old_id = old_id;
                mapping . new_id = self -> max_new_id + 1;

                TRY ( MapFileSetOldToNew ( self, ctx, & mapping, 1 ) )
                {
                    TRY ( MapFileSetNewToOld ( self, ctx, & mapping, 1 ) )
                    {
                        new_id = mapping . new_id;

                        /* report progress in whole percent steps,
                           only for reasonably large maps */
                        if ( self -> num_ids >= 100000 )
                        {
                            uint64_t scaled = ++ self -> num_mapped_ids * 100;
                            uint64_t prior = scaled - 100;
                            if ( ( prior / self -> num_ids ) != ( scaled /= self -> num_ids ) )
                                STATUS ( 2, "have mapped %lu%% ids", scaled );
                        }
                    }
                }
            }
        }
    }

    return new_id;
}
1067
1068
/* ConsistencyCheck
 */

/* CrossCheck
 *  walks the new=>old map in batches, verifying that
 *  ( 1 ) no new id repeats, ( 2 ) no old id repeats, and
 *  ( 3 ) the old=>new map agrees with each new=>old pair
 *
 *  "ids" [ IN ] - scratch buffer of "max_ids" IdxMapping entries
 */
static
void MapFileCrossCheck ( const MapFile *self, const ctx_t *ctx, IdxMapping *ids, size_t max_ids )
{
    FUNC_ENTRY ( ctx );

    rc_t rc;
    uint64_t total;
    size_t num_read;
    int64_t last_old, last_new;

    for ( last_old = last_new = -1, total = 0; total < self -> num_ids; total += num_read )
    {
        int64_t id;
        size_t i;

        STATUS ( 2, "reading new=>old ids" );

        ON_FAIL ( num_read = MapFileReadNewToOld ( self, ctx, self -> first_id + total, ids, max_ids ) )
        {
            ANNOTATE ( "failed to read new=>old ids" );
            return;
        }

        STATUS ( 2, "read %,u pairs", num_read );

        /* pairs arrive sorted by new id, so repeats are adjacent
           ( "last_new" carries across batch boundaries ) */
        STATUS ( 2, "checking for repeats in new-id space" );
        for ( i = 0; i < num_read; last_new = id, ++ i )
        {
            id = ids [ i ] . new_id;
            if ( id == last_new )
            {
                rc = RC ( rcExe, rcIndex, rcValidating, rcId, rcDuplicate );
                ERROR ( rc, "new id %,ld repeats", id );
                return;
            }
        }

        STATUS ( 2, "sorting on old id" );

#if USE_OLD_KSORT
        ksort ( ids, num_read, sizeof ids [ 0 ], IdxMappingCmpOld, ( void* ) ctx );
#else
        IdxMappingSortOld ( ids, ctx, num_read );
#endif

        /* after sorting within the batch, old-id repeats are adjacent.
           NOTE(review): this only detects old-id repeats within one
           batch ( and across the boundary via "last_old" ) - confirm
           that is sufficient for the expected data */
        STATUS ( 2, "checking for repeats in old-id space" );
        for ( i = 0; i < num_read; last_old = id, ++ i )
        {
            id = ids [ i ] . old_id;
            if ( id == last_old )
            {
                rc = RC ( rcExe, rcIndex, rcValidating, rcId, rcDuplicate );
                ERROR ( rc, "old id %,ld repeats", id );
                return;
            }
        }

        STATUS ( 2, "cross-checking against old=>new index" );

        for ( i = 0; i < num_read; ++ i )
        {
            int64_t new_id;

            /* cast away const on "self" for prototype,
               but function is only non-const if "insert"
               were true - we use "false" */
            ON_FAIL ( new_id = MapFileMapSingleOldToNew ( ( MapFile* ) self, ctx, ids [ i ] . old_id, false ) )
                return;

            if ( new_id != ids [ i ] . new_id )
            {
                rc = RC ( rcExe, rcIndex, rcValidating, rcId, rcUnequal );
                ERROR ( rc, "mismatch of new ids %,ld != %,ld", new_id, ids [ i ] . new_id );
                return;
            }
        }
    }
}
1149
MapFileConsistencyCheck(const MapFile * self,const ctx_t * ctx)1150 void MapFileConsistencyCheck ( const MapFile *self, const ctx_t *ctx )
1151 {
1152 FUNC_ENTRY ( ctx );
1153
1154 rc_t rc;
1155 IdxMapping *ids;
1156 const size_t max_ids = 1024 * 1024;
1157
1158 STATUS ( 2, "running consistency check on index files" );
1159
1160 /* "self" is valid */
1161 if ( self == NULL )
1162 {
1163 ANNOTATE ( "checking the validity of a NULL MapFile" );
1164 return;
1165 }
1166
1167 STATUS ( 2, "checking id_size" );
1168
1169 /* number of ids requires some number of bits */
1170 if ( ( ( uint64_t ) 1 << ( self -> id_size * 8 ) ) < self -> num_ids )
1171 {
1172 rc = RC ( rcExe, rcIndex, rcValidating, rcSize, rcIncorrect );
1173 ERROR ( rc, "calculated id size ( %zu bytes ) is insufficient to represent count of %,lu ids",
1174 self -> id_size, self -> num_ids );
1175 return;
1176 }
1177
1178 STATUS ( 2, "allocating id buffer" );
1179
1180 TRY ( ids = MemAlloc ( ctx, sizeof * ids * max_ids, false ) )
1181 {
1182 MapFileCrossCheck ( self, ctx, ids, max_ids );
1183 MemFree ( ctx, ids, sizeof * ids * max_ids );
1184 }
1185 CATCH_ALL ()
1186 {
1187 ANNOTATE ( "failed to allocate memory ( %zu pairs ) for consistency check", max_ids );
1188 return;
1189 }
1190
1191 STATUS ( 2, "finished idx consistency check" );
1192 }
1193