1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 
27 #include "pl-tools.h"
28 #include <klib/printf.h>
29 #include <sysalloc.h>
30 #include <stdlib.h>
31 #include <stdio.h>
32 #include <string.h>
33 
34 #include <kdb/database.h>
35 #include <vdb/database.h>
36 #include <vdb/vdb-priv.h>
37 
/* Puts a loader-context into its initial state:
   no logger, no progress-bar, all boolean fields false
   and both sequence-totals at zero. */
void lctx_init( ld_context * lctx )
{
    /* running totals */
    lctx->total_seq_spots = 0;
    lctx->total_seq_bases = 0;

    /* boolean fields */
    lctx->check_src_obj = false;
    lctx->cache_content = false;
    lctx->total_printed = false;
    lctx->with_progress = false;

    /* the two objects that lctx_free() releases */
    lctx->xml_progress = NULL;
    lctx->xml_logger = NULL;
}
49 
50 
/* Releases the xml-logger and the progress-bar of a loader-context
   ( if present ) and clears both pointers, so a second call is harmless. */
void lctx_free( ld_context * lctx )
{
    if ( lctx->xml_logger )
    {
        XMLLogger_Release( lctx->xml_logger );
        lctx->xml_logger = NULL;
    }

    if ( lctx->xml_progress )
    {
        KLoadProgressbar_Release( lctx->xml_progress, false );
        lctx->xml_progress = NULL;
    }
}
64 
65 
check_src_objects(const KDirectory * hdf5_dir,const char ** groups,const char ** tables,bool show_not_found)66 rc_t check_src_objects( const KDirectory *hdf5_dir,
67                         const char ** groups,
68                         const char **tables,
69                         bool show_not_found )
70 {
71     rc_t rc = 0;
72     uint16_t idx = 0;
73     uint32_t pt;
74 
75     if ( groups != NULL )
76     {
77         while ( groups[ idx ] != NULL && rc == 0 )
78         {
79             pt = KDirectoryPathType ( hdf5_dir, "%s", groups[idx] );
80             if ( pt != kptDir )
81             {
82                 rc = RC( rcExe, rcNoTarg, rcAllocating, rcParam, rcInvalid );
83                 if ( show_not_found )
84                     PLOGERR( klogErr, ( klogErr, rc, "hdf5-group '$(grp)' not found",
85                                     "grp=%s", groups[ idx ] ) );
86                 else
87                     PLOGERR( klogWarn, ( klogWarn, rc, "hdf5-group '$(grp)' not found",
88                                     "grp=%s", groups[ idx ] ) );
89             }
90             else
91                 idx++;
92         }
93     }
94 
95     idx = 0;
96     if ( tables != NULL && rc == 0 )
97     {
98         while ( tables[ idx ] != NULL && rc == 0 )
99         {
100             pt = KDirectoryPathType ( hdf5_dir, "%s", tables[idx] );
101             if ( pt != kptDataset )
102             {
103                 rc = RC( rcExe, rcNoTarg, rcAllocating, rcParam, rcInvalid );
104                 if ( show_not_found )
105                     PLOGERR( klogErr, ( klogErr, rc, "hdf5-table '$(tbl)' not found",
106                                     "tbl=%s", tables[ idx ] ) );
107                 else
108                     PLOGERR( klogWarn, ( klogWarn, rc, "hdf5-table '$(tbl)' not found",
109                                     "tbl=%s", tables[ idx ] ) );
110             }
111             else
112                 idx++;
113         }
114     }
115 
116     return rc;
117 }
118 
119 
/* Marks an array-file descriptor as 'nothing opened yet':
   all handles and buffers are NULL and the stored rc is -1.
   Only these five fields are touched here. */
void init_array_file( af_data * af )
{
    /* handles */
    af->f = NULL;
    af->af = NULL;

    /* heap buffers */
    af->extents = NULL;
    af->content = NULL;

    /* no open-attempt has been recorded yet */
    af->rc = -1;
}
128 
129 
/* Releases everything an array-file descriptor may hold:
   the array-file, the underlying file, the extents-array and the
   cached content.  Each pointer is cleared after its release,
   so the function can be called repeatedly on the same descriptor. */
void free_array_file( af_data * af )
{
    /* handles: the array-file first, then the file it was made from */
    if ( af->af )
    {
        KArrayFileRelease( af->af );
        af->af = NULL;
    }
    if ( af->f )
    {
        KFileRelease( af->f );
        af->f = NULL;
    }

    /* heap buffers */
    if ( af->extents )
    {
        free( af->extents );
        af->extents = NULL;
    }
    if ( af->content )
    {
        free( af->content );
        af->content = NULL;
    }
}
153 
154 
read_cache_content(af_data * af)155 static rc_t read_cache_content( af_data * af )
156 {
157     rc_t rc = 0;
158     uint64_t filesize = ( af->element_bits >> 3 ) * ( af->extents[ 0 ] );
159     if ( af->dimensionality == 2 )
160         filesize *= af->extents[ 1 ];
161     af->content = malloc( filesize );
162     if ( af->content == NULL )
163         rc = RC ( rcApp, rcArgv, rcAccessing, rcMemory, rcExhausted );
164     else
165     {
166         uint64_t pos2[ 2 ];
167         uint64_t read2[ 2 ];
168         uint64_t count2[ 2 ];
169         rc_t rc;
170 
171         pos2[ 0 ] = 0;
172         pos2[ 1 ] = 0;
173         count2[ 0 ] = af->extents[ 0 ];
174         if ( af->dimensionality == 2 )
175             count2[ 1 ] = af->extents[ 1 ];
176         else
177             count2[ 1 ] = 0;
178 
179         rc = KArrayFileRead ( af->af, af->dimensionality, pos2,
180                               af->content, count2, read2 );
181         if ( rc != 0 )
182             LOGERR( klogErr, rc, "error reading arrayfile-data int cache" );
183     }
184     return rc;
185 }
186 
187 
open_array_file(const KDirectory * dir,const char * name,af_data * af,const uint64_t expected_element_bits,const uint64_t expected_cols,bool disp_wrong_bitsize,bool cache_content,bool supress_err_msg)188 rc_t open_array_file( const KDirectory *dir,
189                       const char *name,
190                       af_data * af,
191                       const uint64_t expected_element_bits,
192                       const uint64_t expected_cols,
193                       bool disp_wrong_bitsize,
194                       bool cache_content,
195                       bool supress_err_msg )
196 {
197     rc_t rc;
198 
199     init_array_file( af );
200     /* open the requested "File" (actually a hdf5-table) as KFile
201        the works because the given KDirectory is a HDF5-Directory */
202     rc = KDirectoryOpenFileRead ( dir, &af->f, "%s", name );
203     if ( rc != 0 )
204     {
205         if ( !supress_err_msg )
206         {
207             PLOGERR( klogErr, ( klogErr, rc, "cannot open hdf5-dataset '$(name)'",
208                             "name=%s", name ) );
209         }
210         return rc;
211     }
212     /* cast the KFile into a KArrayFile */
213     rc = MakeHDF5ArrayFile ( af->f, &af->af );
214     if ( rc != 0 )
215     {
216         PLOGERR( klogErr, ( klogErr, rc, "cannot open hdf5-arrayfile '$(name)'",
217                             "name=%s", name ) );
218         free_array_file( af );
219         return rc;
220     }
221     /* detect the dimensionality of the array-file */
222     rc = KArrayFileDimensionality ( af->af, &af->dimensionality );
223     if ( rc != 0 )
224     {
225         PLOGERR( klogErr, ( klogErr, rc, "cannot retrieve dimensionality on '$(name)'",
226                             "name=%s", name ) );
227         free_array_file( af );
228         return rc;
229     }
230     /* make a array to hold the extent in every dimension */
231     af->extents = malloc( af->dimensionality * ( sizeof ( uint64_t ) ) );
232     if ( af->extents == NULL )
233     {
234         rc = RC ( rcApp, rcArgv, rcAccessing, rcMemory, rcExhausted );
235         PLOGERR( klogErr, ( klogErr, rc, "cannot allocate enough memory for extents of '$(name)'",
236                             "name=%s", name ) );
237         free_array_file( af );
238         return rc;
239     }
240     /* read the actuall extents into the created array */
241     rc = KArrayFileDimExtents ( af->af, af->dimensionality, af->extents );
242     if ( rc != 0 )
243     {
244         PLOGERR( klogErr, ( klogErr, rc, "cannot retrieve extents of '$(name)'",
245                             "name=%s", name ) );
246         free_array_file( af );
247         return rc;
248     }
249     /* request the size of the element in bits */
250     rc = KArrayFileElementSize ( af->af, &af->element_bits );
251     if ( rc != 0 )
252     {
253         PLOGERR( klogErr, ( klogErr, rc, "cannot retrieve element-size of '$(name)'",
254                             "name=%s", name ) );
255         free_array_file( af );
256         return rc;
257     }
258     /* compare the discovered bit-size with the expected one */
259     if ( af->element_bits != expected_element_bits )
260     {
261         rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
262 
263         /* display the wrong bitsize only if wanted
264            ( this function can be called to probe the bitsize:
265              in this case the wrong one should not be shown as an error )*/
266         if ( disp_wrong_bitsize )
267             PLOGERR( klogErr, ( klogErr, rc, "unexpected element-bits of $(bsize) in '$(name)'",
268                      "bsize=%lu,name=%s", af->element_bits, name ) );
269 
270         free_array_file( af );
271         return rc;
272     }
273 
274     /* not generic, we handle only dimensionality of 1 and 2 */
275     if ( expected_cols == 1 )
276     {
277         /* the dimensionality has to be 1 in this case */
278         if ( af->dimensionality != 1 )
279         {
280             rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
281             PLOGERR( klogErr, ( klogErr, rc, "unexpected dimensionality of $(dim) in '$(name)'",
282                                 "dim=%lu,name=%s", af->dimensionality, name ) );
283             free_array_file( af );
284             return rc;
285         }
286     }
287     else
288     {
289         /* the dimensionality has to be 2 in this case */
290         if ( af->dimensionality != 2 )
291         {
292             rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
293             PLOGERR( klogErr, ( klogErr, rc, "unexpected dimensionality of $(dim) in '$(name)'",
294                                 "dim=%lu,name=%s", af->dimensionality, name ) );
295             free_array_file( af );
296             return rc;
297         }
298         else
299         {
300             if ( af->extents[ 1 ] != expected_cols )
301             {
302                 rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
303                 PLOGERR( klogErr, ( klogErr, rc, "unexpected extent[1] of $(ext) in '$(name)'",
304                                     "ext=%lu,name=%s", af->extents[ 1 ], name ) );
305                 free_array_file( af );
306                 return rc;
307             }
308         }
309     }
310     if ( rc == 0 && cache_content )
311     {
312         rc = read_cache_content( af );
313     }
314     return rc;
315 }
316 
317 
318 /* assembles the 'absolute' path to the requested array-file before opening it */
open_element(const KDirectory * hdf5_dir,af_data * element,const char * path,const char * name,const uint64_t expected_element_bits,const uint64_t expected_cols,bool disp_wrong_bitsize,bool cache_content,bool supress_err_msg)319 rc_t open_element( const KDirectory *hdf5_dir,
320                    af_data *element,
321                    const char * path,
322                    const char * name,
323                    const uint64_t expected_element_bits,
324                    const uint64_t expected_cols,
325                    bool disp_wrong_bitsize,
326                    bool cache_content,
327                    bool supress_err_msg )
328 {
329     char src_path[ 64 ];
330     size_t num_writ;
331 
332     element->rc = string_printf ( src_path, sizeof src_path, &num_writ, "%s/%s", path, name );
333     if ( element->rc != 0 )
334         LOGERR( klogErr, element->rc, "cannot assemble hdf5-element-name" );
335     else
336         element->rc = open_array_file( hdf5_dir, src_path, element,
337                                        expected_element_bits, expected_cols,
338                                        disp_wrong_bitsize,
339                                        cache_content,
340                                        supress_err_msg );
341     return element->rc;
342 }
343 
344 
345 /* we are reading data from an array-file,
346    the underlying array-file knows the size of an element */
array_file_read_dim1(af_data * af,const uint64_t pos,void * dst,const uint64_t count,uint64_t * n_read)347 rc_t array_file_read_dim1( af_data * af, const uint64_t pos,
348                            void *dst, const uint64_t count,
349                            uint64_t *n_read )
350 {
351     rc_t rc = 0;
352     if ( af->content == NULL )
353         rc = KArrayFileRead ( af->af, 1, &pos, dst, &count, n_read );
354     else
355     {
356         if ( ( pos + count ) > af->extents[ 0 ] )
357             rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
358         else
359         {
360             uint64_t buf_idx = ( af->element_bits >> 3 ) * pos;
361             size_t num = ( af->element_bits >> 3 ) * count;
362             char * src = af->content;
363             src+=buf_idx;
364             memmove( dst, src, num );
365             *n_read = count;
366         }
367     }
368     if ( rc != 0 )
369         LOGERR( klogErr, rc, "error reading arrayfile-data (1 dim)" );
370     return rc;
371 }
372 
373 
374 /* we are reading values in 2 dimensions from the array-file */
array_file_read_dim2(af_data * af,const uint64_t pos,void * dst,const uint64_t count,const uint64_t ext2,uint64_t * n_read)375 rc_t array_file_read_dim2( af_data * af, const uint64_t pos,
376                            void *dst, const uint64_t count,
377                            const uint64_t ext2, uint64_t *n_read )
378 {
379     rc_t rc = 0;
380     if ( af->content == NULL )
381     {
382         uint64_t pos2[ 2 ];
383         uint64_t read2[ 2 ];
384         uint64_t count2[ 2 ];
385         rc_t rc;
386 
387         pos2[ 0 ] = pos;
388         pos2[ 1 ] = 0;
389         count2[ 0 ] = count;
390         count2[ 1 ] = ext2;
391         rc = KArrayFileRead ( af->af, 2, pos2, dst, count2, read2 );
392         if ( rc != 0 )
393             LOGERR( klogErr, rc, "error reading arrayfile-data (2 dim)" );
394         *n_read = read2[ 0 ];
395     }
396     else
397     {
398         if ( ( pos + count ) > af->extents[ 0 ] )
399             rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
400         else
401         {
402             uint64_t buf_idx = ( af->element_bits >> 3 ) * pos * af->extents[ 1 ];
403             size_t num = ( af->element_bits >> 3 ) * count * af->extents[ 1 ];
404             char * src = af->content;
405             src+=buf_idx;
406             memmove( dst, src, num );
407             *n_read = count * af->extents[ 1 ];
408         }
409     }
410     return rc;
411 }
412 
413 
add_columns(VCursor * cursor,uint32_t count,int32_t exclude_this,uint32_t * idx_vector,const char ** names)414 rc_t add_columns( VCursor * cursor, uint32_t count, int32_t exclude_this,
415                   uint32_t * idx_vector, const char ** names )
416 {
417     rc_t rc = 0;
418     uint32_t i;
419     for ( i = 0; i < count && rc == 0; ++i )
420     {
421         if ( i != exclude_this )
422         {
423             rc = VCursorAddColumn( cursor, &(idx_vector[i]), "%s", names[i] );
424             if ( rc != 0 )
425                 PLOGERR( klogErr, ( klogErr, rc, "cannot add column '$(name)' to vdb-cursor",
426                                     "name=%s", names[i] ) );
427         }
428     }
429     return rc;
430 }
431 
/* Checks that the first extent of an array-file equals 'expected'.

   tab ...... array-file descriptor; extents == NULL is reported as
              'not found' ( warning )
   name ..... used in the log-messages only
   expected . required value of tab->extents[ 0 ]

   Returns true only if the extents are present and the first one matches;
   a mismatch is logged as error. */
bool check_table_count( af_data *tab, const char * name,
                        const uint64_t expected )
{
    if ( tab->extents == NULL )
    {
        PLOGMSG( klogWarn, ( klogWarn, "Table ... '$(name)' not found", "name=%s", name ) );
        return false;
    }

    if ( tab->extents[ 0 ] != expected )
    {
        rc_t rc = RC( rcExe, rcNoTarg, rcAllocating, rcParam, rcInvalid );
        PLOGERR( klogErr, ( klogErr, rc, "'$(name)'.count != expected",
                            "name=%s", name ) );
        return false;
    }

    return true;
}
452 
453 
transfer_bits(VCursor * cursor,const uint32_t col_idx,af_data * src,char * buffer,const uint64_t offset,const uint64_t count,const uint32_t n_bits,const char * explanation)454 rc_t transfer_bits( VCursor *cursor, const uint32_t col_idx,
455     af_data *src, char * buffer, const uint64_t offset, const uint64_t count,
456     const uint32_t n_bits, const char * explanation )
457 {
458     uint64_t n_read;
459     rc_t rc = array_file_read_dim1( src, offset, buffer, count, &n_read );
460     if ( rc == 0 )
461     {
462         if ( count != n_read )
463         {
464             rc = RC( rcExe, rcNoTarg, rcAllocating, rcParam, rcInvalid );
465             PLOGERR( klogErr, ( klogErr, rc, "cannot read enought data from hdf5-table for '$(name)'",
466                                 "name=%s", explanation ) );
467         }
468         if ( rc == 0 )
469         {
470             rc = VCursorWrite( cursor, col_idx, n_bits, buffer, 0, count );
471             if ( rc != 0 )
472                 PLOGERR( klogErr, ( klogErr, rc, "cannot write data to vdb for '$(name)'",
473                                     "name=%s", explanation ) );
474         }
475     }
476     return rc;
477 }
478 
479 
vdb_write_value(VCursor * cursor,const uint32_t col_idx,void * src,const uint32_t n_bits,const uint32_t n_elem,const char * explanation)480 rc_t vdb_write_value( VCursor *cursor, const uint32_t col_idx,
481                       void * src, const uint32_t n_bits,
482                       const uint32_t n_elem, const char *explanation )
483 {
484     rc_t rc = VCursorWrite( cursor, col_idx, n_bits, src, 0, n_elem );
485     if ( rc != 0 )
486         PLOGERR( klogErr, ( klogErr, rc, "cannot write data to vdb for '$(name)'",
487                             "name=%s", explanation ) );
488     return rc;
489 }
490 
491 
vdb_write_uint32(VCursor * cursor,const uint32_t col_idx,uint32_t value,const char * explanation)492 rc_t vdb_write_uint32( VCursor *cursor, const uint32_t col_idx,
493                        uint32_t value, const char *explanation )
494 {
495     return vdb_write_value( cursor, col_idx, &value, 32, 1, explanation );
496 }
497 
498 
vdb_write_uint16(VCursor * cursor,const uint32_t col_idx,uint16_t value,const char * explanation)499 rc_t vdb_write_uint16( VCursor *cursor, const uint32_t col_idx,
500                        uint16_t value, const char *explanation )
501 {
502     return vdb_write_value( cursor, col_idx, &value, 16, 1, explanation );
503 }
504 
505 
vdb_write_uint8(VCursor * cursor,const uint32_t col_idx,uint8_t value,const char * explanation)506 rc_t vdb_write_uint8( VCursor *cursor, const uint32_t col_idx,
507                       uint8_t value, const char *explanation )
508 {
509     return vdb_write_value( cursor, col_idx, &value, 8, 1, explanation );
510 }
511 
512 
vdb_write_float32(VCursor * cursor,const uint32_t col_idx,float value,const char * explanation)513 rc_t vdb_write_float32( VCursor *cursor, const uint32_t col_idx,
514                         float value, const char *explanation )
515 {
516     return vdb_write_value( cursor, col_idx, &value, 32, 1, explanation );
517 }
518 
519 
prepare_table(VDatabase * database,VCursor ** cursor,const char * template_name,const char * table_name)520 rc_t prepare_table( VDatabase * database, VCursor ** cursor,
521                     const char * template_name,
522                     const char * table_name )
523 {
524     VTable * table;
525     rc_t rc = VDatabaseCreateTable( database, &table, template_name,
526                                     kcmInit | kcmMD5 | kcmParents, "%s", table_name );
527     if ( rc != 0 )
528     {
529         PLOGERR( klogErr, ( klogErr, rc, "cannot create vdb-table '$(name)'",
530                             "name=%s", table_name ) );
531     }
532     else
533     {
534         rc = VTableCreateCursorWrite( table, cursor, kcmInsert );
535         if ( rc != 0 )
536         {
537             PLOGERR( klogErr, ( klogErr, rc, "cannot create vdb-cursor for '$(name)'",
538                                 "name=%s", table_name ) );
539         }
540         VTableRelease( table );
541     }
542     return rc;
543 }
544 
545 
load_table(VDatabase * database,KDirectory * hdf5_src,ld_context * lctx,const char * template_name,const char * table_name,loader_func func)546 rc_t load_table( VDatabase * database, KDirectory * hdf5_src, ld_context *lctx,
547                  const char * template_name, const char * table_name, loader_func func )
548 {
549     VTable * table;
550     rc_t rc = VDatabaseCreateTable( database, &table, template_name,
551                                     kcmInit | kcmMD5 | kcmParents, "%s", table_name );
552     if ( rc != 0 )
553         PLOGERR( klogErr, ( klogErr, rc, "cannot create vdb-table '$(name)'",
554                             "name=%s", table_name ) );
555     else
556     {
557         VCursor * cursor;
558         rc = VTableCreateCursorWrite( table, &cursor, kcmInsert );
559         if ( rc != 0 )
560             PLOGERR( klogErr, ( klogErr, rc, "cannot create vdb-cursor for '$(name)'",
561                                 "name=%s", table_name ) );
562         else
563         {
564             VTableRelease( table );
565             rc = func( lctx, hdf5_src, cursor, table_name ); /* the callback does the job! */
566             VCursorRelease( cursor );
567         }
568     }
569     return rc;
570 }
571 
572 
progress_chunk(const KLoadProgressbar ** xml_progress,const uint64_t chunk)573 rc_t progress_chunk( const KLoadProgressbar ** xml_progress, const uint64_t chunk )
574 {
575     rc_t rc;
576     /* release the old progressbar... */
577     if ( *xml_progress != NULL )
578     {
579         KLoadProgressbar_Release( *xml_progress, false );
580         *xml_progress = NULL;
581     }
582     rc = KLoadProgressbar_Make( xml_progress, 0 );
583     if ( rc == 0 )
584         rc = KLoadProgressbar_Append( *xml_progress, chunk );
585     else
586         LOGERR( klogErr, rc, "cannot make KLoadProgressbar" );
587 
588     return rc;
589 }
590 
591 
progress_step(const KLoadProgressbar * xml_progress)592 rc_t progress_step( const KLoadProgressbar * xml_progress )
593 {
594     if ( xml_progress != NULL )
595        return KLoadProgressbar_Process( xml_progress, 1, false );
596     else
597         return 0;
598 }
599 
600 
print_log_info(const char * info)601 void print_log_info( const char * info )
602 {
603     KLogLevel tmp_lvl = KLogLevelGet();
604     KLogLevelSet( klogInfo );
605     LOGMSG( klogInfo, info );
606     KLogLevelSet( tmp_lvl );
607 }
608 
609 
/* was once intended to make the SEQ-table an alias of the CONSENSUS-table;
   this step was removed, but the decision could be reinstated,
   which is why the function to do so is still here */
pacbio_make_alias(VDatabase * vdb_db,const char * existing_obj,const char * alias_to_create)613 rc_t pacbio_make_alias( VDatabase * vdb_db,
614                         const char *existing_obj, const char *alias_to_create )
615 {
616     KDatabase *kdb;
617     rc_t rc = VDatabaseOpenKDatabaseUpdate ( vdb_db, & kdb );
618     if ( rc == 0 )
619     {
620         rc = KDatabaseAliasTable ( kdb, existing_obj, alias_to_create );
621         KDatabaseRelease ( kdb );
622     }
623     return rc;
624 }
625