1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #include "vdb-dump-helper.h"
28
29 #include "vdb-dump-coldefs.h"
30
31 #include <klib/vector.h>
32 #include <klib/text.h>
33 #include <klib/printf.h>
34 #include <klib/log.h>
35 #include <klib/rc.h>
36
37 #include <vdb/vdb-priv.h>
38
39 #include <sra/sradb.h>
40 #include <sra/pacbio.h>
41 #include <os-native.h>
42 #include <sysalloc.h>
43
44 /* for platforms */
45 #include <insdc/sra.h>
46
47 #include <stdlib.h>
48 #include <stdio.h>
49 #include <string.h>
50 #include <assert.h>
51 #include <math.h>
52
53 rc_t Quitting();
54
55 /* once we get used to having moved the read descriptor
56 out of SRA, we should begin using those names.
57 if anyone has an investment in the old names, we may
58 want to provide a switch for using them... */
/* Textual spelling of read-descriptor constants.
   The "|| 1" keeps the legacy "SRA_"-prefixed names selected no matter
   how USE_OLD_SRA_NAME is set; the un-prefixed variant stays available
   for the day the switch is actually honoured. */
#if USE_OLD_SRA_NAME || 1
#define SRA_NAME( name ) "SRA_" #name
#define SRA_NAMES( name1, name2 ) "SRA_" #name1 "|SRA_" #name2
#else
#define SRA_NAME( name ) #name
#define SRA_NAMES( name1, name2 ) #name1 "|" #name2
#endif
70
71 /* implementation of the value-translation-functions */
72
73 const char SRA_PB_HS_0[] = { "SRA_PACBIO_HOLE_SEQUENCING" };
74 const char SRA_PB_HS_1[] = { "SRA_PACBIO_HOLE_ANTIHOLE" };
75 const char SRA_PB_HS_2[] = { "SRA_PACBIO_HOLE_FIDUCIAL" };
76 const char SRA_PB_HS_3[] = { "SRA_PACBIO_HOLE_SUSPECT" };
77 const char SRA_PB_HS_4[] = { "SRA_PACBIO_HOLE_ANTIMIRROR" };
78 const char SRA_PB_HS_5[] = { "SRA_PACBIO_HOLE_FDZMW" };
79 const char SRA_PB_HS_6[] = { "SRA_PACBIO_HOLE_FBZMW" };
80 const char SRA_PB_HS_7[] = { "SRA_PACBIO_HOLE_ANTIBEAMLET" };
81 const char SRA_PB_HS_8[] = { "SRA_PACBIO_HOLE_OUTSIDEFOV" };
82 const char SRA_PB_HS_9[] = { "unknown hole-status" };
83
vdcd_get_hole_status_txt(const uint32_t id)84 const char *vdcd_get_hole_status_txt( const uint32_t id )
85 {
86 switch( id )
87 {
88 case SRA_PACBIO_HOLE_SEQUENCING : return( SRA_PB_HS_0 ); break;
89 case SRA_PACBIO_HOLE_ANTIHOLE : return( SRA_PB_HS_1 ); break;
90 case SRA_PACBIO_HOLE_FIDUCIAL : return( SRA_PB_HS_2 ); break;
91 case SRA_PACBIO_HOLE_SUSPECT : return( SRA_PB_HS_3 ); break;
92 case SRA_PACBIO_HOLE_ANTIMIRROR : return( SRA_PB_HS_4 ); break;
93 case SRA_PACBIO_HOLE_FDZMW : return( SRA_PB_HS_5 ); break;
94 case SRA_PACBIO_HOLE_FBZMW : return( SRA_PB_HS_6 ); break;
95 case SRA_PACBIO_HOLE_ANTIBEAMLET : return( SRA_PB_HS_7 ); break;
96 case SRA_PACBIO_HOLE_OUTSIDEFOV : return( SRA_PB_HS_8 ); break;
97 }
98 return( SRA_PB_HS_9 );
99 }
100
vdcd_get_platform_txt(const uint32_t id)101 const char *vdcd_get_platform_txt( const uint32_t id )
102 {
103 #define CASE( id ) \
104 case id : return # id; break
105
106 switch( id )
107 {
108 CASE ( SRA_PLATFORM_UNDEFINED );
109 CASE ( SRA_PLATFORM_454 );
110 CASE ( SRA_PLATFORM_ILLUMINA );
111 CASE ( SRA_PLATFORM_ABSOLID );
112 CASE ( SRA_PLATFORM_COMPLETE_GENOMICS );
113 CASE ( SRA_PLATFORM_HELICOS );
114 CASE ( SRA_PLATFORM_PACBIO_SMRT );
115 CASE ( SRA_PLATFORM_ION_TORRENT );
116 CASE ( SRA_PLATFORM_CAPILLARY );
117 CASE ( SRA_PLATFORM_OXFORD_NANOPORE );
118 }
119 #undef CASE
120
121 return "unknown platform";
122 }
123
124 const char SRA_RT_0[] = { SRA_NAME ( READ_TYPE_TECHNICAL ) };
125 const char SRA_RT_1[] = { SRA_NAME ( READ_TYPE_BIOLOGICAL ) };
126 const char SRA_RT_2[] = { SRA_NAMES ( READ_TYPE_TECHNICAL, READ_TYPE_FORWARD ) };
127 const char SRA_RT_3[] = { SRA_NAMES ( READ_TYPE_BIOLOGICAL, READ_TYPE_FORWARD ) };
128 const char SRA_RT_4[] = { SRA_NAMES ( READ_TYPE_TECHNICAL, READ_TYPE_REVERSE ) };
129 const char SRA_RT_5[] = { SRA_NAMES ( READ_TYPE_BIOLOGICAL, READ_TYPE_REVERSE ) };
130 const char SRA_RT_6[] = { "unknown read-type" };
131
vdcd_get_read_type_txt(const uint32_t id)132 const char *vdcd_get_read_type_txt( const uint32_t id )
133 {
134 switch( id )
135 {
136 case 0 : return( SRA_RT_0 ); break;
137 case 1 : return( SRA_RT_1 ); break;
138 case 2 : return( SRA_RT_2 ); break;
139 case 3 : return( SRA_RT_3 ); break;
140 case 4 : return( SRA_RT_4 ); break;
141 case 5 : return( SRA_RT_5 ); break;
142 }
143 return( SRA_RT_6 );
144 }
145
146 const char SRA_FT_0[] = { SRA_NAME ( READ_FILTER_PASS ) };
147 const char SRA_FT_1[] = { SRA_NAME ( READ_FILTER_REJECT ) };
148 const char SRA_FT_2[] = { SRA_NAME ( READ_FILTER_CRITERIA ) };
149 const char SRA_FT_3[] = { SRA_NAME ( READ_FILTER_REDACTED ) };
150 const char SRA_FT_4[] = { "unknown read-filter" };
151
vdcd_get_read_filter_txt(const uint32_t id)152 const char *vdcd_get_read_filter_txt( const uint32_t id )
153 {
154 switch( id )
155 {
156 case 0 : return( SRA_FT_0 ); break;
157 case 1 : return( SRA_FT_1 ); break;
158 case 2 : return( SRA_FT_2 ); break;
159 case 3 : return( SRA_FT_3 ); break;
160 }
161 return( SRA_FT_4 );
162 }
163
/* Schema type names whose values get translated into plain text
   ( hardcoded values taken from asm-trace/interface/sra/sradb.h ) */
#define SRA_KEY_PLATFORM_ID     "INSDC:SRA:platform_id"
#define SRA_KEY_XREAD_TYPE      "INSDC:SRA:xread_type"
#define SRA_KEY_READ_TYPE       "INSDC:SRA:read_type"
#define SRA_KEY_READ_FILTER     "INSDC:SRA:read_filter"
#define SRA_PACBIO_HOLE_STATUS  "PacBio:hole:status"
170
171
/* Check a column's type against a type given by name.
   my_schema .. schema used to resolve the name and to compare the types
   typedecl ... type of the column under test
   to_check ... textual type name, e.g. "INSDC:SRA:platform_id"
   returns .... false if the name cannot be resolved in the schema,
                otherwise the result of VTypedeclToTypedecl() */
static bool vdcd_type_cmp( const VSchema *my_schema, VTypedecl * typedecl, const char * to_check )
{
    VTypedecl resolved;

    if ( 0 != VSchemaResolveTypedecl ( my_schema, &resolved, "%s", to_check ) )
    {
        return false;
    }
    return VTypedeclToTypedecl ( typedecl, my_schema, &resolved, NULL, NULL );
}
182
183
vdcd_get_value_trans_fct(const VSchema * my_schema,VTypedecl * typedecl)184 static value_trans_fct_t vdcd_get_value_trans_fct( const VSchema *my_schema, VTypedecl * typedecl )
185 {
186 value_trans_fct_t res = NULL;
187
188 if ( NULL == my_schema || NULL == typedecl )
189 {
190 return res;
191 }
192
193 if ( vdcd_type_cmp( my_schema, typedecl, SRA_KEY_PLATFORM_ID ) )
194 {
195 res = vdcd_get_platform_txt;
196 }
197 else if ( vdcd_type_cmp( my_schema, typedecl, SRA_KEY_XREAD_TYPE ) )
198 {
199 res = vdcd_get_read_type_txt;
200 }
201 else if ( vdcd_type_cmp( my_schema, typedecl, SRA_KEY_READ_TYPE ) )
202 {
203 res = vdcd_get_read_type_txt;
204 }
205 else if ( vdcd_type_cmp( my_schema, typedecl, SRA_KEY_READ_FILTER ) )
206 {
207 res = vdcd_get_read_filter_txt;
208 }
209 else if ( vdcd_type_cmp( my_schema, typedecl, SRA_PACBIO_HOLE_STATUS ) )
210 {
211 res = vdcd_get_hole_status_txt;
212 }
213
214 return res;
215 }
216
217
218 /* implementation of the dimension-translation-functions */
vdcd_get_read_desc_txt(const uint8_t * src)219 static char *vdcd_get_read_desc_txt( const uint8_t * src )
220 {
221 char *res = calloc( 1, 120 );
222 SRAReadDesc desc;
223 memmove( &desc, src, sizeof( desc ) );
224 string_printf ( res, 119, NULL,
225 "seg.start=%u, seg.len=%u, type=%u, cs_key=%u, label=%s",
226 desc . seg.start, desc . seg.len, desc . type,
227 desc . cs_key, desc . label );
228 return res;
229 }
230
vdcd_get_spot_desc_txt(const uint8_t * src)231 static char *vdcd_get_spot_desc_txt( const uint8_t *src )
232 {
233 char *res = calloc( 1, 120 );
234 SRASpotDesc desc;
235 memmove( &desc, src, sizeof( desc ) );
236 string_printf ( res, 119, NULL,
237 "spot_len=%u, fixed_len=%u, signal_len=%u, clip_qual_right=%u, num_reads=%u",
238 desc . spot_len, desc . fixed_len, desc . signal_len,
239 desc . clip_qual_right, desc . num_reads );
240 return res;
241 }
242
243 /* hardcoded values taken from asm-trace/interface/sra/sradb.h */
244 #define SRA_KEY_READ_DESC "NCBI:SRA:ReadDesc"
245 #define SRA_KEY_SPOT_DESC "NCBI:SRA:SpotDesc"
246
vdcd_get_dim_trans_fct(const VSchema * my_schema,VTypedecl * typedecl)247 static dim_trans_fct_t vdcd_get_dim_trans_fct( const VSchema *my_schema, VTypedecl * typedecl )
248 {
249 dim_trans_fct_t res = NULL;
250
251 if ( NULL == my_schema || NULL == typedecl )
252 {
253 return res;
254 }
255
256 if ( vdcd_type_cmp( my_schema, typedecl, SRA_KEY_READ_DESC ) )
257 {
258 res = vdcd_get_read_desc_txt;
259 }
260 else if ( vdcd_type_cmp( my_schema, typedecl, SRA_KEY_SPOT_DESC ) )
261 {
262 res = vdcd_get_spot_desc_txt;
263 }
264 return res;
265 }
266
267
268 const char * const_s_Ascii = "Ascii";
269 const char * const_s_Unicode = "Unicode";
270 const char * const_s_Uint = "Uint";
271 const char * const_s_Int = "Int";
272 const char * const_s_Float = "Float";
273 const char * const_s_Bool = "Bool";
274 const char * const_s_Unknown = "unknown";
275
vdcd_make_domain_txt(const uint32_t domain)276 char *vdcd_make_domain_txt( const uint32_t domain )
277 {
278 char* res = NULL;
279 switch( domain )
280 {
281 case vtdAscii : res = string_dup_measure( const_s_Ascii, NULL ); break;
282 case vtdUnicode : res = string_dup_measure( const_s_Unicode, NULL ); break;
283 case vtdUint : res = string_dup_measure( const_s_Uint, NULL ); break;
284 case vtdInt : res = string_dup_measure( const_s_Int, NULL ); break;
285 case vtdFloat : res = string_dup_measure( const_s_Float, NULL ); break;
286 case vtdBool : res = string_dup_measure( const_s_Bool, NULL ); break;
287 default : res = string_dup_measure( const_s_Unknown, NULL ); break;
288 }
289 return res;
290 }
291
292 /* a single column-definition */
vdcd_init_col(const char * name,const size_t str_limit)293 p_col_def vdcd_init_col( const char* name, const size_t str_limit )
294 {
295 p_col_def res = NULL;
296 if ( NULL == name ) return res;
297 if ( 0 == name[ 0 ] ) return res;
298 res = ( p_col_def )calloc( 1, sizeof( col_def ) );
299 if ( res != NULL )
300 {
301 res -> name = string_dup_measure ( name, NULL );
302 vds_make( &( res -> content ), str_limit, DUMP_STR_INC );
303 }
304 return res;
305 }
306
/* Release one column-definition: its name, its content buffer and the
   structure itself. A NULL pointer is ignored. */
void vdcd_destroy_col( p_col_def col_def )
{
    if ( NULL == col_def )
    {
        return;
    }
    /* free() accepts NULL, so an unset name needs no special case */
    free( col_def -> name );
    vds_free( &( col_def -> content ) );
    free( col_def );
}
319
320 /* a vector of column-definitions */
/* Create an empty vector of column-definitions.
   defs ....... receives the new object; set to NULL if allocation fails
   str_limit .. stored in the object; later used when columns are added
   returns .... true on success, false if defs is NULL or memory could
                not be obtained. Release the object with vdcd_destroy(). */
bool vdcd_init( col_defs** defs, const size_t str_limit )
{
    col_defs *fresh;

    if ( NULL == defs )
    {
        return false;
    }

    fresh = calloc( 1, sizeof *fresh );
    *defs = fresh;
    if ( NULL == fresh )
    {
        /* the original stored str_limit through this NULL pointer */
        return false;
    }

    VectorInit( &( fresh -> cols ), 0, 5 );
    fresh -> max_colname_chars = 0;
    fresh -> str_limit = str_limit;
    return true;
}
337
vdcd_destroy_node(void * node,void * data)338 static void CC vdcd_destroy_node( void* node, void* data )
339 {
340 vdcd_destroy_col( ( p_col_def ) node );
341 }
342
/* Release a vector of column-definitions together with every column it
   holds. A NULL pointer is ignored. */
void vdcd_destroy( col_defs* defs )
{
    if ( NULL == defs )
    {
        return;
    }
    VectorWhack( &( defs -> cols ), vdcd_destroy_node, NULL );
    free( defs );
}
351
vdcd_append_col(col_defs * defs,const char * name)352 static p_col_def vdcd_append_col( col_defs* defs, const char* name )
353 {
354 p_col_def col = vdcd_init_col( name, defs -> str_limit );
355 if ( NULL != col )
356 {
357 if ( 0 == VectorAppend( &( defs -> cols ), NULL, col ) )
358 {
359 size_t len = string_size( name );
360 if ( len > defs -> max_colname_chars )
361 {
362 defs -> max_colname_chars = ( uint16_t )len;
363 }
364 }
365 }
366 return col;
367 }
368
/* Split a comma-separated list of column names and append one
   column-definition per non-empty name.
   defs ..... vector that receives the columns
   src ...... the list, NUL-terminated
   limit .... at most this many characters of src are examined
   returns .. number of entries in the vector afterwards, or 0 if defs or
              src is NULL
   Names longer than MAX_COL_NAME_LEN are cut off at that length. */
static uint32_t split_column_string( col_defs* defs, const char* src, size_t limit )
{
    char token[ MAX_COL_NAME_LEN + 1 ];
    size_t used = 0;
    size_t pos;

    if ( NULL == defs || NULL == src )
    {
        return 0;
    }

    for ( pos = 0; pos < limit && 0 != src[ pos ]; ++pos )
    {
        const char c = src[ pos ];

        if ( ',' != c )
        {
            /* collect the character; excess characters are dropped */
            if ( used < MAX_COL_NAME_LEN )
            {
                token[ used++ ] = c;
            }
            continue;
        }

        /* separator: emit what has been collected, if anything */
        if ( used > 0 )
        {
            token[ used ] = 0;
            vdcd_append_col( defs, token );
            used = 0;
        }
    }

    /* the last name has no trailing separator */
    if ( used > 0 )
    {
        token[ used ] = 0;
        vdcd_append_col( defs, token );
    }
    return VectorLength( &defs -> cols );
}
405
/* Parse a comma-separated list of column names into 'defs' and probe
   each of them against the table.
   defs ............. vector that receives the columns
   src .............. comma-separated column names ( the first 4096
                      characters are examined )
   tbl .............. table to probe; with NULL no probing takes place
   invalid_columns .. optional; incremented once per column that cannot
                      be added to a cursor or has no datatype. It is NOT
                      reset here. May be NULL.
   returns .......... number of columns that passed the probe */
uint32_t vdcd_parse_string( col_defs* defs, const char* src, const VTable *tbl,
                            uint32_t * invalid_columns )
{
    uint32_t valid_columns = 0;
    uint32_t count = split_column_string( defs, src, 4096 );

    if ( count > 0 && NULL != tbl )
    {
        const VCursor *probing_cursor;
        rc_t rc = VTableCreateCursorRead( tbl, &probing_cursor );
        DISP_RC( rc, "VTableCreateCursorRead() failed" );
        if ( 0 == rc )
        {
            uint32_t idx;
            for ( idx = 0; idx < count; ++idx )
            {
                col_def *col = ( col_def * )VectorGet( &( defs -> cols ), idx );
                if ( NULL == col )
                {
                    continue;
                }

                rc = VCursorAddColumn( probing_cursor, &( col -> idx ), "%s", col -> name );
                DISP_RC( rc, "VCursorAddColumn() failed in vdcd_parse_string()" );
                if ( 0 == rc )
                {
                    rc = VCursorDatatype( probing_cursor, col -> idx, &( col -> type_decl ), &( col -> type_desc ) );
                    DISP_RC( rc, "VCursorDatatype() failed" );
                }

                /* a column is usable only if both steps succeeded */
                col -> valid = ( 0 == rc );
                if ( col -> valid )
                {
                    valid_columns++;
                }
                else if ( NULL != invalid_columns )
                {
                    /* the counter is optional, as in vdcd_extract_from_table() */
                    ( *invalid_columns )++;
                }
            }
            rc = VCursorRelease( probing_cursor );
            DISP_RC( rc, "VCursorRelease() failed" );
        }
    }
    return valid_columns;
}
455
456
vdcd_table_has_column(const VTable * tbl,const char * to_find)457 bool vdcd_table_has_column( const VTable *tbl, const char * to_find )
458 {
459 bool res = false;
460 if ( NULL != tbl && NULL != to_find )
461 {
462 size_t to_find_len = string_size( to_find );
463 if ( to_find_len > 0 )
464 {
465 KNamelist * names;
466 rc_t rc = VTableListCol( tbl, &names );
467 DISP_RC( rc, "VTableListCol() failed" );
468 if ( 0 == rc )
469 {
470 uint32_t n;
471 rc = KNamelistCount( names, &n );
472 DISP_RC( rc, "KNamelistCount() failed" );
473 if ( 0 == rc )
474 {
475 uint32_t i;
476 for ( i = 0; ( i < n ) && ( 0 == rc ) && !res; ++i )
477 {
478 const char * col_name;
479 rc = KNamelistGet( names, i, &col_name );
480 DISP_RC( rc, "KNamelistGet() failed" );
481 if ( 0 == rc )
482 {
483 size_t col_name_len = string_size( col_name );
484 if ( col_name_len == to_find_len )
485 {
486 res = ( 0 == string_cmp( to_find, to_find_len, col_name, col_name_len, ( uint32_t )col_name_len ) );
487 }
488 }
489 }
490 }
491 KNamelistRelease( names );
492 }
493 }
494 }
495 return res;
496 }
497
/* Fill 'defs' with the columns listed by the table and probe each one.
   defs ............. vector that receives the columns
   tbl .............. table whose columns are enumerated
   invalid_columns .. optional; reset to 0, then incremented once per
                      column that cannot be used. May be NULL.
   returns .......... number of columns that passed the probe
   The enumeration stops at the first failing library call, which
   includes the first column that fails the probe.
   NOTE(review): that early stop is kept from the original - confirm it
   is intended. */
uint32_t vdcd_extract_from_table( col_defs* defs, const VTable *tbl, uint32_t *invalid_columns )
{
    uint32_t found = 0;
    KNamelist *names;
    rc_t rc;

    if ( NULL != invalid_columns )
    {
        *invalid_columns = 0;
    }
    if ( NULL == defs )
    {
        return 0;
    }

    rc = VTableListCol( tbl, &names );
    DISP_RC( rc, "VTableListCol() failed" );
    if ( 0 == rc )
    {
        const VCursor *curs;
        rc = VTableCreateCursorRead( tbl, &curs );
        DISP_RC( rc, "VTableCreateCursorRead() failed" );
        if ( 0 == rc )
        {
            uint32_t n;
            rc = KNamelistCount( names, &n );
            DISP_RC( rc, "KNamelistCount() failed" );
            if ( 0 == rc )
            {
                uint32_t i;
                for ( i = 0; i < n && 0 == rc; ++i )
                {
                    const char *col_name;
                    p_col_def def;

                    rc = KNamelistGet( names, i, &col_name );
                    DISP_RC( rc, "KNamelistGet() failed" );
                    if ( 0 != rc )
                    {
                        continue;   /* the loop condition ends the enumeration */
                    }

                    def = vdcd_append_col( defs, col_name );
                    if ( NULL == def )
                    {
                        /* the column could not be recorded; the original
                           dereferenced the NULL pointer here */
                        if ( NULL != invalid_columns )
                        {
                            ( *invalid_columns )++;
                        }
                        continue;
                    }

                    rc = VCursorAddColumn( curs, &( def -> idx ), "%s", def -> name );
                    DISP_RC( rc, "VCursorAddColumn() failed in vdcd_extract_from_table()" );
                    if ( 0 == rc )
                    {
                        rc = VCursorDatatype( curs, def -> idx, &( def -> type_decl ), &( def -> type_desc ) );
                        DISP_RC( rc, "VCursorDatatype() failed" );
                    }

                    /* a column is usable only if both steps succeeded */
                    def -> valid = ( 0 == rc );
                    if ( def -> valid )
                    {
                        found++;
                    }
                    else if ( NULL != invalid_columns )
                    {
                        ( *invalid_columns )++;
                    }
                }
            }
            rc = VCursorRelease( curs );
            DISP_RC( rc, "VCursorRelease() failed" );
        }
        rc = KNamelistRelease( names );
        DISP_RC( rc, "KNamelistRelease() failed" );
    }
    return found;
}
567
568
/* Fill 'defs' with the physical columns listed by the table.
   No cursor probing takes place, so the 'valid' flag of the new columns
   is not set by this function.
   defs ..... vector that receives the columns
   tbl ...... table whose physical columns are enumerated
   returns .. true if at least one column was appended */
bool vdcd_extract_from_phys_table( col_defs* defs, const VTable *tbl )
{
    bool col_defs_found = false;
    KNamelist *names;
    rc_t rc = VTableListPhysColumns( tbl, &names );
    DISP_RC( rc, "VTableListPhysColumns() failed" );
    if ( 0 == rc )
    {
        uint32_t n;
        rc = KNamelistCount( names, &n );
        DISP_RC( rc, "KNamelistCount() failed" );
        if ( 0 == rc )
        {
            uint32_t i;
            uint32_t found = 0;
            for ( i = 0; i < n && 0 == rc; ++i )
            {
                const char *col_name;
                rc = KNamelistGet( names, i, &col_name );
                DISP_RC( rc, "KNamelistGet() failed" );
                if ( 0 == rc )
                {
                    /* count only what really ended up in the vector; the
                       original counted failed appends as well */
                    if ( NULL != vdcd_append_col( defs, col_name ) )
                    {
                        found++;
                    }
                }
            }
            col_defs_found = ( found > 0 );
        }
        rc = KNamelistRelease( names );
        DISP_RC( rc, "KNamelistRelease() failed" );
    }
    return col_defs_found;
}
601
602 typedef struct add_2_cur_context
603 {
604 const VCursor *curs;
605 uint32_t count;
606 } add_2_cur_context;
607 typedef add_2_cur_context* p_add_2_cur_context;
608
609
vdcd_add_1_to_cursor(void * item,void * data)610 static void CC vdcd_add_1_to_cursor( void *item, void *data )
611 {
612 rc_t rc;
613 p_col_def col_def = ( p_col_def )item;
614 p_add_2_cur_context ctx = ( p_add_2_cur_context )data;
615
616 if ( NULL == col_def || NULL == ctx )
617 {
618 return;
619 }
620 if ( NULL == ctx -> curs || ! col_def -> valid )
621 {
622 return;
623 }
624
625 rc = VCursorAddColumn( ctx -> curs, &( col_def -> idx ), "%s", col_def -> name );
626 DISP_RC( rc, "VCursorAddColumn() failed in vdcd_add_1_to_cursor" );
627
628 /***************************************************************************
629 !!! extract type information !!!
630 **************************************************************************/
631 if ( 0 == rc )
632 {
633 rc = VCursorDatatype( ctx -> curs, col_def -> idx,
634 &( col_def -> type_decl ), &( col_def->type_desc ) );
635 DISP_RC( rc, "VCursorDatatype() failed" );
636 if ( 0 == rc )
637 {
638 ctx -> count++;
639 col_def -> valid = true;
640 }
641 }
642 else
643 {
644 col_def -> valid = false;
645 }
646 }
647
/* Add every valid column of 'defs' to the cursor.
   defs ..... the column-definitions
   curs ..... cursor that receives the columns
   returns .. number of columns that were added and typed successfully;
              0 if defs is NULL */
uint32_t vdcd_add_to_cursor( col_defs* defs, const VCursor *curs )
{
    add_2_cur_context ctx;

    ctx . count = 0;
    ctx . curs = curs;
    if ( NULL != defs )
    {
        VectorForEach( &( defs -> cols ), false, vdcd_add_1_to_cursor, &ctx );
    }
    return ctx . count;
}
656
vdcd_reset_1_content(void * item,void * data)657 static void CC vdcd_reset_1_content( void *item, void *data )
658 {
659 p_col_def my_col_def = ( p_col_def )item;
660 if ( NULL != my_col_def )
661 {
662 rc_t rc = vds_clear( &( my_col_def -> content ) );
663 DISP_RC( rc, "dump_str_clear() failed" );
664 }
665 }
666
/* Clear the content buffer of every column in 'defs'.
   A NULL pointer is ignored, as in vdcd_ins_trans_fkt(). */
void vdcd_reset_content( col_defs* defs )
{
    if ( NULL != defs )
    {
        VectorForEach( &( defs -> cols ), false, vdcd_reset_1_content, NULL );
    }
}
671
vdcd_ins_1_trans_fkt(void * item,void * data)672 static void CC vdcd_ins_1_trans_fkt( void *item, void *data )
673 {
674 p_col_def col_def = ( p_col_def )item;
675 const VSchema *schema = ( const VSchema * )data;
676
677 if ( NULL != col_def && NULL != schema )
678 {
679 /* resolves special sra-types and retrieves the addr of
680 a function that later can translate the values into plain-text
681 --- is defined in this file! */
682 col_def -> value_trans_fct = vdcd_get_value_trans_fct( schema, &( col_def -> type_decl ) );
683 col_def -> dim_trans_fct = vdcd_get_dim_trans_fct( schema, &( col_def -> type_decl ) );
684 }
685 }
686
/* Look up the translation functions for every column in 'defs'.
   defs .... the column-definitions
   schema .. schema used to resolve the column types
   Nothing happens if either pointer is NULL. */
void vdcd_ins_trans_fkt( col_defs* defs, const VSchema *schema )
{
    if ( NULL == defs || NULL == schema )
    {
        return;
    }
    VectorForEach( &( defs -> cols ), false, vdcd_ins_1_trans_fkt, ( void* )schema );
}
694
vdcd_exclude_column_cb(void * item,void * data)695 static void CC vdcd_exclude_column_cb( void *item, void *data )
696 {
697 const char * s = ( const char * )data;
698 p_col_def col_def = ( p_col_def )item;
699 if ( NULL != s && NULL != col_def )
700 {
701 if ( 0 == strcmp( col_def -> name, s ) )
702 col_def -> excluded = true;
703 }
704 }
705
/* Mark every column named 'column_name' as excluded.
   Nothing happens if defs or column_name is NULL. */
void vdcd_exclude_this_column( col_defs* defs, const char* column_name )
{
    if ( NULL != defs && NULL != column_name )
    {
        VectorForEach( &( defs -> cols ), false, vdcd_exclude_column_cb, ( void* )column_name );
    }
}
710
/* Mark every column named in a comma-separated list as excluded.
   defs .......... the column-definitions
   column_names .. comma-separated names, NUL-terminated
   Empty names are skipped; names longer than MAX_COL_NAME_LEN are cut
   off at that length. Nothing happens if either pointer is NULL. */
void vdcd_exclude_these_columns( col_defs* defs, const char* column_names )
{
    char token[ MAX_COL_NAME_LEN + 1 ];
    size_t used = 0;
    const char *p;

    if ( NULL == defs || NULL == column_names )
    {
        return;
    }

    for ( p = column_names; 0 != *p; ++p )
    {
        if ( ',' != *p )
        {
            /* collect the character; excess characters are dropped */
            if ( used < MAX_COL_NAME_LEN )
            {
                token[ used++ ] = *p;
            }
            continue;
        }

        /* separator: handle what has been collected, if anything */
        if ( used > 0 )
        {
            token[ used ] = 0;
            vdcd_exclude_this_column( defs, token );
            used = 0;
        }
    }

    /* the last name has no trailing separator */
    if ( used > 0 )
    {
        token[ used ] = 0;
        vdcd_exclude_this_column( defs, token );
    }
}
744
/* Find the first column for which the cursor reports a non-empty
   id-range.
   defs ..... the column-definitions, searched in vector order
   cur ...... cursor the columns were added to
   idx ...... receives the cursor-index of the column that was found
   returns .. true if such a column exists ( *idx is set ), false
              otherwise or if an argument is NULL ( *idx untouched ) */
bool vdcd_get_first_none_static_column_idx( col_defs *defs, const VCursor *cur, uint32_t *idx )
{
    uint32_t slot, end_slot;

    if ( NULL == defs || NULL == cur || NULL == idx )
    {
        return false;
    }

    slot = VectorStart( &( defs -> cols ) );
    end_slot = slot + VectorLength( &( defs -> cols ) );
    for ( ; slot < end_slot; ++slot )
    {
        int64_t first_row;
        uint64_t row_count;
        col_def * cd = VectorGet( &( defs -> cols ), slot );

        if ( NULL == cd )
        {
            continue;
        }
        if ( 0 == VCursorIdRange( cur, cd -> idx, &first_row, &row_count ) && row_count > 0 )
        {
            *idx = cd -> idx;
            return true;
        }
    }
    return false;
}
776
777 /* ******************************************************************************************************** */
/* running statistics over the non-zero values of one column */
typedef struct spread
{
    uint64_t count;         /* number of non-zero values seen */
    double sum, sum_sq;     /* sum of the values / of their squares */
    int64_t min, max;       /* smallest / largest value seen ( valid if count > 0 ) */
} spread;


/*
    Fold the 'l' values of type 't' found at 'b' into the statistics 'S'.
    Zero values are skipped.

    S ... spread * S
    b ... const void * base
    l ... uint32_t row_len
    t ... type ( int64_t, uint64_t ... )

    The first value that is counted always becomes both min and max.
    Comparing against the seed values instead is wrong for narrow signed
    types ( e.g. ( int32_t )INT64_MAX is -1, so a positive value never
    became the minimum ), and a seed of 0 for max hid the real maximum of
    all-negative data. Later comparisons are made in type 't', which is
    lossless because min/max then hold values that originated from 't'.
*/
#define COUNTVALUES( S, b, l, t ) \
{ \
    const t * values = ( b ); \
    uint32_t i; \
    for ( i = 0; i < ( l ); ++i ) \
    { \
        t value = values[ i ]; \
        if ( value != 0 ) \
        { \
            double value_d = ( double ) value; \
            if ( 0 == (S)->count || value < ( t )(S)->min ) (S)->min = value; \
            if ( 0 == (S)->count || value > ( t )(S)->max ) (S)->max = value; \
            (S)->sum += value_d; \
            (S)->sum_sq += ( value_d * value_d ); \
            (S)->count++; \
        } \
    } \
}
810
/* Round a double to the nearest unsigned 64-bit integer.
   A fractional part of exactly 0.5 is rounded down ( 2.5 -> 2 ), values
   with a larger fractional part are rounded up.
   value .... number to round
   returns .. the rounded value; 0 for NaN, zero and negative input,
              UINT64_MAX for input beyond the uint64_t range
   Converting a negative, NaN or out-of-range double to an unsigned
   integer is undefined in C, hence the explicit guards. */
static uint64_t round_to_uint64_t( double value )
{
    uint64_t whole;

    if ( !( value > 0.0 ) )
    {
        return 0;   /* also taken for NaN: every comparison with NaN is false */
    }
    if ( value >= 18446744073709551616.0 )   /* 2^64 */
    {
        return UINT64_MAX;
    }

    /* for positive in-range input, truncation is the same as floor() */
    whole = ( uint64_t )value;
    return ( ( value - ( double )whole ) > 0.5 ) ? whole + 1 : whole;
}
817
vdcd_collect_spread_col(const struct num_gen * row_set,col_def * cd,const VCursor * curs)818 static rc_t vdcd_collect_spread_col( const struct num_gen *row_set, col_def *cd, const VCursor * curs )
819 {
820 const struct num_gen_iter *iter;
821 rc_t rc = num_gen_iterator_make( row_set, &iter );
822 if ( 0 == rc )
823 {
824 const void * base;
825 uint32_t row_len, elem_bits;
826 int64_t row_id;
827 spread s;
828 spread * sp = &s;
829
830 s . max = 0;
831 s . sum = 0;
832 s . sum_sq = 0;
833 s . count = 0;
834 s . min = INT64_MAX;
835
836 while ( ( 0 == rc ) && num_gen_iterator_next( iter, &row_id, &rc ) )
837 {
838 if ( 0 == rc )
839 {
840 rc = Quitting();
841 }
842 if ( 0 != rc )
843 {
844 break;
845 }
846 rc = VCursorCellDataDirect( curs, row_id, cd -> idx, &elem_bits, &base, NULL, &row_len );
847 if ( 0 == rc )
848 {
849 if ( cd -> type_desc.domain == vtdUint )
850 {
851 /* unsigned int's */
852 switch( elem_bits )
853 {
854 case 64 : COUNTVALUES( sp, base, row_len, uint64_t ) break;
855 case 32 : COUNTVALUES( sp, base, row_len, uint32_t ) break;
856 case 16 : COUNTVALUES( sp, base, row_len, uint16_t ) break;
857 case 8 : COUNTVALUES( sp, base, row_len, uint8_t ) break;
858 }
859 }
860 else
861 {
862 /* signed int's */
863 switch( elem_bits )
864 {
865 case 64 : COUNTVALUES( sp, base, row_len, int64_t ) break;
866 case 32 : COUNTVALUES( sp, base, row_len, int32_t ) break;
867 case 16 : COUNTVALUES( sp, base, row_len, int16_t ) break;
868 case 8 : COUNTVALUES( sp, base, row_len, int8_t ) break;
869 }
870 }
871 }
872 }
873
874 if ( s . count > 0 )
875 {
876 rc = KOutMsg( "\n[%s]\n", cd -> name );
877 if ( 0 == rc )
878 {
879 rc = KOutMsg( "min = %,ld\n", s . min );
880 }
881 if ( 0 == rc )
882 {
883 rc = KOutMsg( "max = %,ld\n", s . max );
884 }
885 if ( 0 == rc )
886 {
887 rc = KOutMsg( "count = %,ld\n", s . count );
888 }
889 if ( 0 == rc )
890 {
891 double median = ( s . sum / s . count );
892 rc = KOutMsg( "median = %,ld\n", round_to_uint64_t( median ) );
893 if ( 0 == rc )
894 {
895 double stdev = sqrt( ( ( s . sum_sq - ( s . sum * s . sum ) / s . count ) ) / ( s . count - 1 ) );
896 rc = KOutMsg( "stdev = %,ld\n", round_to_uint64_t( stdev ) );
897 }
898 }
899 }
900 num_gen_iterator_destroy( iter );
901 }
902 return rc;
903 }
904 #undef COUNTVALUES
905
vdcd_collect_spread(const struct num_gen * row_set,col_defs * cols,const VCursor * curs)906 rc_t vdcd_collect_spread( const struct num_gen * row_set, col_defs * cols, const VCursor * curs )
907 {
908 rc_t rc = 0;
909 uint32_t i, n = VectorLength( &( cols -> cols ) );
910 for ( i = 0; i < n && 0 == rc; ++i )
911 {
912 col_def * cd = VectorGet( &( cols -> cols ), i );
913 if ( NULL != cd )
914 {
915 if ( vtdUint == cd -> type_desc . domain || vtdInt == cd -> type_desc . domain )
916 {
917 rc = vdcd_collect_spread_col( row_set, cd, curs );
918 }
919 }
920 }
921 return rc;
922 }
923
same_values(const VCursor * curs,uint32_t col_idx,int64_t first,uint32_t test_rows)924 static uint32_t same_values( const VCursor * curs, uint32_t col_idx, int64_t first, uint32_t test_rows )
925 {
926 uint32_t res = 0;
927 const void * base;
928 uint32_t elem_bits, boff, row_len;
929 rc_t rc = VCursorCellDataDirect( curs, first, col_idx, &elem_bits, &base, &boff, &row_len );
930 while ( 0 == rc && res < test_rows && 0 == rc )
931 {
932 const void * base_1;
933 uint32_t elem_bits_1, boff_1, row_len_1;
934 rc = VCursorCellDataDirect( curs, first + res + 1, col_idx, &elem_bits_1, &base_1, &boff_1, &row_len_1 );
935 if ( 0 == rc )
936 {
937 if ( elem_bits != elem_bits_1 ||
938 boff != boff_1 ||
939 row_len != row_len_1 ||
940 base != base_1 )
941 {
942 return res;
943 }
944 }
945 res += 1;
946 }
947 return res;
948 }
949
/* Decide whether a column is to be treated as static.
   A private cursor is opened on the column; the column qualifies if
   VCursorIdRange() reports a row count of 0 for it and same_values()
   accepts all 'test_rows' rows that follow the first one.
   tbl ........ table the column belongs to
   col ........ column to test ( only its name is used )
   test_rows .. number of rows handed to same_values()
   returns .... true if the column qualifies; false otherwise and on any
                failing library call */
static bool vdcd_is_static_column1( const VTable *tbl, col_def *col, uint32_t test_rows )
{
    bool is_static = false;
    const VCursor * curs;
    uint32_t col_idx;
    int64_t first_row;
    uint64_t row_count;

    if ( 0 != VTableCreateCursorRead( tbl, &curs ) )
    {
        return false;
    }

    /* each step runs only if the one before succeeded */
    if ( 0 == VCursorAddColumn( curs, &col_idx, "%s", col -> name ) &&
         0 == VCursorOpen( curs ) &&
         0 == VCursorIdRange( curs, col_idx, &first_row, &row_count ) &&
         0 == row_count )
    {
        is_static = ( same_values( curs, col_idx, first_row, test_rows ) == test_rows );
    }

    VCursorRelease( curs );
    return is_static;
}
977
978 #define TEST_ROWS 20
979
vdcd_extract_static_columns(col_defs * defs,const VTable * tbl,const size_t str_limit,uint32_t * invalid_columns)980 uint32_t vdcd_extract_static_columns( col_defs *defs, const VTable *tbl,
981 const size_t str_limit, uint32_t *invalid_columns )
982 {
983 col_defs * temp_defs;
984 uint32_t res = 0;
985 if ( vdcd_init( &temp_defs, str_limit ) )
986 {
987 uint32_t count = vdcd_extract_from_table( temp_defs, tbl, invalid_columns );
988 uint32_t idx;
989 for ( idx = 0; idx < count; ++idx )
990 {
991 col_def * col = VectorGet( &( temp_defs -> cols ), idx );
992 if ( col != NULL && col -> valid )
993 {
994 if ( vdcd_is_static_column1( tbl, col, TEST_ROWS ) )
995 {
996 p_col_def c = vdcd_append_col( defs, col -> name );
997 if ( c != NULL )
998 {
999 res++;
1000 }
1001 }
1002 }
1003 }
1004 vdcd_destroy( temp_defs );
1005 }
1006 else
1007 {
1008 if ( NULL != invalid_columns )
1009 {
1010 *invalid_columns = 0;
1011 }
1012 }
1013 return res;
1014 }
1015