1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 
27 #include "vdb-dump-helper.h"
28 
29 #include "vdb-dump-coldefs.h"
30 
31 #include <klib/vector.h>
32 #include <klib/text.h>
33 #include <klib/printf.h>
34 #include <klib/log.h>
35 #include <klib/rc.h>
36 
37 #include <vdb/vdb-priv.h>
38 
39 #include <sra/sradb.h>
40 #include <sra/pacbio.h>
41 #include <os-native.h>
42 #include <sysalloc.h>
43 
44 /* for platforms */
45 #include <insdc/sra.h>
46 
47 #include <stdlib.h>
48 #include <stdio.h>
49 #include <string.h>
50 #include <assert.h>
51 #include <math.h>
52 
/* Declared here only; the definition is not part of this file.
   vdcd_collect_spread_col() polls it once per row and stops iterating
   as soon as it returns a non-zero rc. */
rc_t Quitting();
54 
/* once we get used to having moved the read descriptor
   out of SRA, we should begin using those names.
   if anyone has an investment in the old names, we may
   want to provide a switch for using them... */

/* The "|| 1" makes this condition always true, so the "SRA_"-prefixed
   spellings are the ones in effect, whatever USE_OLD_SRA_NAME is set to. */
#if USE_OLD_SRA_NAME || 1
/* SRA_NAME( X ) expands to the string literal "SRA_X" */
#define SRA_NAME( name ) \
    "SRA_" #name
/* SRA_NAMES( X, Y ) expands to the string literal "SRA_X|SRA_Y" */
#define SRA_NAMES( name1, name2 ) \
    "SRA_" #name1 "|SRA_" #name2
#else
/* un-prefixed variants "X" and "X|Y" ( never selected at the moment ) */
#define SRA_NAME( name ) \
    # name
#define SRA_NAMES( name1, name2 ) \
    #name1 "|" #name2
#endif
70 
71 /* implementation of the value-translation-functions */
72 
73 const char SRA_PB_HS_0[] = { "SRA_PACBIO_HOLE_SEQUENCING" };
74 const char SRA_PB_HS_1[] = { "SRA_PACBIO_HOLE_ANTIHOLE" };
75 const char SRA_PB_HS_2[] = { "SRA_PACBIO_HOLE_FIDUCIAL" };
76 const char SRA_PB_HS_3[] = { "SRA_PACBIO_HOLE_SUSPECT" };
77 const char SRA_PB_HS_4[] = { "SRA_PACBIO_HOLE_ANTIMIRROR" };
78 const char SRA_PB_HS_5[] = { "SRA_PACBIO_HOLE_FDZMW" };
79 const char SRA_PB_HS_6[] = { "SRA_PACBIO_HOLE_FBZMW" };
80 const char SRA_PB_HS_7[] = { "SRA_PACBIO_HOLE_ANTIBEAMLET" };
81 const char SRA_PB_HS_8[] = { "SRA_PACBIO_HOLE_OUTSIDEFOV" };
82 const char SRA_PB_HS_9[] = { "unknown hole-status" };
83 
vdcd_get_hole_status_txt(const uint32_t id)84 const char *vdcd_get_hole_status_txt( const uint32_t id )
85 {
86     switch( id )
87     {
88         case SRA_PACBIO_HOLE_SEQUENCING     : return( SRA_PB_HS_0 ); break;
89         case SRA_PACBIO_HOLE_ANTIHOLE       : return( SRA_PB_HS_1 ); break;
90         case SRA_PACBIO_HOLE_FIDUCIAL       : return( SRA_PB_HS_2 ); break;
91         case SRA_PACBIO_HOLE_SUSPECT        : return( SRA_PB_HS_3 ); break;
92         case SRA_PACBIO_HOLE_ANTIMIRROR     : return( SRA_PB_HS_4 ); break;
93         case SRA_PACBIO_HOLE_FDZMW          : return( SRA_PB_HS_5 ); break;
94         case SRA_PACBIO_HOLE_FBZMW          : return( SRA_PB_HS_6 ); break;
95         case SRA_PACBIO_HOLE_ANTIBEAMLET    : return( SRA_PB_HS_7 ); break;
96         case SRA_PACBIO_HOLE_OUTSIDEFOV     : return( SRA_PB_HS_8 ); break;
97     }
98     return( SRA_PB_HS_9 );
99 }
100 
vdcd_get_platform_txt(const uint32_t id)101 const char *vdcd_get_platform_txt( const uint32_t id )
102 {
103 #define CASE( id ) \
104     case id : return # id; break
105 
106     switch( id )
107     {
108         CASE ( SRA_PLATFORM_UNDEFINED );
109         CASE ( SRA_PLATFORM_454 );
110         CASE ( SRA_PLATFORM_ILLUMINA );
111         CASE ( SRA_PLATFORM_ABSOLID );
112         CASE ( SRA_PLATFORM_COMPLETE_GENOMICS );
113         CASE ( SRA_PLATFORM_HELICOS );
114         CASE ( SRA_PLATFORM_PACBIO_SMRT );
115         CASE ( SRA_PLATFORM_ION_TORRENT );
116         CASE ( SRA_PLATFORM_CAPILLARY );
117         CASE ( SRA_PLATFORM_OXFORD_NANOPORE );
118     }
119 #undef CASE
120 
121     return "unknown platform";
122 }
123 
124 const char SRA_RT_0[] = { SRA_NAME  ( READ_TYPE_TECHNICAL ) };
125 const char SRA_RT_1[] = { SRA_NAME  ( READ_TYPE_BIOLOGICAL ) };
126 const char SRA_RT_2[] = { SRA_NAMES ( READ_TYPE_TECHNICAL, READ_TYPE_FORWARD ) };
127 const char SRA_RT_3[] = { SRA_NAMES ( READ_TYPE_BIOLOGICAL, READ_TYPE_FORWARD ) };
128 const char SRA_RT_4[] = { SRA_NAMES ( READ_TYPE_TECHNICAL, READ_TYPE_REVERSE ) };
129 const char SRA_RT_5[] = { SRA_NAMES ( READ_TYPE_BIOLOGICAL, READ_TYPE_REVERSE ) };
130 const char SRA_RT_6[] = { "unknown read-type" };
131 
vdcd_get_read_type_txt(const uint32_t id)132 const char *vdcd_get_read_type_txt( const uint32_t id )
133 {
134     switch( id )
135     {
136         case 0 : return( SRA_RT_0 ); break;
137         case 1 : return( SRA_RT_1 ); break;
138         case 2 : return( SRA_RT_2 ); break;
139         case 3 : return( SRA_RT_3 ); break;
140         case 4 : return( SRA_RT_4 ); break;
141         case 5 : return( SRA_RT_5 ); break;
142     }
143     return( SRA_RT_6 );
144 }
145 
146 const char SRA_FT_0[] = { SRA_NAME ( READ_FILTER_PASS ) };
147 const char SRA_FT_1[] = { SRA_NAME ( READ_FILTER_REJECT ) };
148 const char SRA_FT_2[] = { SRA_NAME ( READ_FILTER_CRITERIA ) };
149 const char SRA_FT_3[] = { SRA_NAME ( READ_FILTER_REDACTED ) };
150 const char SRA_FT_4[] = { "unknown read-filter" };
151 
vdcd_get_read_filter_txt(const uint32_t id)152 const char *vdcd_get_read_filter_txt( const uint32_t id )
153 {
154     switch( id )
155     {
156         case 0 : return( SRA_FT_0 ); break;
157         case 1 : return( SRA_FT_1 ); break;
158         case 2 : return( SRA_FT_2 ); break;
159         case 3 : return( SRA_FT_3 ); break;
160     }
161     return( SRA_FT_4 );
162 }
163 
/* hardcoded values taken from asm-trace/interface/sra/sradb.h */
/* Schema type names. vdcd_get_value_trans_fct() resolves each of them
   against the schema to select the matching value-translation function. */
#define SRA_KEY_PLATFORM_ID "INSDC:SRA:platform_id"
#define SRA_KEY_XREAD_TYPE "INSDC:SRA:xread_type"
#define SRA_KEY_READ_TYPE "INSDC:SRA:read_type"
#define SRA_KEY_READ_FILTER "INSDC:SRA:read_filter"
#define SRA_PACBIO_HOLE_STATUS "PacBio:hole:status"
170 
171 
/* Checks a column's typedecl against a type given by name.
   my_schema ... schema used to resolve the type name
   typedecl  ... typedecl of the column in question
   to_check  ... name of the type to compare against
   returns false if the name cannot be resolved, otherwise whatever
   VTypedeclToTypedecl() reports for typedecl versus the resolved type */
static bool vdcd_type_cmp( const VSchema *my_schema, VTypedecl * typedecl, const char * to_check )
{
    VTypedecl resolved;

    if ( 0 != VSchemaResolveTypedecl ( my_schema, &resolved, "%s", to_check ) )
    {
        return false;
    }
    return VTypedeclToTypedecl ( typedecl, my_schema, &resolved, NULL, NULL );
}
182 
183 
vdcd_get_value_trans_fct(const VSchema * my_schema,VTypedecl * typedecl)184 static value_trans_fct_t vdcd_get_value_trans_fct( const VSchema *my_schema, VTypedecl * typedecl )
185 {
186     value_trans_fct_t res = NULL;
187 
188     if ( NULL == my_schema || NULL == typedecl )
189     {
190         return res;
191     }
192 
193     if ( vdcd_type_cmp( my_schema, typedecl, SRA_KEY_PLATFORM_ID ) )
194     {
195         res = vdcd_get_platform_txt;
196     }
197     else if ( vdcd_type_cmp( my_schema, typedecl, SRA_KEY_XREAD_TYPE ) )
198     {
199         res = vdcd_get_read_type_txt;
200     }
201     else if ( vdcd_type_cmp( my_schema, typedecl, SRA_KEY_READ_TYPE ) )
202     {
203         res = vdcd_get_read_type_txt;
204     }
205     else if ( vdcd_type_cmp( my_schema, typedecl, SRA_KEY_READ_FILTER ) )
206     {
207         res = vdcd_get_read_filter_txt;
208     }
209     else if ( vdcd_type_cmp( my_schema, typedecl, SRA_PACBIO_HOLE_STATUS ) )
210     {
211         res = vdcd_get_hole_status_txt;
212     }
213 
214     return res;
215 }
216 
217 
218 /* implementation of the dimension-translation-functions */
vdcd_get_read_desc_txt(const uint8_t * src)219 static char *vdcd_get_read_desc_txt( const uint8_t * src )
220 {
221     char *res = calloc( 1, 120 );
222     SRAReadDesc desc;
223     memmove( &desc, src, sizeof( desc ) );
224     string_printf ( res, 119, NULL,
225               "seg.start=%u, seg.len=%u, type=%u, cs_key=%u, label=%s",
226               desc . seg.start, desc . seg.len, desc . type,
227               desc . cs_key, desc . label );
228     return res;
229 }
230 
vdcd_get_spot_desc_txt(const uint8_t * src)231 static char *vdcd_get_spot_desc_txt( const uint8_t *src )
232 {
233     char *res = calloc( 1, 120 );
234     SRASpotDesc desc;
235     memmove( &desc, src, sizeof( desc ) );
236     string_printf ( res, 119, NULL,
237               "spot_len=%u, fixed_len=%u, signal_len=%u, clip_qual_right=%u, num_reads=%u",
238               desc . spot_len, desc . fixed_len, desc . signal_len,
239               desc . clip_qual_right, desc . num_reads );
240     return res;
241 }
242 
243 /* hardcoded values taken from asm-trace/interface/sra/sradb.h */
244 #define SRA_KEY_READ_DESC "NCBI:SRA:ReadDesc"
245 #define SRA_KEY_SPOT_DESC "NCBI:SRA:SpotDesc"
246 
vdcd_get_dim_trans_fct(const VSchema * my_schema,VTypedecl * typedecl)247 static dim_trans_fct_t vdcd_get_dim_trans_fct( const VSchema *my_schema, VTypedecl * typedecl )
248 {
249     dim_trans_fct_t res = NULL;
250 
251     if ( NULL == my_schema || NULL == typedecl )
252     {
253         return res;
254     }
255 
256     if ( vdcd_type_cmp( my_schema, typedecl, SRA_KEY_READ_DESC ) )
257     {
258         res = vdcd_get_read_desc_txt;
259     }
260     else if ( vdcd_type_cmp( my_schema, typedecl, SRA_KEY_SPOT_DESC ) )
261     {
262         res = vdcd_get_spot_desc_txt;
263     }
264     return res;
265 }
266 
267 
268 const char * const_s_Ascii = "Ascii";
269 const char * const_s_Unicode = "Unicode";
270 const char * const_s_Uint = "Uint";
271 const char * const_s_Int = "Int";
272 const char * const_s_Float = "Float";
273 const char * const_s_Bool = "Bool";
274 const char * const_s_Unknown = "unknown";
275 
vdcd_make_domain_txt(const uint32_t domain)276 char *vdcd_make_domain_txt( const uint32_t domain )
277 {
278     char* res = NULL;
279     switch( domain )
280     {
281         case vtdAscii   : res = string_dup_measure( const_s_Ascii, NULL ); break;
282         case vtdUnicode : res = string_dup_measure( const_s_Unicode, NULL ); break;
283         case vtdUint    : res = string_dup_measure( const_s_Uint, NULL ); break;
284         case vtdInt     : res = string_dup_measure( const_s_Int, NULL ); break;
285         case vtdFloat   : res = string_dup_measure( const_s_Float, NULL ); break;
286         case vtdBool    : res = string_dup_measure( const_s_Bool, NULL ); break;
287         default : res = string_dup_measure( const_s_Unknown, NULL ); break;
288     }
289     return res;
290 }
291 
292 /* a single column-definition */
vdcd_init_col(const char * name,const size_t str_limit)293 p_col_def vdcd_init_col( const char* name, const size_t str_limit )
294 {
295     p_col_def res = NULL;
296     if ( NULL == name ) return res;
297     if ( 0 == name[ 0 ] ) return res;
298     res = ( p_col_def )calloc( 1, sizeof( col_def ) );
299     if ( res != NULL )
300     {
301         res -> name = string_dup_measure ( name, NULL );
302         vds_make( &( res -> content ), str_limit, DUMP_STR_INC );
303     }
304     return res;
305 }
306 
/* Releases a column-definition: its name, its content buffer and the
   definition itself. A NULL argument is ignored. */
void vdcd_destroy_col( p_col_def col_def )
{
    if ( NULL == col_def )
    {
        return;
    }
    if ( NULL != col_def -> name )
    {
        free( col_def -> name );
    }
    vds_free( &( col_def -> content ) );
    free( col_def );
}
319 
320 /* a vector of column-definitions */
/* Creates an empty vector of column-definitions.
   defs      ... receives the new object; set to NULL if the allocation fails
   str_limit ... stored in the object, later used for every appended column
   returns true on success, false if defs is NULL or the allocation failed */
bool vdcd_init( col_defs** defs, const size_t str_limit )
{
    col_defs * created;

    if ( NULL == defs )
    {
        return false;
    }

    created = calloc( 1, sizeof( *created ) );
    *defs = created;
    if ( NULL == created )
    {
        /* nothing to initialize, in particular str_limit must not be written */
        return false;
    }

    VectorInit( &( created -> cols ), 0, 5 );
    created -> max_colname_chars = 0;
    created -> str_limit = str_limit;
    return true;
}
337 
vdcd_destroy_node(void * node,void * data)338 static void CC vdcd_destroy_node( void* node, void* data )
339 {
340     vdcd_destroy_col( ( p_col_def ) node );
341 }
342 
/* Releases the vector of column-definitions together with all the
   column-definitions stored in it. A NULL argument is ignored. */
void vdcd_destroy( col_defs* defs )
{
    if ( NULL == defs )
    {
        return;
    }
    VectorWhack( &( defs -> cols ), vdcd_destroy_node, NULL );
    free( defs );
}
351 
vdcd_append_col(col_defs * defs,const char * name)352 static p_col_def vdcd_append_col( col_defs* defs, const char* name )
353 {
354     p_col_def col = vdcd_init_col( name, defs -> str_limit );
355     if ( NULL != col )
356     {
357         if ( 0 == VectorAppend( &( defs -> cols ), NULL, col ) )
358         {
359             size_t len = string_size( name );
360             if ( len > defs -> max_colname_chars )
361             {
362                 defs -> max_colname_chars = ( uint16_t )len;
363             }
364         }
365     }
366     return col;
367 }
368 
/* Splits a comma-separated list of column-names and appends one
   column-definition per non-empty name.
   defs  ... receives the column-definitions
   src   ... the comma-separated list
   limit ... at most this many characters of src are looked at
   returns the length of the vector after appending ( 0 if defs or src
   is NULL ). Names longer than MAX_COL_NAME_LEN are cut off at that length. */
static uint32_t split_column_string( col_defs* defs, const char* src, size_t limit )
{
    char token[ MAX_COL_NAME_LEN + 1 ];
    size_t token_len = 0;
    size_t pos;

    if ( NULL == defs || NULL == src )
    {
        return 0;
    }

    for ( pos = 0; pos < limit && 0 != src[ pos ]; ++pos )
    {
        const char c = src[ pos ];

        if ( ',' != c )
        {
            /* collect the character, silently dropping the overflow */
            if ( token_len < MAX_COL_NAME_LEN )
            {
                token[ token_len++ ] = c;
            }
            continue;
        }

        /* separator: emit what has been collected, empty names are skipped */
        if ( token_len > 0 )
        {
            token[ token_len ] = 0;
            vdcd_append_col( defs, token );
        }
        token_len = 0;
    }

    /* the last name has no separator behind it */
    if ( token_len > 0 )
    {
        token[ token_len ] = 0;
        vdcd_append_col( defs, token );
    }
    return VectorLength( &defs -> cols );
}
405 
/* Fills defs from a comma-separated list of column-names and probes each
   column against the table.
   defs            ... receives the column-definitions
   src             ... comma-separated column-names ( the first 4096 chars are used )
   tbl             ... table to probe against; if NULL nothing is probed
   invalid_columns ... optional: incremented for every column that could not
                       be added to the probing cursor or has no type-info;
                       it is NOT reset here, may be NULL
   returns the number of columns that passed the probe */
uint32_t vdcd_parse_string( col_defs* defs, const char* src, const VTable *tbl,
                            uint32_t * invalid_columns )
{
    uint32_t valid_columns = 0;
    uint32_t count = split_column_string( defs, src, 4096 );

    if ( count > 0 && NULL != tbl )
    {
        const VCursor *probing_cursor;
        rc_t rc = VTableCreateCursorRead( tbl, &probing_cursor );
        DISP_RC( rc, "VTableCreateCursorRead() failed" );
        if ( 0 == rc )
        {
            uint32_t idx;
            for ( idx = 0; idx < count; ++idx )
            {
                col_def *col = ( col_def * )VectorGet( &( defs -> cols ), idx );
                if ( NULL == col )
                {
                    continue;
                }

                rc = VCursorAddColumn( probing_cursor, &( col -> idx ), "%s", col -> name );
                DISP_RC( rc, "VCursorAddColumn() failed in vdcd_parse_string()" );
                if ( 0 == rc )
                {
                    rc = VCursorDatatype( probing_cursor, col -> idx, &( col -> type_decl ), &( col -> type_desc ) );
                    DISP_RC( rc, "VCursorDatatype() failed" );
                }

                /* a column is valid only if both steps succeeded; a failure
                   does not stop the probing of the remaining columns */
                if ( 0 == rc )
                {
                    valid_columns++;
                    col -> valid = true;
                }
                else
                {
                    col -> valid = false;
                    /* the counter is optional, as in vdcd_extract_from_table() */
                    if ( NULL != invalid_columns )
                    {
                        ( *invalid_columns )++;
                    }
                }
            }
            rc = VCursorRelease( probing_cursor );
            DISP_RC( rc, "VCursorRelease() failed" );
        }
    }
    return valid_columns;
}
455 
456 
vdcd_table_has_column(const VTable * tbl,const char * to_find)457 bool vdcd_table_has_column( const VTable *tbl, const char * to_find )
458 {
459 	bool res = false;
460 	if ( NULL != tbl && NULL != to_find )
461 	{
462 		size_t to_find_len = string_size( to_find );
463 		if ( to_find_len > 0 )
464 		{
465 			KNamelist * names;
466 			rc_t rc = VTableListCol( tbl, &names );
467 			DISP_RC( rc, "VTableListCol() failed" );
468 			if ( 0 == rc )
469 			{
470 				uint32_t n;
471 				rc = KNamelistCount( names, &n );
472 				DISP_RC( rc, "KNamelistCount() failed" );
473 				if ( 0 == rc )
474 				{
475 					uint32_t i;
476 					for ( i = 0; ( i < n ) && ( 0 == rc ) && !res; ++i )
477 					{
478 						const char * col_name;
479 						rc = KNamelistGet( names, i, &col_name );
480 						DISP_RC( rc, "KNamelistGet() failed" );
481 						if ( 0 == rc )
482 						{
483 							size_t col_name_len = string_size( col_name );
484 							if ( col_name_len == to_find_len )
485                             {
486 								res = ( 0 == string_cmp( to_find, to_find_len, col_name, col_name_len, ( uint32_t )col_name_len ) );
487                             }
488 						}
489 					}
490 				}
491 				KNamelistRelease( names );
492 			}
493 		}
494 	}
495 	return res;
496 }
497 
/* Fills defs with one column-definition for every column the table lists
   and probes each of them with a temporary cursor.
   defs            ... receives the column-definitions
   tbl             ... table to take the columns from
   invalid_columns ... optional: reset to 0, then incremented for every
                       column that could not be created, could not be added
                       to the cursor or has no type-info; may be NULL
   returns the number of columns that passed the probe
   NOTE(review): the loop ends at the first column that fails the probe,
   because the rc of the probe is also the loop condition - whereas
   vdcd_parse_string() keeps going. Behavior kept as it was; confirm intent. */
uint32_t vdcd_extract_from_table( col_defs* defs, const VTable *tbl, uint32_t *invalid_columns )
{
    uint32_t found = 0;
    KNamelist *names;
    rc_t rc = VTableListCol( tbl, &names );
    DISP_RC( rc, "VTableListCol() failed" );
    if ( NULL != invalid_columns ) *invalid_columns = 0;
    if ( 0 == rc )
    {
        const VCursor *curs;
        rc = VTableCreateCursorRead( tbl, &curs );
        DISP_RC( rc, "VTableCreateCursorRead() failed" );
        if ( 0 == rc )
        {
            uint32_t n;
            rc = KNamelistCount( names, &n );
            DISP_RC( rc, "KNamelistCount() failed" );
            if ( 0 == rc )
            {
                uint32_t i;
                for ( i = 0; i < n && 0 == rc; ++i )
                {
                    const char *col_name;
                    p_col_def def;

                    rc = KNamelistGet( names, i, &col_name );
                    DISP_RC( rc, "KNamelistGet() failed" );
                    if ( 0 != rc )
                    {
                        continue;   /* the loop condition ends the loop */
                    }

                    def = vdcd_append_col( defs, col_name );
                    if ( NULL == def )
                    {
                        /* no definition ( empty name or out of memory ):
                           count it as invalid instead of dereferencing NULL */
                        if ( NULL != invalid_columns )
                        {
                            ( *invalid_columns )++;
                        }
                        continue;
                    }

                    rc = VCursorAddColumn( curs, &(def->idx), "%s", def -> name );
                    DISP_RC( rc, "VCursorAddColumn() failed in vdcd_extract_from_table()" );
                    if ( 0 == rc )
                    {
                        rc = VCursorDatatype( curs, def->idx, &(def->type_decl), &(def->type_desc) );
                        DISP_RC( rc, "VCursorDatatype() failed" );
                    }

                    /* valid only if the column was added and its type is known */
                    if ( 0 == rc )
                    {
                        found++;
                        def -> valid = true;
                    }
                    else
                    {
                        if ( NULL != invalid_columns )
                        {
                            ( *invalid_columns )++;
                        }
                        def -> valid = false;
                    }
                }
            }
            rc = VCursorRelease( curs );
            DISP_RC( rc, "VCursorRelease() failed" );
        }
        rc = KNamelistRelease( names );
        DISP_RC( rc, "KNamelistRelease() failed" );
    }
    return found;
}
567 
568 
/* Fills defs with one column-definition per physical column of the table.
   defs ... receives the column-definitions
   tbl  ... table to take the physical columns from
   returns true if at least one column-name was retrieved from the list.
   No cursor is involved, the definitions are not probed here. */
bool vdcd_extract_from_phys_table( col_defs* defs, const VTable *tbl )
{
    bool any_found = false;
    KNamelist *names;
    uint32_t n_names;
    rc_t rc = VTableListPhysColumns( tbl, &names );
    DISP_RC( rc, "VTableListPhysColumns() failed" );
    if ( 0 != rc )
    {
        return false;
    }

    rc = KNamelistCount( names, &n_names );
    DISP_RC( rc, "KNamelistCount() failed" );
    if ( 0 == rc )
    {
        uint32_t pos;
        uint32_t retrieved = 0;

        /* stops at the first name that cannot be retrieved */
        for ( pos = 0; 0 == rc && pos < n_names; ++pos )
        {
            const char *col_name;
            rc = KNamelistGet( names, pos, &col_name );
            DISP_RC( rc, "KNamelistGet() failed" );
            if ( 0 == rc )
            {
                vdcd_append_col( defs, col_name );
                retrieved++;
            }
        }
        any_found = ( retrieved > 0 );
    }

    rc = KNamelistRelease( names );
    DISP_RC( rc, "KNamelistRelease() failed" );
    return any_found;
}
601 
/* State handed through VectorForEach() to vdcd_add_1_to_cursor():
   the cursor to add columns to and a running count of the successes. */
typedef struct add_2_cur_context
{
    const VCursor *curs;    /* target cursor; the callback does nothing if it is NULL */
    uint32_t count;         /* incremented for each column whose type-info was read */
} add_2_cur_context;
typedef add_2_cur_context* p_add_2_cur_context;
608 
609 
vdcd_add_1_to_cursor(void * item,void * data)610 static void CC vdcd_add_1_to_cursor( void *item, void *data )
611 {
612     rc_t rc;
613     p_col_def col_def = ( p_col_def )item;
614     p_add_2_cur_context ctx = ( p_add_2_cur_context )data;
615 
616     if ( NULL == col_def || NULL == ctx )
617     {
618         return;
619     }
620     if ( NULL == ctx -> curs || ! col_def -> valid )
621     {
622         return;
623     }
624 
625     rc = VCursorAddColumn( ctx -> curs, &( col_def -> idx ), "%s", col_def -> name );
626     DISP_RC( rc, "VCursorAddColumn() failed in vdcd_add_1_to_cursor" );
627 
628     /***************************************************************************
629     !!! extract type information !!!
630     **************************************************************************/
631     if ( 0 == rc )
632     {
633         rc = VCursorDatatype( ctx -> curs, col_def -> idx,
634                               &( col_def -> type_decl ), &( col_def->type_desc ) );
635         DISP_RC( rc, "VCursorDatatype() failed" );
636         if ( 0 == rc )
637         {
638             ctx -> count++;
639             col_def -> valid = true;
640         }
641     }
642     else
643     {
644         col_def -> valid = false;
645     }
646 }
647 
/* Adds every valid column of defs to the given cursor.
   defs ... the column-definitions ( not checked for NULL )
   curs ... the cursor to add the columns to
   returns the number of columns counted by vdcd_add_1_to_cursor() */
uint32_t vdcd_add_to_cursor( col_defs* defs, const VCursor *curs )
{
    add_2_cur_context walk;

    walk . curs = curs;
    walk . count = 0;
    VectorForEach( &( defs -> cols ), false, vdcd_add_1_to_cursor, &walk );
    return walk . count;
}
656 
vdcd_reset_1_content(void * item,void * data)657 static void CC vdcd_reset_1_content( void *item, void *data )
658 {
659     p_col_def my_col_def = ( p_col_def )item;
660     if ( NULL != my_col_def )
661     {
662         rc_t rc = vds_clear( &( my_col_def -> content ) );
663         DISP_RC( rc, "dump_str_clear() failed" );
664     }
665 }
666 
/* Clears the content buffers of all columns in defs.
   defs ... the column-definitions ( not checked for NULL ) */
void vdcd_reset_content( col_defs* defs )
{
    const Vector * columns = &( defs -> cols );
    VectorForEach( columns, false, vdcd_reset_1_content, NULL );
}
671 
vdcd_ins_1_trans_fkt(void * item,void * data)672 static void CC vdcd_ins_1_trans_fkt( void *item, void *data )
673 {
674     p_col_def col_def = ( p_col_def )item;
675     const VSchema *schema = ( const VSchema * )data;
676 
677     if ( NULL != col_def && NULL != schema )
678     {
679         /* resolves special sra-types and retrieves the addr of
680         a function that later can translate the values into plain-text
681         --- is defined in this file! */
682         col_def -> value_trans_fct = vdcd_get_value_trans_fct( schema, &( col_def -> type_decl ) );
683         col_def -> dim_trans_fct = vdcd_get_dim_trans_fct( schema, &( col_def -> type_decl ) );
684     }
685 }
686 
/* Looks up the translation functions for all columns in defs.
   defs   ... the column-definitions
   schema ... schema to resolve the special types against
   Does nothing if either of them is NULL. */
void vdcd_ins_trans_fkt( col_defs* defs, const VSchema *schema )
{
    if ( NULL == defs || NULL == schema )
    {
        return;
    }
    VectorForEach( &( defs -> cols ), false, vdcd_ins_1_trans_fkt, ( void* )schema );
}
694 
vdcd_exclude_column_cb(void * item,void * data)695 static void CC vdcd_exclude_column_cb( void *item, void *data )
696 {
697     const char * s = ( const char * )data;
698     p_col_def col_def = ( p_col_def )item;
699     if ( NULL != s && NULL != col_def )
700     {
701         if ( 0 == strcmp( col_def -> name, s ) )
702             col_def -> excluded = true;
703     }
704 }
705 
/* Marks every column of defs with the given name as excluded.
   defs        ... the column-definitions ( not checked for NULL )
   column_name ... the name to exclude */
void vdcd_exclude_this_column( col_defs* defs, const char* column_name )
{
    const Vector * columns = &( defs -> cols );
    VectorForEach( columns, false, vdcd_exclude_column_cb, ( void* )column_name );
}
710 
/* Marks all columns named in a comma-separated list as excluded.
   defs         ... the column-definitions
   column_names ... comma-separated column-names
   Does nothing if either of them is NULL. Empty names are skipped, names
   longer than MAX_COL_NAME_LEN are cut off at that length. */
void vdcd_exclude_these_columns( col_defs* defs, const char* column_names )
{
    char token[ MAX_COL_NAME_LEN + 1 ];
    size_t token_len = 0;
    const char * cursor;

    if ( NULL == defs || NULL == column_names )
    {
        return;
    }

    for ( cursor = column_names; 0 != *cursor; ++cursor )
    {
        if ( ',' != *cursor )
        {
            /* collect the character, silently dropping the overflow */
            if ( token_len < MAX_COL_NAME_LEN )
            {
                token[ token_len++ ] = *cursor;
            }
            continue;
        }

        /* separator: handle what has been collected so far */
        if ( token_len > 0 )
        {
            token[ token_len ] = 0;
            vdcd_exclude_this_column( defs, token );
        }
        token_len = 0;
    }

    /* the last name has no separator behind it */
    if ( token_len > 0 )
    {
        token[ token_len ] = 0;
        vdcd_exclude_this_column( defs, token );
    }
}
744 
/* Finds the first column for which the cursor reports a non-empty id-range.
   defs ... the column-definitions
   cur  ... cursor the columns have been added to
   idx  ... receives the cursor-index of that column
   returns true if such a column was found ( *idx is written only then ),
   false otherwise or if an argument is NULL */
bool vdcd_get_first_none_static_column_idx( col_defs *defs, const VCursor *cur, uint32_t *idx )
{
    uint32_t n_cols, first_slot, slot;

    if ( NULL == defs || NULL == cur || NULL == idx )
    {
        return false;
    }

    n_cols = VectorLength( &( defs -> cols ) );
    if ( 0 == n_cols )
    {
        return false;
    }

    first_slot = VectorStart( &( defs -> cols ) );
    for ( slot = first_slot; slot < ( first_slot + n_cols ); ++slot )
    {
        int64_t  first_row;
        uint64_t row_count;
        col_def * cd = VectorGet( &( defs -> cols ), slot );

        if ( NULL == cd )
        {
            continue;
        }
        /* a column whose range query fails or is empty does not qualify */
        if ( 0 == VCursorIdRange( cur, cd -> idx, &first_row, &row_count ) && row_count > 0 )
        {
            *idx = cd -> idx;
            return true;
        }
    }
    return false;
}
776 
777 /* ******************************************************************************************************** */
/* Running statistics over the non-zero integer values of one column,
   filled by COUNTVALUES and reported by vdcd_collect_spread_col(). */
typedef struct spread
{
	uint64_t count;         /* number of non-zero values seen */
	double sum, sum_sq;     /* sum of the values / sum of their squares */
	int64_t min, max;       /* smallest / largest value seen */
} spread;
784 
785 
/*
	Folds the non-zero elements of one row into a spread-accumulator.

	S ... spread * ( count, sum, sum_sq, min and max get updated )
	b ... const void * to the first element of the row
	l ... uint32_t number of elements in the row
	t ... type of the elements ( int64_t, uint64_t ... )

	Elements equal to zero are skipped. The very first counted value seeds
	min as well as max: comparing against a start-value that was casted down
	to t does not work, INT64_MAX casted to a narrower signed type typically
	turns into -1 and the minimum would never be recorded.

	Expands into a brace-block, to be used without a trailing semicolon.
*/
#define COUNTVALUES( S, b, l, t )							\
	{														\
		const t * values = ( b );							\
		uint32_t i;											\
		for ( i = 0; i < ( l ); ++i )						\
		{													\
			t value = values[ i ];							\
			if ( value != 0 )								\
			{												\
				double value_d = ( double ) value;			\
				if ( 0 == (S)->count || value < ( t )(S)->min ) (S)->min = value;	\
				if ( 0 == (S)->count || value > ( t )(S)->max ) (S)->max = value;	\
				(S)->sum += value_d;						\
				(S)->sum_sq += ( value_d * value_d );		\
				(S)->count++;								\
			}												\
		}													\
	}
810 
round_to_uint64_t(double value)811 static uint64_t round_to_uint64_t( double value )
812 {
813 	double floor_value = floor( value );
814 	double x = ( value - floor_value ) > 0.5 ? ceil( value ) : floor_value;
815 	return ( uint64_t )x;
816 }
817 
vdcd_collect_spread_col(const struct num_gen * row_set,col_def * cd,const VCursor * curs)818 static rc_t vdcd_collect_spread_col( const struct num_gen *row_set, col_def *cd, const VCursor * curs )
819 {
820 	const struct num_gen_iter *iter;
821 	rc_t rc = num_gen_iterator_make( row_set, &iter );
822 	if ( 0 == rc )
823 	{
824 		const void * base;
825 		uint32_t row_len, elem_bits;
826 		int64_t row_id;
827 		spread s;
828 		spread * sp = &s;
829 
830 		s . max = 0;
831         s . sum = 0;
832         s . sum_sq = 0;
833         s . count = 0;
834 		s . min = INT64_MAX;
835 
836 		while ( ( 0 == rc ) && num_gen_iterator_next( iter, &row_id, &rc ) )
837 		{
838 			if ( 0 == rc )
839             {
840                 rc = Quitting();
841             }
842 			if ( 0 != rc )
843             {
844                 break;
845             }
846 			rc = VCursorCellDataDirect( curs, row_id, cd -> idx, &elem_bits, &base, NULL, &row_len );
847 			if ( 0 == rc )
848 			{
849 				if ( cd -> type_desc.domain == vtdUint )
850 				{
851 					/* unsigned int's */
852 					switch( elem_bits )
853 					{
854 						case 64 : COUNTVALUES( sp, base, row_len, uint64_t ) break;
855 						case 32 : COUNTVALUES( sp, base, row_len, uint32_t ) break;
856 						case 16 : COUNTVALUES( sp, base, row_len, uint16_t ) break;
857 						case 8  : COUNTVALUES( sp, base, row_len, uint8_t )  break;
858 					}
859 				}
860 				else
861 				{
862 					/* signed int's */
863 					switch( elem_bits )
864 					{
865 						case 64 : COUNTVALUES( sp, base, row_len, int64_t ) break;
866 						case 32 : COUNTVALUES( sp, base, row_len, int32_t ) break;
867 						case 16 : COUNTVALUES( sp, base, row_len, int16_t ) break;
868 						case 8  : COUNTVALUES( sp, base, row_len, int8_t )  break;
869 					}
870 				}
871 			}
872 		}
873 
874 		if ( s . count > 0 )
875 		{
876 			rc = KOutMsg( "\n[%s]\n", cd -> name );
877 			if ( 0 == rc )
878             {
879 				rc = KOutMsg( "min    = %,ld\n", s . min );
880             }
881 			if ( 0 == rc )
882             {
883 				rc = KOutMsg( "max    = %,ld\n", s . max );
884             }
885 			if ( 0 == rc )
886             {
887 				rc = KOutMsg( "count  = %,ld\n", s . count );
888             }
889 			if ( 0 == rc )
890 			{
891 				double median = ( s . sum / s . count );
892 				rc = KOutMsg( "median = %,ld\n", round_to_uint64_t( median ) );
893 				if ( 0 == rc )
894 				{
895 					double stdev = sqrt( ( ( s . sum_sq - ( s . sum * s . sum ) / s . count ) ) / ( s . count - 1 ) );
896 					rc = KOutMsg( "stdev  = %,ld\n", round_to_uint64_t( stdev ) );
897 				}
898 			}
899 		}
900 		num_gen_iterator_destroy( iter );
901 	}
902 	return rc;
903 }
904 #undef COUNTVALUES
905 
vdcd_collect_spread(const struct num_gen * row_set,col_defs * cols,const VCursor * curs)906 rc_t vdcd_collect_spread( const struct num_gen * row_set, col_defs * cols, const VCursor * curs )
907 {
908 	rc_t rc = 0;
909 	uint32_t i, n = VectorLength( &( cols -> cols ) );
910 	for ( i = 0; i < n && 0 == rc; ++i )
911 	{
912 		col_def * cd = VectorGet( &( cols -> cols ), i );
913 		if ( NULL != cd )
914 		{
915 			if ( vtdUint == cd -> type_desc . domain || vtdInt == cd -> type_desc . domain )
916             {
917 				rc = vdcd_collect_spread_col( row_set, cd, curs );
918             }
919 		}
920 	}
921 	return rc;
922 }
923 
same_values(const VCursor * curs,uint32_t col_idx,int64_t first,uint32_t test_rows)924 static uint32_t same_values( const VCursor * curs, uint32_t col_idx, int64_t first, uint32_t test_rows )
925 {
926     uint32_t res = 0;
927     const void * base;
928     uint32_t elem_bits, boff, row_len;
929     rc_t rc = VCursorCellDataDirect( curs, first, col_idx, &elem_bits, &base, &boff, &row_len );
930     while ( 0 == rc && res < test_rows && 0 == rc )
931     {
932         const void * base_1;
933         uint32_t elem_bits_1, boff_1, row_len_1;
934         rc = VCursorCellDataDirect( curs, first + res + 1, col_idx, &elem_bits_1, &base_1, &boff_1, &row_len_1 );
935         if ( 0 == rc )
936         {
937             if ( elem_bits != elem_bits_1 ||
938                  boff != boff_1 ||
939                  row_len != row_len_1 ||
940                  base != base_1 )
941             {
942                 return res;
943             }
944         }
945         res += 1;
946     }
947     return res;
948 }
949 
/* vdcd_is_static_column1
 *  Opens a private read cursor on 'tbl' holding only the column named
 *  col->name and asks for its id range. Only if the reported row count
 *  is 0 are the cells probed with same_values(); the column is reported as
 *  static if all 'test_rows' probed rows match.
 *
 *  NOTE(review): presumably a row count of 0 is how VCursorIdRange()
 *  reports a column without a row range of its own - confirm.
 *
 *  tbl ......... [ IN ] table to open the cursor on
 *  col ......... [ IN ] column definition, only its name is used here
 *  test_rows ... [ IN ] number of rows handed to same_values()
 *
 *  Returns true if the column passed the test; false otherwise, including
 *  every case where one of the cursor calls failed.
 */
static bool vdcd_is_static_column1( const VTable *tbl, col_def *col, uint32_t test_rows )
{
    const VCursor * cursor;
    uint32_t col_idx;
    int64_t first_row;
    uint64_t row_count;
    bool is_static = false;

    rc_t rc = VTableCreateCursorRead( tbl, &cursor );
    if ( 0 != rc )
    {
        /* no cursor was created, nothing to release */
        return false;
    }

    rc = VCursorAddColumn( cursor, &col_idx, "%s", col -> name );
    if ( 0 == rc )
    {
        rc = VCursorOpen( cursor );
    }
    if ( 0 == rc )
    {
        rc = VCursorIdRange( cursor, col_idx, &first_row, &row_count );
    }
    if ( 0 == rc && 0 == row_count )
    {
        is_static = ( test_rows == same_values( cursor, col_idx, first_row, test_rows ) );
    }

    VCursorRelease( cursor );
    return is_static;
}
977 
978 #define TEST_ROWS 20
979 
/* vdcd_extract_static_columns
 *  Collects the columns of 'tbl' into a temporary col_defs container, tests
 *  each valid one with vdcd_is_static_column1() ( probing TEST_ROWS rows )
 *  and appends the name of every column that passes to 'defs'.
 *
 *  defs .............. [ IN/OUT ] receives the names of the static columns
 *  tbl ............... [ IN ]     table to inspect
 *  str_limit ......... [ IN ]     handed to vdcd_init() for the temporary container
 *  invalid_columns ... [ OUT ]    handed to vdcd_extract_from_table(); set to 0
 *                                 ( if not NULL ) when the temporary container
 *                                 cannot be initialized
 *
 *  Returns the number of columns that have been appended to 'defs'.
 */
uint32_t vdcd_extract_static_columns( col_defs *defs, const VTable *tbl,
                                      const size_t str_limit, uint32_t *invalid_columns )
{
    col_defs * scratch;
    uint32_t appended = 0;
    uint32_t total, pos;

    if ( !vdcd_init( &scratch, str_limit ) )
    {
        /* without the temporary container nothing can be extracted */
        if ( NULL != invalid_columns )
        {
            *invalid_columns = 0;
        }
        return appended;
    }

    total = vdcd_extract_from_table( scratch, tbl, invalid_columns );
    for ( pos = 0; pos < total; ++pos )
    {
        col_def * candidate = VectorGet( &( scratch -> cols ), pos );
        if ( NULL == candidate || !( candidate -> valid ) )
        {
            continue;
        }
        if ( !vdcd_is_static_column1( tbl, candidate, TEST_ROWS ) )
        {
            continue;
        }
        /* count only the columns that actually made it into 'defs' */
        if ( NULL != vdcd_append_col( defs, candidate -> name ) )
        {
            appended++;
        }
    }

    vdcd_destroy( scratch );
    return appended;
}
1015