1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 
27 #include "pl-regions.h"
28 #include <insdc/sra.h>
29 #include <sysalloc.h>
30 
31 #include <stdlib.h>
32 #include <stdio.h>
33 #include <string.h>
34 #include <assert.h>
35 
36 enum
37 {
38     pacbio_idx_spot_id = 0,
39     pacbio_idx_type,
40     pacbio_idx_start,
41     pacbio_idx_end,
42     pacbio_idx_score
43 };
44 
45 
46 /* HDF5-Groups and tables used to use the REGIONS-table */
47 static const char * region_groups_to_check[] =
48 {
49     "PulseData",
50     NULL
51 };
52 
53 static const char * region_tables_to_check[] =
54 {
55     "PulseData/Regions",
56     NULL
57 };
58 
59 
rgn_stat_init(regions_stat * stat)60 void rgn_stat_init( regions_stat * stat )
61 {
62     stat->inserts = 0;
63     stat->expands_a = 0;
64     stat->expands_i = 0;
65     stat->inserts_spots = 0;
66     stat->expands_spots = 0;
67     stat->end_gap = 0;
68     stat->overlapps = 0;
69     stat->removed = 0;
70 }
71 
72 
rgn_init(regions * rgn)73 void rgn_init( regions *rgn )
74 {
75     init_array_file( &rgn->hdf5_regions );
76 
77     VectorInit ( &rgn->read_Regions, 0, 5 );
78     VectorInit ( &rgn->sort_Regions, 0, 5 );
79     VectorInit ( &rgn->stock_Regions, 0, 5 );
80 
81     rgn->data_32 = NULL;
82     rgn->data_32_len = 0;
83     rgn->data_8 = NULL;
84     rgn->data_8_len = 0;
85 
86     rgn->offset = 0;
87     rgn->spot_id = 0;
88     rgn->spot_len = 0;
89     rgn->hq_rgn.start = 0;
90     rgn->hq_rgn.end = 0;
91 
92     rgn_stat_init( &( rgn->stat ) );
93 
94     rgn->complete_table = NULL;
95     rgn->table_index = NULL;
96 }
97 
98 
region_whack(void * item,void * data)99 static void CC region_whack( void * item, void * data )
100 {
101     free( item );
102 }
103 
104 
rgn_vector_move(Vector * src,Vector * dst)105 static rc_t rgn_vector_move( Vector * src, Vector * dst )
106 {
107     rc_t rc = 0;
108     while ( VectorLength( src ) > 0 && rc == 0 )
109     {
110         region *ptr;
111         rc = VectorRemove ( src, 0, (void**)&ptr );
112         if ( rc == 0 )
113             rc = VectorAppend ( dst, NULL, ptr );
114     }
115     return rc;
116 }
117 
118 
rgn_free(regions * rgn)119 void rgn_free( regions *rgn )
120 {
121     free_array_file( &rgn->hdf5_regions );
122 
123     VectorWhack ( &rgn->read_Regions, region_whack, NULL );
124     VectorWhack ( &rgn->sort_Regions, region_whack, NULL );
125     VectorWhack ( &rgn->stock_Regions, region_whack, NULL );
126 
127     if ( rgn->data_32 != NULL )
128         free( rgn->data_32 );
129     if ( rgn->data_8 != NULL )
130         free( rgn->data_8 );
131 
132     if ( rgn->complete_table != NULL )
133         free( rgn->complete_table );
134     if ( rgn->table_index != NULL )
135         free( rgn->table_index );
136 }
137 
138 
139 static
rgn_sort_callback(const void * p1,const void * p2,void * data)140 int64_t CC rgn_sort_callback( const void *p1, const void *p2, void * data )
141 {
142     regions *rgn = ( regions * ) data;
143     int32_t idx1 = *( int32_t * ) p1;
144     int32_t idx2 = *( int32_t * ) p2;
145     int32_t spot_id1 = rgn->complete_table[ idx1 * RGN_COLUMN_COUNT ];
146     int32_t spot_id2 = rgn->complete_table[ idx2 * RGN_COLUMN_COUNT ];
147     if ( spot_id1 == spot_id2 )
148         return ( idx1 - idx2 );
149     return ( spot_id1 - spot_id2 );
150 }
151 
152 
153 static
rgn_read_complete_table(regions * rgn)154 rc_t rgn_read_complete_table( regions *rgn )
155 {
156     rc_t rc;
157     uint32_t rowcount = rgn->hdf5_regions.extents[ 0 ];
158     uint32_t rowsize = sizeof( int32_t ) * RGN_COLUMN_COUNT;
159 
160     rgn->complete_table = malloc( rowcount * rowsize );
161     if ( rgn->complete_table == NULL )
162         rc = RC( rcExe, rcNoTarg, rcLoading, rcMemory, rcExhausted );
163     else
164     {
165         rgn->table_index = malloc( sizeof( uint32_t ) * rowcount );
166         if ( rgn->table_index == NULL )
167         {
168             free( rgn->complete_table );
169             rgn->complete_table = NULL;
170             rc = RC( rcExe, rcNoTarg, rcLoading, rcMemory, rcExhausted );
171         }
172         else
173         {
174             uint64_t n_read = 0;
175 
176             /* now let's read the whole table... */
177             rc = array_file_read_dim2( &(rgn->hdf5_regions), 0, rgn->complete_table,
178                                        rowcount, RGN_COLUMN_COUNT, &n_read );
179             if ( rc == 0 )
180             {
181                 uint32_t idx, first_spot_id;
182 
183                 first_spot_id = rgn->complete_table[ pacbio_idx_spot_id ];
184                 if ( first_spot_id != 0 )
185                 {
186                     /* in case the file we are loading is part of a multi-file submission */
187                     for ( idx = 0; idx < rowcount; ++idx )
188                         rgn->complete_table[ ( idx * RGN_COLUMN_COUNT ) + pacbio_idx_spot_id ] -= first_spot_id;
189                 }
190 
191                 /* first let's fill the index, first with ascending row-id's */
192                 for ( idx = 0; idx < rowcount; ++idx )
193                     rgn->table_index[ idx ] = idx;
194 
195                 /* now sort the index-array by the content's spot-id's */
196                 ksort ( rgn->table_index, rowcount, sizeof( uint32_t ),
197                         rgn_sort_callback, rgn );
198 
199                 /* left here to print a debug-output of the sorted table-index */
200                 /*
201                 for ( idx = rowcount - 128; idx < rowcount; ++idx )
202                     OUTMSG(( "idx[%i] = %i -> %i\n",
203                              idx, rgn->table_index[ idx ],
204                              rgn->complete_table[ rgn->table_index[ idx ] * RGN_COLUMN_COUNT ] ));
205                 */
206 
207                 /* the table and the index is now ready to use... */
208             }
209         }
210     }
211     return rc;
212 }
213 
214 
rgn_open(const KDirectory * hdf5_dir,regions * rgn)215 rc_t rgn_open( const KDirectory *hdf5_dir, regions *rgn )
216 {
217     rc_t rc;
218     rgn_init( rgn );
219     /* check if the necessary groups/tables are there */
220     rc = check_src_objects( hdf5_dir, region_groups_to_check,
221                             region_tables_to_check, false );
222     if ( rc == 0 )
223     {
224         /* open the region-table... */
225         rc = open_element( hdf5_dir, &rgn->hdf5_regions,
226                            region_groups_to_check[ 0 ], "Regions",
227                            REGIONS_BITSIZE, REGIONS_COLS, true, false, true );
228         if ( rc == 0 )
229             rc = rgn_read_complete_table( rgn );
230     }
231     if ( rc != 0 )
232         rgn_free( rgn );
233     return rc;
234 }
235 
236 
rgn_get_or_make(Vector * stock,region ** r)237 static rc_t rgn_get_or_make( Vector * stock, region ** r )
238 {
239     rc_t rc = 0;
240     /* take it out of the stock or make a new one... */
241     if ( VectorLength( stock ) > 0 )
242         rc = VectorRemove ( stock, 0, (void**)r );
243     else
244         *r = malloc( sizeof ** r );
245 
246     if ( *r == NULL )
247         rc = RC( rcExe, rcNoTarg, rcLoading, rcMemory, rcExhausted );
248     return rc;
249 }
250 
251 
rgn_sort_by_start(const void * item,const void * n)252 static int64_t CC rgn_sort_by_start( const void *item, const void *n )
253 {
254     region * v1 = ( region * )item;
255     region * v2 = ( region * )n;
256     if ( v1 -> start != v2 -> start )
257         return ( v1->start - v2->start );
258     return ( v1->end - v2->end );
259 }
260 
261 
rgn_store_bio_or_adapter(Vector * stock,Vector * to,const int32_t * block,int32_t sra_read_type)262 static rc_t rgn_store_bio_or_adapter( Vector * stock, Vector * to,
263                                       const int32_t * block, int32_t sra_read_type )
264 {
265     rc_t rc = 0;
266 
267     if ( block[ pacbio_idx_start ] < block[ pacbio_idx_end ] )
268     {
269         region * a_region;
270         rc = rgn_get_or_make( stock, &a_region );
271         if ( rc == 0 )
272         {
273             a_region->spot_id = block[ pacbio_idx_spot_id ];
274             a_region->type    = sra_read_type;
275             a_region->start   = block[ pacbio_idx_start ];
276             a_region->end     = block[ pacbio_idx_end ];
277 
278             /* see every region shorter as MIN_BIOLOGICAL_LEN
279                as a technical (adapter) region */
280             if ( ( a_region->end - a_region->start ) <= MIN_BIOLOGICAL_LEN )
281                 a_region->type = SRA_READ_TYPE_TECHNICAL;
282 
283             a_region->filter  = SRA_READ_FILTER_PASS;
284             rc = VectorInsert ( to, a_region, NULL, rgn_sort_by_start );
285         }
286     }
287     return rc;
288 }
289 
rgn_store_block(Vector * stock,Vector * to,hq_region * hq,const int32_t * block,region_type_mapping * mapping,bool * have_high_quality)290 static rc_t rgn_store_block( Vector * stock, Vector * to, hq_region * hq,
291                              const int32_t * block, region_type_mapping *mapping,
292                              bool *have_high_quality )
293 {
294     rc_t rc = 0;
295     int32_t type = block[ pacbio_idx_type ];
296 
297     if ( mapping->rgn_type_hq >=0 && type == mapping->rgn_type_hq )
298     {
299         /* it is an error if we have more than one high-quality-region! */
300         assert ( ! * have_high_quality );
301 
302         if ( * have_high_quality )
303         {
304             rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
305             LOGERR( klogErr, rc, "(* have_high_quality) in rgn_store_block()'" );
306             return rc;
307         }
308 
309         hq->start = block[ pacbio_idx_start ];
310         hq->end   = block[ pacbio_idx_end ];
311         * have_high_quality = true;
312     }
313     else if ( mapping->rgn_type_ga >= 0 && type == mapping->rgn_type_ga )
314     {   /* so far do nothing with the "global accuracy" region */
315     }
316     else if ( mapping->rgn_type_adapter >= 0 && type == mapping->rgn_type_adapter )
317     {   /* it is an adapter */
318         rc = rgn_store_bio_or_adapter( stock, to, block, SRA_READ_TYPE_TECHNICAL );
319     }
320     else if ( mapping->rgn_type_insert >= 0 && type == mapping->rgn_type_insert )
321     {   /* it is an insert */
322         rc = rgn_store_bio_or_adapter( stock, to, block, SRA_READ_TYPE_BIOLOGICAL );
323     }
324     else
325     {   /* the type is unknown */
326         ( mapping->count_of_unknown_rgn_types )++;
327         /*
328         rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
329         LOGERR( klogErr, rc, "( region type is unknown ) in rgn_store_block()'" );
330         */
331     }
332     return rc;
333 }
334 
335 
336 /* inserts the generated region into the sort_Regions */
rgn_generate(Vector * stock,Vector * dst,const int32_t spot_id,const uint32_t start,const uint32_t len)337 static rc_t rgn_generate( Vector * stock, Vector * dst,
338                           const int32_t spot_id, const uint32_t start, const uint32_t len )
339 {
340     region * a_region;
341     rc_t rc = rgn_get_or_make( stock, &a_region );
342     if ( rc == 0 )
343     {
344         a_region->spot_id = spot_id;
345         a_region->type    = SRA_READ_TYPE_TECHNICAL; /*means "i dont know"*/
346         a_region->start   = start;
347         a_region->end     = start + len;
348         a_region->filter  = SRA_READ_FILTER_CRITERIA;
349         rc = VectorInsert ( dst, a_region, NULL, rgn_sort_by_start );
350     }
351     return rc;
352 }
353 
354 
355 /* *************************************************************
356 declares all regions inside and touching the HQ-Regions as
357     SRA_READ_FILTER_PASS outside becomes SRA_READ_FILTER_CRITERIA;
358 ************************************************************* */
359 #if 0
360 static rc_t rgn_apply_filter( Vector * stock, Vector * v, hq_region * hq,
361                               const int32_t spot_id, const uint32_t spot_len )
362 {
363     rc_t rc = 0;
364 
365     if ( hq->start == 0 && hq->end == 0 )
366     {
367         /* we have no HQ-Region, discard everything and create one
368            READ for the whole spot, that will be TECHNICAL... */
369         rgn_vector_move( v, stock );
370 
371         if ( spot_len > 0 )
372             /* inserts the generated region into the sort_Regions */
373             rc = rgn_generate( stock, v, spot_id, 0, spot_len );
374     }
375     else
376     {
377         uint32_t i, count = VectorLength ( v );
378         for ( i = 0; i < count; ++ i )
379         {
380             region * a_region = VectorGet ( v, i );
381             if ( a_region != NULL )
382             {
383                 bool set_invalid = ( ( a_region->end <= hq->start ) ||
384                                      ( a_region->start >= hq->end ) );
385                 if ( set_invalid )
386                 {
387                     /* the region is before the hq-region
388                        ---> set to SRA_READ_FILTER_CRITERIA */
389                     a_region->filter = SRA_READ_FILTER_CRITERIA;
390                     a_region->type = SRA_READ_TYPE_TECHNICAL;
391                 }
392                 else
393                 {
394                     /* the region intersects with the hq-region
395                        ---> set to SRA_READ_FILTER_PASS */
396                     a_region->filter = SRA_READ_FILTER_PASS;
397                 }
398             }
399         }
400     }
401     return rc;
402 }
403 #endif
404 
rgn_expand_last_rgn_by_1(Vector * v,int32_t * expands_a,int32_t * expands_i)405 static bool rgn_expand_last_rgn_by_1( Vector * v, int32_t *expands_a, int32_t *expands_i )
406 {
407     region * a_region = VectorLast ( v );
408     bool res = ( a_region != NULL );
409     if ( res )
410     {
411         a_region->end += 1;
412         if ( a_region->type == SRA_READ_TYPE_TECHNICAL )
413             (*expands_a)++;
414         else
415             (*expands_i)++;
416     }
417     return res;
418 }
419 
420 
421 /* *************************************************************
422 if gap is 1, expand previous region by 1 (correct off-by-1)
423 fill in gaps > 1 ( regions are not consecutive )
424 correct overlapping regions
425 fill in a gap at the end, if the last region does not reach spotlen
426 
427     INTPUT : rgn->read_Regions
428     OUTPUT : rgn->sort_Regions
429 ************************************************************* */
rgn_correct(Vector * stock,Vector * from,Vector * to,const uint32_t spot_id,const uint32_t spot_len,regions_stat * stats)430 static rc_t rgn_correct( Vector * stock, Vector * from, Vector * to,
431                          const uint32_t spot_id, const uint32_t spot_len,
432                          regions_stat * stats )
433 {
434     rc_t rc;
435     int32_t start, expands_a = 0, expands_i = 0, inserts = 0;
436     uint32_t i, count = VectorLength ( from );
437 
438     for ( rc = 0, start = 0, i = 0; i < count && rc == 0; ++ i )
439     {
440         region * a_region;
441 
442         /* take the region out of rgn->read_Regions*/
443         rc = VectorRemove ( from, 0, (void**)&a_region );
444         if ( rc == 0 )
445         {
446             int32_t gap_len = ( a_region->start - start );
447 
448             if ( gap_len == 1 )
449             {
450                 /* the gap-length is one, try to expand the previous region */
451                 if ( !rgn_expand_last_rgn_by_1( to, &expands_a, &expands_i ) )
452                 {
453                     /* there is no previous region ! */
454                     rc = rgn_generate( stock, to, spot_id, start, gap_len );
455                 }
456             }
457             else if ( gap_len > 1 )
458             {
459                 /* generate a artificial gap in the middle or the start of the spot */
460                 rc = rgn_generate( stock, to, spot_id, start, gap_len );
461                 inserts++;
462             }
463             else if ( gap_len < 0 )
464             {
465                 /* a negative gap would be an error in the sorting of the regions,
466                    or an overlapp of regions */
467                 if ( ( a_region->start - gap_len )  > a_region->end )
468                 {
469                     rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
470                     LOGERR( klogErr, rc, "((a_region->start-gap_len)>a_region->end) in rgn_correct()'" );
471                 }
472                 else /** move the start point ***/
473                 {
474                     a_region->start -= gap_len;
475                     stats->overlapps++;
476                 }
477             }
478 
479             if ( rc == 0 )
480             {
481                 rc = VectorInsert ( to, a_region, NULL, rgn_sort_by_start );
482                 if ( rc != 0 )
483                     LOGERR( klogErr, rc, "VectorInsert(rgn_sort_by_start) in rgn_correct()'" );
484                 start = a_region->end;
485             }
486             else
487             {
488                 rc_t rc1 = VectorInsert( stock, a_region, NULL, NULL );
489                 if ( rc1 != 0 )
490                     LOGERR( klogErr, rc, "VectorInsert(NULL) in rgn_correct()'" );
491             }
492         }
493     }
494 
495     if ( rc == 0 )
496     {
497         /* if the last region does not reach to the end of the spot */
498         if ( start < spot_len )
499         {
500             int32_t gap_len = ( spot_len - start );
501             if ( gap_len == 1 )
502             {
503                 /* !!! this can also happen if spot_len == 1 !!! */
504                 /* the gap-length is one, try to expand the previous region */
505                 if ( ! rgn_expand_last_rgn_by_1( to, &expands_a, &expands_i ) )
506                 {
507                     /* there is no previous region ! */
508                     rc = rgn_generate( stock, to, spot_id, start, gap_len );
509                     stats->end_gap++;
510                 }
511             }
512             else if ( gap_len > 0 )
513             {
514                 /* fill the gap to the end... */
515                 rc = rgn_generate( stock, to, spot_id, start, gap_len );
516                 stats->end_gap++;
517             }
518             else if ( gap_len < 0 )
519             {
520                 /* a negative gap would be an error in the sorting of the regions,
521                    or an overlapp of regions */
522                 rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
523                 LOGERR( klogErr, rc, "(gap_len<0) in rgn_correct()'" );
524             }
525         }
526     }
527 
528     /* do some statistics */
529     if ( ( expands_i + expands_a ) > 0 )
530     {
531         stats->expands_a += expands_a;
532         stats->expands_i += expands_i;
533         stats->expands_spots++;
534     }
535     if ( inserts > 0 )
536     {
537         stats->inserts += inserts;
538         stats->inserts_spots++;
539     }
540 
541     return rc;
542 }
543 
544 
545 /* *************************************************************
546 tries to merge overlapping adapter-regions...
547 ( uses rgn->read_Regions as scratch-pad )
548 
549     INTPUT : rgn->sort_Regions
550     OUTPUT : rgn->sort_Regions
551 ************************************************************* */
rgn_merge_consecutive_regions(Vector * stock,Vector * from,Vector * to)552 static rc_t rgn_merge_consecutive_regions( Vector * stock, Vector * from, Vector * to )
553 {
554     rc_t rc = 0;
555     uint32_t i, count = VectorLength ( from );
556     region * a_region = NULL;
557     region * prev_region = NULL;
558 
559     for ( i = 0; i < count && rc == 0; ++ i )
560     {
561         /* take the region out of rgn->sort_Regions*/
562         rc = VectorRemove ( from, 0, (void**)&a_region );
563         if ( rc == 0 )
564         {
565             bool copy = true;
566             if ( prev_region != NULL )
567             {
568                 if ( ( a_region->start <= prev_region->end ) &&
569                      ( a_region->type == prev_region->type ) &&
570                      ( a_region->filter == prev_region->filter ) )
571                 {
572                     prev_region->end = a_region->end;
573                     /* put the now unused region back into the stock */
574                     VectorAppend ( stock, NULL, a_region );
575                     /* and keep prev-region! */
576                     copy = false;
577                 }
578             }
579             if ( copy )
580             {
581                 prev_region = a_region;
582                 rc = VectorInsert ( to, a_region, NULL, rgn_sort_by_start );
583             }
584         }
585     }
586     return rc;
587 }
588 
589 /* *************************************************************
590 tests that the spot-len is covered with regions that
591 are not overlapping, have no gaps, start is ascending
592 the regions to check are in rgn->sort_Regions
593 ************************************************************* */
rgn_check(const Vector * v,const uint32_t spot_len)594 static rc_t rgn_check( const Vector * v, const uint32_t spot_len )
595 {
596     rc_t rc = 0;
597     uint32_t i, count = VectorLength ( v );
598     region * a_region = NULL;
599     int32_t start = 0;
600 
601     /* special case, if the spot is empty there are not regions */
602     if ( spot_len == 0 )
603     {
604         if ( count != 0 )
605         {
606             rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
607             LOGERR( klogErr, rc, "(spot_len == 0)&&(count!=0) in region-check()'" );
608         }
609         return rc;
610     }
611 
612     /* check that we have at least one region in the spot */
613     if ( count < 1 )
614     {
615         rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
616         LOGERR( klogErr, rc, "(count<1) in region-check()'" );
617     }
618 
619     for ( i = 0; i < count && rc == 0; ++ i )
620     {
621         a_region = VectorGet ( v, i );
622         if ( a_region == NULL )
623         {
624             rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
625             LOGERR( klogErr, rc, "(a_region==NULL) in region-check()'" );
626         }
627         else
628         {
629             /* check that the region has no gap and is not overlapping */
630             if ( a_region->start != start )
631             {
632                 rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
633                 PLOGERR( klogErr, ( klogErr, rc, "(a_region->start($(rstart))!=start($(start))) in region-check()'",
634                          "rstart=%u,start=%u", a_region->start, start ) );
635             }
636             else
637             {
638                 /* check that the region is ascending */
639                 if ( a_region->end < start )
640                 {
641                     rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
642                     LOGERR( klogErr, rc, "(a_region->end<start) in region-check()'" );
643                 }
644                 start = a_region->end;
645             }
646         }
647     }
648 
649     if ( rc == 0 )
650     {
651         /* check that the region is fully covered */
652         if ( a_region->end != spot_len )
653         {
654             rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
655             LOGERR( klogErr, rc, "(a_region->end!=spot_len) in region-check()'" );
656         }
657     }
658     return rc;
659 }
660 
661 
rgn_load(regions * rgn,const uint32_t spot_id,region_type_mapping * mapping,const uint32_t spot_len)662 rc_t rgn_load( regions *rgn, const uint32_t spot_id,
663                region_type_mapping *mapping, const uint32_t spot_len )
664 {
665     rc_t rc;
666     uint64_t row_count = rgn->hdf5_regions.extents[ 0 ];
667 
668     /* predefine that we have no HQ-regions read */
669     rgn->hq_rgn.start = 0;
670     rgn->hq_rgn.end = 0;
671 
672     /* clear out the read and the sorted vector */
673     rc = rgn_vector_move( &rgn->read_Regions, &rgn->stock_Regions );
674     if ( rc == 0 )
675         rc = rgn_vector_move( &rgn->sort_Regions, &rgn->stock_Regions );
676 
677     if ( rc == 0 )
678     {
679         if ( !( rgn->spot_id == 0 || rgn->spot_id == ( spot_id - 1 ) ) )
680             rc = RC( rcExe, rcNoTarg, rcLoading, rcParam, rcInvalid );
681     }
682 
683     if ( rc == 0 )
684     {
685         int32_t * block;
686         bool have_high_quality = false;
687         do
688         {
689             int32_t idx = rgn->table_index[ rgn->offset ];
690             block = &( rgn->complete_table[ idx * RGN_COLUMN_COUNT ] );
691             if ( block[ pacbio_idx_spot_id ] == spot_id )
692             {
693                 rc = rgn_store_block( &(rgn->stock_Regions), &(rgn->read_Regions),
694                                       &(rgn->hq_rgn), block,
695                                       mapping, & have_high_quality );
696                 if ( rc == 0 )
697                     rgn->offset++;
698             }
699 
700         } while( rc == 0 &&
701                  block[ pacbio_idx_spot_id ] == spot_id &&
702                  rgn->offset < row_count );
703         rgn->spot_id  = spot_id;
704         rgn->spot_len = spot_len;
705     }
706 
707     if ( rc == 0 )
708     {
709 #if 0  /**** does not seem to match PacBio ***/
710         /* changes READ_FILTER and READ_TYPE if a region is completely
711            outside of the hq-region... ( if we have one )
712            if there is none the whole spot becomes one CRITERIA/TECHNICAL-read */
713         rc = rgn_apply_filter( &(rgn->stock_Regions), &(rgn->read_Regions),
714                                &(rgn->hq_rgn), rgn->spot_id, rgn->spot_len );
715 #endif
716 
717 
718         /* INPUT : rgn->read_Regions / OUTPUT : rgn->sort_Regions */
719         /* fills gaps, corrects off-by-1-errors and overlapping regions */
720         if ( rc == 0 )
721             rc = rgn_correct( &(rgn->stock_Regions),
722                               &(rgn->read_Regions), &(rgn->sort_Regions),
723                               rgn->spot_id, rgn->spot_len, &(rgn->stat) );
724 
725         /* INPUT : rgn->sort_Regions / OUTPUT : rgn->read_Regions */
726         /* merges consecutive regions if READ_TYPE/READ_FILTER are the same */
727         if ( rc == 0 )
728             rc = rgn_merge_consecutive_regions( &(rgn->stock_Regions),
729                                                 &(rgn->sort_Regions),
730                                                 &(rgn->read_Regions) );
731 
732         /* INPUT : rgn->read_Regions */
733         /* checks that the whole spot is covered, no overlapps/gaps occur,
734            regions have to be sorted in ascending order */
735 
736         if ( rc == 0 )
737             rc = rgn_check( &(rgn->read_Regions), rgn->spot_len );
738 
739     }
740     return rc;
741 }
742 
743 
rgn_resize_data_32(regions * rgn)744 static rc_t rgn_resize_data_32( regions *rgn )
745 {
746     rc_t rc = 0;
747     size_t needed_len;
748 
749     needed_len = ( sizeof( *rgn->data_32 ) * VectorLength( &rgn->read_Regions ) );
750     if ( rgn->data_32 == NULL )
751     {
752         rgn->data_32 = malloc( needed_len );
753     }
754     else if ( rgn->data_32_len < VectorLength( &rgn->read_Regions ) )
755     {
756         rgn->data_32 = realloc ( rgn->data_32, needed_len );
757     }
758     if ( rgn->data_32 == NULL )
759         rc = RC( rcExe, rcNoTarg, rcLoading, rcMemory, rcExhausted );
760     else
761         rgn->data_32_len = VectorLength( &rgn->read_Regions );
762     return rc;
763 }
764 
765 
rgn_set_filter_value_for_all(regions * rgn,const uint32_t filter_value)766 void rgn_set_filter_value_for_all( regions *rgn, const uint32_t filter_value )
767 {
768     uint32_t i, n = VectorLength( &rgn->read_Regions );
769     for ( i = 0; i < n; ++i )
770     {
771         region * a_region = VectorGet ( &rgn->read_Regions, i );
772         if ( a_region != NULL )
773             a_region->filter = filter_value;
774     }
775 }
776 
777 
rgn_resize_data_8(regions * rgn)778 static rc_t rgn_resize_data_8( regions *rgn )
779 {
780     rc_t rc = 0;
781     size_t needed_len;
782 
783     needed_len = ( sizeof( *rgn->data_8 ) * VectorLength( &rgn->read_Regions ) );
784     needed_len = (needed_len + 3 ) & ~3; /** to make valgrind happy ***/
785     if ( rgn->data_8 == NULL )
786     {
787         rgn->data_8 = malloc( needed_len );
788     }
789     else if ( rgn->data_8_len < VectorLength( &rgn->read_Regions ) )
790     {
791         rgn->data_8 = realloc ( rgn->data_8, needed_len );
792     }
793     if ( rgn->data_8 == NULL )
794         rc = RC( rcExe, rcNoTarg, rcLoading, rcMemory, rcExhausted );
795     else
796         rgn->data_8_len = VectorLength( &rgn->read_Regions );
797     return rc;
798 }
799 
800 
rgn_start_data(regions * rgn,uint32_t * count)801 rc_t rgn_start_data( regions *rgn, uint32_t *count )
802 {
803     rc_t rc = rgn_resize_data_32( rgn );
804     if ( rc == 0 )
805     {
806         uint32_t i;
807         uint32_t *ptr = rgn->data_32;
808         *count = VectorLength( &rgn->read_Regions );
809         for ( i = 0; i < (*count); ++i )
810         {
811             region * a_region = VectorGet ( &rgn->read_Regions, i );
812             if ( a_region != NULL )
813                 ptr[ i ] = a_region->start;
814         }
815     }
816     return rc;
817 }
818 
819 
rgn_len_data(regions * rgn,uint32_t * count)820 rc_t rgn_len_data( regions *rgn, uint32_t *count )
821 {
822     rc_t rc = rgn_resize_data_32( rgn );
823     if ( rc == 0 )
824     {
825         uint32_t i;
826         uint32_t *ptr = rgn->data_32;
827         *count = VectorLength( &rgn->read_Regions );
828         for ( i = 0; i < (*count); ++i )
829         {
830             region * a_region = VectorGet ( &rgn->read_Regions, i );
831             if ( a_region != NULL )
832                 ptr[ i ] = ( a_region->end - a_region->start );
833         }
834     }
835     return rc;
836 }
837 
838 
rgn_type_data(regions * rgn,uint32_t * count)839 rc_t rgn_type_data( regions *rgn, uint32_t *count )
840 {
841     rc_t rc = rgn_resize_data_8( rgn );
842     if ( rc == 0 )
843     {
844         uint32_t i;
845         uint8_t *ptr = rgn->data_8;
846 
847         *count = VectorLength( &rgn->read_Regions );
848         for ( i = 0; i < (*count); ++i )
849         {
850             region * a_region = VectorGet ( &rgn->read_Regions, i );
851             if ( a_region != NULL )
852                 ptr[ i ] = a_region->type;
853         }
854     }
855     return rc;
856 }
857 
858 
rgn_filter_data(regions * rgn,uint32_t * count)859 rc_t rgn_filter_data( regions *rgn, uint32_t *count )
860 {
861     rc_t rc = rgn_resize_data_8( rgn );
862     if ( rc == 0 )
863     {
864         uint32_t i;
865         uint8_t *ptr = rgn->data_8;
866 
867         *count = VectorLength( &rgn->read_Regions );
868         for ( i = 0; i < (*count); ++i )
869         {
870             region * a_region = VectorGet ( &rgn->read_Regions, i );
871             if ( a_region != NULL )
872                 ptr[ i ] = a_region->filter;
873         }
874     }
875     return rc;
876 }
877 
878 
rgn_label_start_data(regions * rgn,uint32_t * count)879 rc_t rgn_label_start_data( regions *rgn, uint32_t *count )
880 {
881     rc_t rc = rgn_resize_data_32( rgn );
882     if ( rc == 0 )
883     {
884         uint32_t i;
885         uint32_t *ptr = rgn->data_32;
886         uint32_t value;
887 
888         *count = VectorLength( &rgn->read_Regions );
889         for ( i = 0; i < (*count); ++i )
890         {
891             region * a_region = VectorGet ( &rgn->read_Regions, i );
892             value = label_lowquality_start; /* default value */
893             if ( a_region != NULL )
894                 switch( a_region->type )
895                 {
896                 case SRA_READ_TYPE_BIOLOGICAL :
897                     value = label_insert_start;
898                     break;
899                 case SRA_READ_TYPE_TECHNICAL  :
900                     value = label_adapter_start;
901 /*
902                     if ( a_region->filter == SRA_READ_FILTER_PASS )
903                         value = label_adapter_start;
904                     else
905                         value = label_lowquality_start;
906 */
907                     break;
908                 }
909             ptr[ i ] = value;
910         }
911     }
912     return rc;
913 }
914 
915 
rgn_label_len_data(regions * rgn,uint32_t * count)916 rc_t rgn_label_len_data( regions *rgn, uint32_t *count )
917 {
918     rc_t rc = rgn_resize_data_32( rgn );
919     if ( rc == 0 )
920     {
921         uint32_t i;
922         uint32_t *ptr = rgn->data_32;
923         uint32_t value;
924 
925         *count = VectorLength( &rgn->read_Regions );
926         for ( i = 0; i < (*count); ++i )
927         {
928             region * a_region = VectorGet ( &rgn->read_Regions, i );
929             value = label_lowquality_len; /* default value */
930             if ( a_region != NULL )
931                 switch ( a_region->type )
932                 {
933                 case SRA_READ_TYPE_BIOLOGICAL :
934                     value = label_insert_len;
935                     break;
936                 case SRA_READ_TYPE_TECHNICAL  :
937                     value = label_adapter_len;
938 /*
939                     if ( a_region->filter == SRA_READ_FILTER_PASS )
940                         value = label_adapter_len;
941                     else
942                         value = label_lowquality_len;
943 */
944                     break;
945                 }
946             ptr[ i ] = value;
947         }
948     }
949     return rc;
950 }
951 
952 
953 static const char rgn_string_adapter[] = "Adapter";
954 static const char rgn_string_insert[] = "Insert";
955 static const char rgn_string_hq[] = "HQRegion";
956 static const char rgn_string_ga[] = "GlobalAccuracy";
957 
958 
rgn_str_cmp(const char * a,const char * b)959 static int rgn_str_cmp( const char *a, const char *b )
960 {
961     size_t asize = string_size ( a );
962     size_t bsize = string_size ( b );
963     return strcase_cmp ( a, asize, b, bsize, ( asize > bsize ) ? asize : bsize );
964 }
965 
966 
rgn_set_type_code(int32_t * dst,const uint32_t type_idx)967 static rc_t rgn_set_type_code( int32_t *dst, const uint32_t type_idx )
968 {
969     if ( *dst == -1 )
970     {
971         *dst = type_idx;
972         return 0;
973     }
974     else
975         return RC( rcExe, rcNoTarg, rcLoading, rcName, rcDuplicate );
976 }
977 
978 
rgn_type_string(const char * type_string,uint32_t type_idx,region_type_mapping * mapping)979 static rc_t rgn_type_string( const char *type_string, uint32_t type_idx,
980                              region_type_mapping *mapping )
981 {
982     if ( rgn_str_cmp( type_string, rgn_string_adapter ) == 0 )
983         return rgn_set_type_code( &(mapping->rgn_type_adapter), type_idx );
984 
985     if ( rgn_str_cmp( type_string, rgn_string_insert ) == 0 )
986         return rgn_set_type_code( &(mapping->rgn_type_insert), type_idx );
987 
988     if ( rgn_str_cmp( type_string, rgn_string_hq ) == 0 )
989         return rgn_set_type_code( &(mapping->rgn_type_hq), type_idx );
990 
991     if ( rgn_str_cmp( type_string, rgn_string_ga ) == 0 )
992         return rgn_set_type_code( &(mapping->rgn_type_ga), type_idx );
993 
994     return RC( rcExe, rcNoTarg, rcLoading, rcName, rcUnknown );
995 }
996 
997 
998 /* read the mapping out of the region-types out of a string... */
rgn_extract_type_mappings(const KNamelist * rgn_names,region_type_mapping * mapping,bool check_completenes)999 rc_t rgn_extract_type_mappings( const KNamelist *rgn_names,
1000                                 region_type_mapping *mapping, bool check_completenes )
1001 {
1002     rc_t rc = 0;
1003     uint32_t count, idx;
1004 
1005     mapping->rgn_type_adapter = -1;
1006     mapping->rgn_type_insert  = -1;
1007     mapping->rgn_type_hq      = -1;
1008     mapping->rgn_type_ga      = -1;
1009 
1010     rc = KNamelistCount ( rgn_names, &count );
1011     for ( idx = 0; idx < count && rc == 0; ++idx )
1012     {
1013         const char *name;
1014         rc = KNamelistGet ( rgn_names, idx, &name );
1015         if ( rc == 0 )
1016             rgn_type_string( name, idx, mapping );
1017     }
1018 
1019     if ( rc == 0 && check_completenes )
1020     {
1021         if ( mapping->rgn_type_adapter == -1 ||
1022              mapping->rgn_type_insert == -1 ||
1023              mapping->rgn_type_hq == -1 )
1024             rc = RC( rcExe, rcNoTarg, rcLoading, rcName, rcIncomplete );
1025     }
1026 
1027     mapping->count_of_unknown_rgn_types = 0;
1028     return rc;
1029 }
1030 
1031 
1032 
rgn_show_type_mappings(region_type_mapping * mapping)1033 rc_t rgn_show_type_mappings( region_type_mapping *mapping )
1034 {
1035     rc_t rc;
1036     if ( mapping == NULL )
1037         rc = RC( rcExe, rcNoTarg, rcLoading, rcParam, rcInvalid );
1038     else
1039         rc = KOutMsg( "rgn-type-mapping->adapter   = %i\n", mapping->rgn_type_adapter );
1040     if ( rc == 0 )
1041         rc = KOutMsg( "rgn-type-mapping->insert    = %i\n", mapping->rgn_type_insert );
1042     if ( rc == 0 )
1043         rc = KOutMsg( "rgn-type-mapping->high_qual = %i\n", mapping->rgn_type_hq );
1044     if ( rc == 0 )
1045         rc = KOutMsg( "rgn-type-mapping->globe_acc = %i\n", mapping->rgn_type_ga );
1046     return rc;
1047 }
1048 
1049