1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #include "pl-regions.h"
28 #include <insdc/sra.h>
29 #include <sysalloc.h>
30
31 #include <stdlib.h>
32 #include <stdio.h>
33 #include <string.h>
34 #include <assert.h>
35
36 enum
37 {
38 pacbio_idx_spot_id = 0,
39 pacbio_idx_type,
40 pacbio_idx_start,
41 pacbio_idx_end,
42 pacbio_idx_score
43 };
44
45
46 /* HDF5-Groups and tables used to use the REGIONS-table */
47 static const char * region_groups_to_check[] =
48 {
49 "PulseData",
50 NULL
51 };
52
53 static const char * region_tables_to_check[] =
54 {
55 "PulseData/Regions",
56 NULL
57 };
58
59
rgn_stat_init(regions_stat * stat)60 void rgn_stat_init( regions_stat * stat )
61 {
62 stat->inserts = 0;
63 stat->expands_a = 0;
64 stat->expands_i = 0;
65 stat->inserts_spots = 0;
66 stat->expands_spots = 0;
67 stat->end_gap = 0;
68 stat->overlapps = 0;
69 stat->removed = 0;
70 }
71
72
rgn_init(regions * rgn)73 void rgn_init( regions *rgn )
74 {
75 init_array_file( &rgn->hdf5_regions );
76
77 VectorInit ( &rgn->read_Regions, 0, 5 );
78 VectorInit ( &rgn->sort_Regions, 0, 5 );
79 VectorInit ( &rgn->stock_Regions, 0, 5 );
80
81 rgn->data_32 = NULL;
82 rgn->data_32_len = 0;
83 rgn->data_8 = NULL;
84 rgn->data_8_len = 0;
85
86 rgn->offset = 0;
87 rgn->spot_id = 0;
88 rgn->spot_len = 0;
89 rgn->hq_rgn.start = 0;
90 rgn->hq_rgn.end = 0;
91
92 rgn_stat_init( &( rgn->stat ) );
93
94 rgn->complete_table = NULL;
95 rgn->table_index = NULL;
96 }
97
98
region_whack(void * item,void * data)99 static void CC region_whack( void * item, void * data )
100 {
101 free( item );
102 }
103
104
rgn_vector_move(Vector * src,Vector * dst)105 static rc_t rgn_vector_move( Vector * src, Vector * dst )
106 {
107 rc_t rc = 0;
108 while ( VectorLength( src ) > 0 && rc == 0 )
109 {
110 region *ptr;
111 rc = VectorRemove ( src, 0, (void**)&ptr );
112 if ( rc == 0 )
113 rc = VectorAppend ( dst, NULL, ptr );
114 }
115 return rc;
116 }
117
118
rgn_free(regions * rgn)119 void rgn_free( regions *rgn )
120 {
121 free_array_file( &rgn->hdf5_regions );
122
123 VectorWhack ( &rgn->read_Regions, region_whack, NULL );
124 VectorWhack ( &rgn->sort_Regions, region_whack, NULL );
125 VectorWhack ( &rgn->stock_Regions, region_whack, NULL );
126
127 if ( rgn->data_32 != NULL )
128 free( rgn->data_32 );
129 if ( rgn->data_8 != NULL )
130 free( rgn->data_8 );
131
132 if ( rgn->complete_table != NULL )
133 free( rgn->complete_table );
134 if ( rgn->table_index != NULL )
135 free( rgn->table_index );
136 }
137
138
139 static
rgn_sort_callback(const void * p1,const void * p2,void * data)140 int64_t CC rgn_sort_callback( const void *p1, const void *p2, void * data )
141 {
142 regions *rgn = ( regions * ) data;
143 int32_t idx1 = *( int32_t * ) p1;
144 int32_t idx2 = *( int32_t * ) p2;
145 int32_t spot_id1 = rgn->complete_table[ idx1 * RGN_COLUMN_COUNT ];
146 int32_t spot_id2 = rgn->complete_table[ idx2 * RGN_COLUMN_COUNT ];
147 if ( spot_id1 == spot_id2 )
148 return ( idx1 - idx2 );
149 return ( spot_id1 - spot_id2 );
150 }
151
152
153 static
rgn_read_complete_table(regions * rgn)154 rc_t rgn_read_complete_table( regions *rgn )
155 {
156 rc_t rc;
157 uint32_t rowcount = rgn->hdf5_regions.extents[ 0 ];
158 uint32_t rowsize = sizeof( int32_t ) * RGN_COLUMN_COUNT;
159
160 rgn->complete_table = malloc( rowcount * rowsize );
161 if ( rgn->complete_table == NULL )
162 rc = RC( rcExe, rcNoTarg, rcLoading, rcMemory, rcExhausted );
163 else
164 {
165 rgn->table_index = malloc( sizeof( uint32_t ) * rowcount );
166 if ( rgn->table_index == NULL )
167 {
168 free( rgn->complete_table );
169 rgn->complete_table = NULL;
170 rc = RC( rcExe, rcNoTarg, rcLoading, rcMemory, rcExhausted );
171 }
172 else
173 {
174 uint64_t n_read = 0;
175
176 /* now let's read the whole table... */
177 rc = array_file_read_dim2( &(rgn->hdf5_regions), 0, rgn->complete_table,
178 rowcount, RGN_COLUMN_COUNT, &n_read );
179 if ( rc == 0 )
180 {
181 uint32_t idx, first_spot_id;
182
183 first_spot_id = rgn->complete_table[ pacbio_idx_spot_id ];
184 if ( first_spot_id != 0 )
185 {
186 /* in case the file we are loading is part of a multi-file submission */
187 for ( idx = 0; idx < rowcount; ++idx )
188 rgn->complete_table[ ( idx * RGN_COLUMN_COUNT ) + pacbio_idx_spot_id ] -= first_spot_id;
189 }
190
191 /* first let's fill the index, first with ascending row-id's */
192 for ( idx = 0; idx < rowcount; ++idx )
193 rgn->table_index[ idx ] = idx;
194
195 /* now sort the index-array by the content's spot-id's */
196 ksort ( rgn->table_index, rowcount, sizeof( uint32_t ),
197 rgn_sort_callback, rgn );
198
199 /* left here to print a debug-output of the sorted table-index */
200 /*
201 for ( idx = rowcount - 128; idx < rowcount; ++idx )
202 OUTMSG(( "idx[%i] = %i -> %i\n",
203 idx, rgn->table_index[ idx ],
204 rgn->complete_table[ rgn->table_index[ idx ] * RGN_COLUMN_COUNT ] ));
205 */
206
207 /* the table and the index is now ready to use... */
208 }
209 }
210 }
211 return rc;
212 }
213
214
rgn_open(const KDirectory * hdf5_dir,regions * rgn)215 rc_t rgn_open( const KDirectory *hdf5_dir, regions *rgn )
216 {
217 rc_t rc;
218 rgn_init( rgn );
219 /* check if the necessary groups/tables are there */
220 rc = check_src_objects( hdf5_dir, region_groups_to_check,
221 region_tables_to_check, false );
222 if ( rc == 0 )
223 {
224 /* open the region-table... */
225 rc = open_element( hdf5_dir, &rgn->hdf5_regions,
226 region_groups_to_check[ 0 ], "Regions",
227 REGIONS_BITSIZE, REGIONS_COLS, true, false, true );
228 if ( rc == 0 )
229 rc = rgn_read_complete_table( rgn );
230 }
231 if ( rc != 0 )
232 rgn_free( rgn );
233 return rc;
234 }
235
236
rgn_get_or_make(Vector * stock,region ** r)237 static rc_t rgn_get_or_make( Vector * stock, region ** r )
238 {
239 rc_t rc = 0;
240 /* take it out of the stock or make a new one... */
241 if ( VectorLength( stock ) > 0 )
242 rc = VectorRemove ( stock, 0, (void**)r );
243 else
244 *r = malloc( sizeof ** r );
245
246 if ( *r == NULL )
247 rc = RC( rcExe, rcNoTarg, rcLoading, rcMemory, rcExhausted );
248 return rc;
249 }
250
251
rgn_sort_by_start(const void * item,const void * n)252 static int64_t CC rgn_sort_by_start( const void *item, const void *n )
253 {
254 region * v1 = ( region * )item;
255 region * v2 = ( region * )n;
256 if ( v1 -> start != v2 -> start )
257 return ( v1->start - v2->start );
258 return ( v1->end - v2->end );
259 }
260
261
rgn_store_bio_or_adapter(Vector * stock,Vector * to,const int32_t * block,int32_t sra_read_type)262 static rc_t rgn_store_bio_or_adapter( Vector * stock, Vector * to,
263 const int32_t * block, int32_t sra_read_type )
264 {
265 rc_t rc = 0;
266
267 if ( block[ pacbio_idx_start ] < block[ pacbio_idx_end ] )
268 {
269 region * a_region;
270 rc = rgn_get_or_make( stock, &a_region );
271 if ( rc == 0 )
272 {
273 a_region->spot_id = block[ pacbio_idx_spot_id ];
274 a_region->type = sra_read_type;
275 a_region->start = block[ pacbio_idx_start ];
276 a_region->end = block[ pacbio_idx_end ];
277
278 /* see every region shorter as MIN_BIOLOGICAL_LEN
279 as a technical (adapter) region */
280 if ( ( a_region->end - a_region->start ) <= MIN_BIOLOGICAL_LEN )
281 a_region->type = SRA_READ_TYPE_TECHNICAL;
282
283 a_region->filter = SRA_READ_FILTER_PASS;
284 rc = VectorInsert ( to, a_region, NULL, rgn_sort_by_start );
285 }
286 }
287 return rc;
288 }
289
rgn_store_block(Vector * stock,Vector * to,hq_region * hq,const int32_t * block,region_type_mapping * mapping,bool * have_high_quality)290 static rc_t rgn_store_block( Vector * stock, Vector * to, hq_region * hq,
291 const int32_t * block, region_type_mapping *mapping,
292 bool *have_high_quality )
293 {
294 rc_t rc = 0;
295 int32_t type = block[ pacbio_idx_type ];
296
297 if ( mapping->rgn_type_hq >=0 && type == mapping->rgn_type_hq )
298 {
299 /* it is an error if we have more than one high-quality-region! */
300 assert ( ! * have_high_quality );
301
302 if ( * have_high_quality )
303 {
304 rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
305 LOGERR( klogErr, rc, "(* have_high_quality) in rgn_store_block()'" );
306 return rc;
307 }
308
309 hq->start = block[ pacbio_idx_start ];
310 hq->end = block[ pacbio_idx_end ];
311 * have_high_quality = true;
312 }
313 else if ( mapping->rgn_type_ga >= 0 && type == mapping->rgn_type_ga )
314 { /* so far do nothing with the "global accuracy" region */
315 }
316 else if ( mapping->rgn_type_adapter >= 0 && type == mapping->rgn_type_adapter )
317 { /* it is an adapter */
318 rc = rgn_store_bio_or_adapter( stock, to, block, SRA_READ_TYPE_TECHNICAL );
319 }
320 else if ( mapping->rgn_type_insert >= 0 && type == mapping->rgn_type_insert )
321 { /* it is an insert */
322 rc = rgn_store_bio_or_adapter( stock, to, block, SRA_READ_TYPE_BIOLOGICAL );
323 }
324 else
325 { /* the type is unknown */
326 ( mapping->count_of_unknown_rgn_types )++;
327 /*
328 rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
329 LOGERR( klogErr, rc, "( region type is unknown ) in rgn_store_block()'" );
330 */
331 }
332 return rc;
333 }
334
335
336 /* inserts the generated region into the sort_Regions */
rgn_generate(Vector * stock,Vector * dst,const int32_t spot_id,const uint32_t start,const uint32_t len)337 static rc_t rgn_generate( Vector * stock, Vector * dst,
338 const int32_t spot_id, const uint32_t start, const uint32_t len )
339 {
340 region * a_region;
341 rc_t rc = rgn_get_or_make( stock, &a_region );
342 if ( rc == 0 )
343 {
344 a_region->spot_id = spot_id;
345 a_region->type = SRA_READ_TYPE_TECHNICAL; /*means "i dont know"*/
346 a_region->start = start;
347 a_region->end = start + len;
348 a_region->filter = SRA_READ_FILTER_CRITERIA;
349 rc = VectorInsert ( dst, a_region, NULL, rgn_sort_by_start );
350 }
351 return rc;
352 }
353
354
355 /* *************************************************************
356 declares all regions inside and touching the HQ-Regions as
357 SRA_READ_FILTER_PASS outside becomes SRA_READ_FILTER_CRITERIA;
358 ************************************************************* */
359 #if 0
360 static rc_t rgn_apply_filter( Vector * stock, Vector * v, hq_region * hq,
361 const int32_t spot_id, const uint32_t spot_len )
362 {
363 rc_t rc = 0;
364
365 if ( hq->start == 0 && hq->end == 0 )
366 {
367 /* we have no HQ-Region, discard everything and create one
368 READ for the whole spot, that will be TECHNICAL... */
369 rgn_vector_move( v, stock );
370
371 if ( spot_len > 0 )
372 /* inserts the generated region into the sort_Regions */
373 rc = rgn_generate( stock, v, spot_id, 0, spot_len );
374 }
375 else
376 {
377 uint32_t i, count = VectorLength ( v );
378 for ( i = 0; i < count; ++ i )
379 {
380 region * a_region = VectorGet ( v, i );
381 if ( a_region != NULL )
382 {
383 bool set_invalid = ( ( a_region->end <= hq->start ) ||
384 ( a_region->start >= hq->end ) );
385 if ( set_invalid )
386 {
387 /* the region is before the hq-region
388 ---> set to SRA_READ_FILTER_CRITERIA */
389 a_region->filter = SRA_READ_FILTER_CRITERIA;
390 a_region->type = SRA_READ_TYPE_TECHNICAL;
391 }
392 else
393 {
394 /* the region intersects with the hq-region
395 ---> set to SRA_READ_FILTER_PASS */
396 a_region->filter = SRA_READ_FILTER_PASS;
397 }
398 }
399 }
400 }
401 return rc;
402 }
403 #endif
404
rgn_expand_last_rgn_by_1(Vector * v,int32_t * expands_a,int32_t * expands_i)405 static bool rgn_expand_last_rgn_by_1( Vector * v, int32_t *expands_a, int32_t *expands_i )
406 {
407 region * a_region = VectorLast ( v );
408 bool res = ( a_region != NULL );
409 if ( res )
410 {
411 a_region->end += 1;
412 if ( a_region->type == SRA_READ_TYPE_TECHNICAL )
413 (*expands_a)++;
414 else
415 (*expands_i)++;
416 }
417 return res;
418 }
419
420
421 /* *************************************************************
422 if gap is 1, expand previous region by 1 (correct off-by-1)
423 fill in gaps > 1 ( regions are not consecutive )
424 correct overlapping regions
425 fill in a gap at the end, if the last region does not reach spotlen
426
427 INTPUT : rgn->read_Regions
428 OUTPUT : rgn->sort_Regions
429 ************************************************************* */
rgn_correct(Vector * stock,Vector * from,Vector * to,const uint32_t spot_id,const uint32_t spot_len,regions_stat * stats)430 static rc_t rgn_correct( Vector * stock, Vector * from, Vector * to,
431 const uint32_t spot_id, const uint32_t spot_len,
432 regions_stat * stats )
433 {
434 rc_t rc;
435 int32_t start, expands_a = 0, expands_i = 0, inserts = 0;
436 uint32_t i, count = VectorLength ( from );
437
438 for ( rc = 0, start = 0, i = 0; i < count && rc == 0; ++ i )
439 {
440 region * a_region;
441
442 /* take the region out of rgn->read_Regions*/
443 rc = VectorRemove ( from, 0, (void**)&a_region );
444 if ( rc == 0 )
445 {
446 int32_t gap_len = ( a_region->start - start );
447
448 if ( gap_len == 1 )
449 {
450 /* the gap-length is one, try to expand the previous region */
451 if ( !rgn_expand_last_rgn_by_1( to, &expands_a, &expands_i ) )
452 {
453 /* there is no previous region ! */
454 rc = rgn_generate( stock, to, spot_id, start, gap_len );
455 }
456 }
457 else if ( gap_len > 1 )
458 {
459 /* generate a artificial gap in the middle or the start of the spot */
460 rc = rgn_generate( stock, to, spot_id, start, gap_len );
461 inserts++;
462 }
463 else if ( gap_len < 0 )
464 {
465 /* a negative gap would be an error in the sorting of the regions,
466 or an overlapp of regions */
467 if ( ( a_region->start - gap_len ) > a_region->end )
468 {
469 rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
470 LOGERR( klogErr, rc, "((a_region->start-gap_len)>a_region->end) in rgn_correct()'" );
471 }
472 else /** move the start point ***/
473 {
474 a_region->start -= gap_len;
475 stats->overlapps++;
476 }
477 }
478
479 if ( rc == 0 )
480 {
481 rc = VectorInsert ( to, a_region, NULL, rgn_sort_by_start );
482 if ( rc != 0 )
483 LOGERR( klogErr, rc, "VectorInsert(rgn_sort_by_start) in rgn_correct()'" );
484 start = a_region->end;
485 }
486 else
487 {
488 rc_t rc1 = VectorInsert( stock, a_region, NULL, NULL );
489 if ( rc1 != 0 )
490 LOGERR( klogErr, rc, "VectorInsert(NULL) in rgn_correct()'" );
491 }
492 }
493 }
494
495 if ( rc == 0 )
496 {
497 /* if the last region does not reach to the end of the spot */
498 if ( start < spot_len )
499 {
500 int32_t gap_len = ( spot_len - start );
501 if ( gap_len == 1 )
502 {
503 /* !!! this can also happen if spot_len == 1 !!! */
504 /* the gap-length is one, try to expand the previous region */
505 if ( ! rgn_expand_last_rgn_by_1( to, &expands_a, &expands_i ) )
506 {
507 /* there is no previous region ! */
508 rc = rgn_generate( stock, to, spot_id, start, gap_len );
509 stats->end_gap++;
510 }
511 }
512 else if ( gap_len > 0 )
513 {
514 /* fill the gap to the end... */
515 rc = rgn_generate( stock, to, spot_id, start, gap_len );
516 stats->end_gap++;
517 }
518 else if ( gap_len < 0 )
519 {
520 /* a negative gap would be an error in the sorting of the regions,
521 or an overlapp of regions */
522 rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
523 LOGERR( klogErr, rc, "(gap_len<0) in rgn_correct()'" );
524 }
525 }
526 }
527
528 /* do some statistics */
529 if ( ( expands_i + expands_a ) > 0 )
530 {
531 stats->expands_a += expands_a;
532 stats->expands_i += expands_i;
533 stats->expands_spots++;
534 }
535 if ( inserts > 0 )
536 {
537 stats->inserts += inserts;
538 stats->inserts_spots++;
539 }
540
541 return rc;
542 }
543
544
545 /* *************************************************************
546 tries to merge overlapping adapter-regions...
547 ( uses rgn->read_Regions as scratch-pad )
548
549 INTPUT : rgn->sort_Regions
550 OUTPUT : rgn->sort_Regions
551 ************************************************************* */
rgn_merge_consecutive_regions(Vector * stock,Vector * from,Vector * to)552 static rc_t rgn_merge_consecutive_regions( Vector * stock, Vector * from, Vector * to )
553 {
554 rc_t rc = 0;
555 uint32_t i, count = VectorLength ( from );
556 region * a_region = NULL;
557 region * prev_region = NULL;
558
559 for ( i = 0; i < count && rc == 0; ++ i )
560 {
561 /* take the region out of rgn->sort_Regions*/
562 rc = VectorRemove ( from, 0, (void**)&a_region );
563 if ( rc == 0 )
564 {
565 bool copy = true;
566 if ( prev_region != NULL )
567 {
568 if ( ( a_region->start <= prev_region->end ) &&
569 ( a_region->type == prev_region->type ) &&
570 ( a_region->filter == prev_region->filter ) )
571 {
572 prev_region->end = a_region->end;
573 /* put the now unused region back into the stock */
574 VectorAppend ( stock, NULL, a_region );
575 /* and keep prev-region! */
576 copy = false;
577 }
578 }
579 if ( copy )
580 {
581 prev_region = a_region;
582 rc = VectorInsert ( to, a_region, NULL, rgn_sort_by_start );
583 }
584 }
585 }
586 return rc;
587 }
588
589 /* *************************************************************
590 tests that the spot-len is covered with regions that
591 are not overlapping, have no gaps, start is ascending
592 the regions to check are in rgn->sort_Regions
593 ************************************************************* */
rgn_check(const Vector * v,const uint32_t spot_len)594 static rc_t rgn_check( const Vector * v, const uint32_t spot_len )
595 {
596 rc_t rc = 0;
597 uint32_t i, count = VectorLength ( v );
598 region * a_region = NULL;
599 int32_t start = 0;
600
601 /* special case, if the spot is empty there are not regions */
602 if ( spot_len == 0 )
603 {
604 if ( count != 0 )
605 {
606 rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
607 LOGERR( klogErr, rc, "(spot_len == 0)&&(count!=0) in region-check()'" );
608 }
609 return rc;
610 }
611
612 /* check that we have at least one region in the spot */
613 if ( count < 1 )
614 {
615 rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
616 LOGERR( klogErr, rc, "(count<1) in region-check()'" );
617 }
618
619 for ( i = 0; i < count && rc == 0; ++ i )
620 {
621 a_region = VectorGet ( v, i );
622 if ( a_region == NULL )
623 {
624 rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
625 LOGERR( klogErr, rc, "(a_region==NULL) in region-check()'" );
626 }
627 else
628 {
629 /* check that the region has no gap and is not overlapping */
630 if ( a_region->start != start )
631 {
632 rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
633 PLOGERR( klogErr, ( klogErr, rc, "(a_region->start($(rstart))!=start($(start))) in region-check()'",
634 "rstart=%u,start=%u", a_region->start, start ) );
635 }
636 else
637 {
638 /* check that the region is ascending */
639 if ( a_region->end < start )
640 {
641 rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
642 LOGERR( klogErr, rc, "(a_region->end<start) in region-check()'" );
643 }
644 start = a_region->end;
645 }
646 }
647 }
648
649 if ( rc == 0 )
650 {
651 /* check that the region is fully covered */
652 if ( a_region->end != spot_len )
653 {
654 rc = RC ( rcExe, rcNoTarg, rcLoading, rcData, rcInconsistent );
655 LOGERR( klogErr, rc, "(a_region->end!=spot_len) in region-check()'" );
656 }
657 }
658 return rc;
659 }
660
661
rgn_load(regions * rgn,const uint32_t spot_id,region_type_mapping * mapping,const uint32_t spot_len)662 rc_t rgn_load( regions *rgn, const uint32_t spot_id,
663 region_type_mapping *mapping, const uint32_t spot_len )
664 {
665 rc_t rc;
666 uint64_t row_count = rgn->hdf5_regions.extents[ 0 ];
667
668 /* predefine that we have no HQ-regions read */
669 rgn->hq_rgn.start = 0;
670 rgn->hq_rgn.end = 0;
671
672 /* clear out the read and the sorted vector */
673 rc = rgn_vector_move( &rgn->read_Regions, &rgn->stock_Regions );
674 if ( rc == 0 )
675 rc = rgn_vector_move( &rgn->sort_Regions, &rgn->stock_Regions );
676
677 if ( rc == 0 )
678 {
679 if ( !( rgn->spot_id == 0 || rgn->spot_id == ( spot_id - 1 ) ) )
680 rc = RC( rcExe, rcNoTarg, rcLoading, rcParam, rcInvalid );
681 }
682
683 if ( rc == 0 )
684 {
685 int32_t * block;
686 bool have_high_quality = false;
687 do
688 {
689 int32_t idx = rgn->table_index[ rgn->offset ];
690 block = &( rgn->complete_table[ idx * RGN_COLUMN_COUNT ] );
691 if ( block[ pacbio_idx_spot_id ] == spot_id )
692 {
693 rc = rgn_store_block( &(rgn->stock_Regions), &(rgn->read_Regions),
694 &(rgn->hq_rgn), block,
695 mapping, & have_high_quality );
696 if ( rc == 0 )
697 rgn->offset++;
698 }
699
700 } while( rc == 0 &&
701 block[ pacbio_idx_spot_id ] == spot_id &&
702 rgn->offset < row_count );
703 rgn->spot_id = spot_id;
704 rgn->spot_len = spot_len;
705 }
706
707 if ( rc == 0 )
708 {
709 #if 0 /**** does not seem to match PacBio ***/
710 /* changes READ_FILTER and READ_TYPE if a region is completely
711 outside of the hq-region... ( if we have one )
712 if there is none the whole spot becomes one CRITERIA/TECHNICAL-read */
713 rc = rgn_apply_filter( &(rgn->stock_Regions), &(rgn->read_Regions),
714 &(rgn->hq_rgn), rgn->spot_id, rgn->spot_len );
715 #endif
716
717
718 /* INPUT : rgn->read_Regions / OUTPUT : rgn->sort_Regions */
719 /* fills gaps, corrects off-by-1-errors and overlapping regions */
720 if ( rc == 0 )
721 rc = rgn_correct( &(rgn->stock_Regions),
722 &(rgn->read_Regions), &(rgn->sort_Regions),
723 rgn->spot_id, rgn->spot_len, &(rgn->stat) );
724
725 /* INPUT : rgn->sort_Regions / OUTPUT : rgn->read_Regions */
726 /* merges consecutive regions if READ_TYPE/READ_FILTER are the same */
727 if ( rc == 0 )
728 rc = rgn_merge_consecutive_regions( &(rgn->stock_Regions),
729 &(rgn->sort_Regions),
730 &(rgn->read_Regions) );
731
732 /* INPUT : rgn->read_Regions */
733 /* checks that the whole spot is covered, no overlapps/gaps occur,
734 regions have to be sorted in ascending order */
735
736 if ( rc == 0 )
737 rc = rgn_check( &(rgn->read_Regions), rgn->spot_len );
738
739 }
740 return rc;
741 }
742
743
rgn_resize_data_32(regions * rgn)744 static rc_t rgn_resize_data_32( regions *rgn )
745 {
746 rc_t rc = 0;
747 size_t needed_len;
748
749 needed_len = ( sizeof( *rgn->data_32 ) * VectorLength( &rgn->read_Regions ) );
750 if ( rgn->data_32 == NULL )
751 {
752 rgn->data_32 = malloc( needed_len );
753 }
754 else if ( rgn->data_32_len < VectorLength( &rgn->read_Regions ) )
755 {
756 rgn->data_32 = realloc ( rgn->data_32, needed_len );
757 }
758 if ( rgn->data_32 == NULL )
759 rc = RC( rcExe, rcNoTarg, rcLoading, rcMemory, rcExhausted );
760 else
761 rgn->data_32_len = VectorLength( &rgn->read_Regions );
762 return rc;
763 }
764
765
rgn_set_filter_value_for_all(regions * rgn,const uint32_t filter_value)766 void rgn_set_filter_value_for_all( regions *rgn, const uint32_t filter_value )
767 {
768 uint32_t i, n = VectorLength( &rgn->read_Regions );
769 for ( i = 0; i < n; ++i )
770 {
771 region * a_region = VectorGet ( &rgn->read_Regions, i );
772 if ( a_region != NULL )
773 a_region->filter = filter_value;
774 }
775 }
776
777
rgn_resize_data_8(regions * rgn)778 static rc_t rgn_resize_data_8( regions *rgn )
779 {
780 rc_t rc = 0;
781 size_t needed_len;
782
783 needed_len = ( sizeof( *rgn->data_8 ) * VectorLength( &rgn->read_Regions ) );
784 needed_len = (needed_len + 3 ) & ~3; /** to make valgrind happy ***/
785 if ( rgn->data_8 == NULL )
786 {
787 rgn->data_8 = malloc( needed_len );
788 }
789 else if ( rgn->data_8_len < VectorLength( &rgn->read_Regions ) )
790 {
791 rgn->data_8 = realloc ( rgn->data_8, needed_len );
792 }
793 if ( rgn->data_8 == NULL )
794 rc = RC( rcExe, rcNoTarg, rcLoading, rcMemory, rcExhausted );
795 else
796 rgn->data_8_len = VectorLength( &rgn->read_Regions );
797 return rc;
798 }
799
800
rgn_start_data(regions * rgn,uint32_t * count)801 rc_t rgn_start_data( regions *rgn, uint32_t *count )
802 {
803 rc_t rc = rgn_resize_data_32( rgn );
804 if ( rc == 0 )
805 {
806 uint32_t i;
807 uint32_t *ptr = rgn->data_32;
808 *count = VectorLength( &rgn->read_Regions );
809 for ( i = 0; i < (*count); ++i )
810 {
811 region * a_region = VectorGet ( &rgn->read_Regions, i );
812 if ( a_region != NULL )
813 ptr[ i ] = a_region->start;
814 }
815 }
816 return rc;
817 }
818
819
rgn_len_data(regions * rgn,uint32_t * count)820 rc_t rgn_len_data( regions *rgn, uint32_t *count )
821 {
822 rc_t rc = rgn_resize_data_32( rgn );
823 if ( rc == 0 )
824 {
825 uint32_t i;
826 uint32_t *ptr = rgn->data_32;
827 *count = VectorLength( &rgn->read_Regions );
828 for ( i = 0; i < (*count); ++i )
829 {
830 region * a_region = VectorGet ( &rgn->read_Regions, i );
831 if ( a_region != NULL )
832 ptr[ i ] = ( a_region->end - a_region->start );
833 }
834 }
835 return rc;
836 }
837
838
rgn_type_data(regions * rgn,uint32_t * count)839 rc_t rgn_type_data( regions *rgn, uint32_t *count )
840 {
841 rc_t rc = rgn_resize_data_8( rgn );
842 if ( rc == 0 )
843 {
844 uint32_t i;
845 uint8_t *ptr = rgn->data_8;
846
847 *count = VectorLength( &rgn->read_Regions );
848 for ( i = 0; i < (*count); ++i )
849 {
850 region * a_region = VectorGet ( &rgn->read_Regions, i );
851 if ( a_region != NULL )
852 ptr[ i ] = a_region->type;
853 }
854 }
855 return rc;
856 }
857
858
rgn_filter_data(regions * rgn,uint32_t * count)859 rc_t rgn_filter_data( regions *rgn, uint32_t *count )
860 {
861 rc_t rc = rgn_resize_data_8( rgn );
862 if ( rc == 0 )
863 {
864 uint32_t i;
865 uint8_t *ptr = rgn->data_8;
866
867 *count = VectorLength( &rgn->read_Regions );
868 for ( i = 0; i < (*count); ++i )
869 {
870 region * a_region = VectorGet ( &rgn->read_Regions, i );
871 if ( a_region != NULL )
872 ptr[ i ] = a_region->filter;
873 }
874 }
875 return rc;
876 }
877
878
rgn_label_start_data(regions * rgn,uint32_t * count)879 rc_t rgn_label_start_data( regions *rgn, uint32_t *count )
880 {
881 rc_t rc = rgn_resize_data_32( rgn );
882 if ( rc == 0 )
883 {
884 uint32_t i;
885 uint32_t *ptr = rgn->data_32;
886 uint32_t value;
887
888 *count = VectorLength( &rgn->read_Regions );
889 for ( i = 0; i < (*count); ++i )
890 {
891 region * a_region = VectorGet ( &rgn->read_Regions, i );
892 value = label_lowquality_start; /* default value */
893 if ( a_region != NULL )
894 switch( a_region->type )
895 {
896 case SRA_READ_TYPE_BIOLOGICAL :
897 value = label_insert_start;
898 break;
899 case SRA_READ_TYPE_TECHNICAL :
900 value = label_adapter_start;
901 /*
902 if ( a_region->filter == SRA_READ_FILTER_PASS )
903 value = label_adapter_start;
904 else
905 value = label_lowquality_start;
906 */
907 break;
908 }
909 ptr[ i ] = value;
910 }
911 }
912 return rc;
913 }
914
915
rgn_label_len_data(regions * rgn,uint32_t * count)916 rc_t rgn_label_len_data( regions *rgn, uint32_t *count )
917 {
918 rc_t rc = rgn_resize_data_32( rgn );
919 if ( rc == 0 )
920 {
921 uint32_t i;
922 uint32_t *ptr = rgn->data_32;
923 uint32_t value;
924
925 *count = VectorLength( &rgn->read_Regions );
926 for ( i = 0; i < (*count); ++i )
927 {
928 region * a_region = VectorGet ( &rgn->read_Regions, i );
929 value = label_lowquality_len; /* default value */
930 if ( a_region != NULL )
931 switch ( a_region->type )
932 {
933 case SRA_READ_TYPE_BIOLOGICAL :
934 value = label_insert_len;
935 break;
936 case SRA_READ_TYPE_TECHNICAL :
937 value = label_adapter_len;
938 /*
939 if ( a_region->filter == SRA_READ_FILTER_PASS )
940 value = label_adapter_len;
941 else
942 value = label_lowquality_len;
943 */
944 break;
945 }
946 ptr[ i ] = value;
947 }
948 }
949 return rc;
950 }
951
952
953 static const char rgn_string_adapter[] = "Adapter";
954 static const char rgn_string_insert[] = "Insert";
955 static const char rgn_string_hq[] = "HQRegion";
956 static const char rgn_string_ga[] = "GlobalAccuracy";
957
958
rgn_str_cmp(const char * a,const char * b)959 static int rgn_str_cmp( const char *a, const char *b )
960 {
961 size_t asize = string_size ( a );
962 size_t bsize = string_size ( b );
963 return strcase_cmp ( a, asize, b, bsize, ( asize > bsize ) ? asize : bsize );
964 }
965
966
rgn_set_type_code(int32_t * dst,const uint32_t type_idx)967 static rc_t rgn_set_type_code( int32_t *dst, const uint32_t type_idx )
968 {
969 if ( *dst == -1 )
970 {
971 *dst = type_idx;
972 return 0;
973 }
974 else
975 return RC( rcExe, rcNoTarg, rcLoading, rcName, rcDuplicate );
976 }
977
978
rgn_type_string(const char * type_string,uint32_t type_idx,region_type_mapping * mapping)979 static rc_t rgn_type_string( const char *type_string, uint32_t type_idx,
980 region_type_mapping *mapping )
981 {
982 if ( rgn_str_cmp( type_string, rgn_string_adapter ) == 0 )
983 return rgn_set_type_code( &(mapping->rgn_type_adapter), type_idx );
984
985 if ( rgn_str_cmp( type_string, rgn_string_insert ) == 0 )
986 return rgn_set_type_code( &(mapping->rgn_type_insert), type_idx );
987
988 if ( rgn_str_cmp( type_string, rgn_string_hq ) == 0 )
989 return rgn_set_type_code( &(mapping->rgn_type_hq), type_idx );
990
991 if ( rgn_str_cmp( type_string, rgn_string_ga ) == 0 )
992 return rgn_set_type_code( &(mapping->rgn_type_ga), type_idx );
993
994 return RC( rcExe, rcNoTarg, rcLoading, rcName, rcUnknown );
995 }
996
997
998 /* read the mapping out of the region-types out of a string... */
rgn_extract_type_mappings(const KNamelist * rgn_names,region_type_mapping * mapping,bool check_completenes)999 rc_t rgn_extract_type_mappings( const KNamelist *rgn_names,
1000 region_type_mapping *mapping, bool check_completenes )
1001 {
1002 rc_t rc = 0;
1003 uint32_t count, idx;
1004
1005 mapping->rgn_type_adapter = -1;
1006 mapping->rgn_type_insert = -1;
1007 mapping->rgn_type_hq = -1;
1008 mapping->rgn_type_ga = -1;
1009
1010 rc = KNamelistCount ( rgn_names, &count );
1011 for ( idx = 0; idx < count && rc == 0; ++idx )
1012 {
1013 const char *name;
1014 rc = KNamelistGet ( rgn_names, idx, &name );
1015 if ( rc == 0 )
1016 rgn_type_string( name, idx, mapping );
1017 }
1018
1019 if ( rc == 0 && check_completenes )
1020 {
1021 if ( mapping->rgn_type_adapter == -1 ||
1022 mapping->rgn_type_insert == -1 ||
1023 mapping->rgn_type_hq == -1 )
1024 rc = RC( rcExe, rcNoTarg, rcLoading, rcName, rcIncomplete );
1025 }
1026
1027 mapping->count_of_unknown_rgn_types = 0;
1028 return rc;
1029 }
1030
1031
1032
rgn_show_type_mappings(region_type_mapping * mapping)1033 rc_t rgn_show_type_mappings( region_type_mapping *mapping )
1034 {
1035 rc_t rc;
1036 if ( mapping == NULL )
1037 rc = RC( rcExe, rcNoTarg, rcLoading, rcParam, rcInvalid );
1038 else
1039 rc = KOutMsg( "rgn-type-mapping->adapter = %i\n", mapping->rgn_type_adapter );
1040 if ( rc == 0 )
1041 rc = KOutMsg( "rgn-type-mapping->insert = %i\n", mapping->rgn_type_insert );
1042 if ( rc == 0 )
1043 rc = KOutMsg( "rgn-type-mapping->high_qual = %i\n", mapping->rgn_type_hq );
1044 if ( rc == 0 )
1045 rc = KOutMsg( "rgn-type-mapping->globe_acc = %i\n", mapping->rgn_type_ga );
1046 return rc;
1047 }
1048
1049