1 /* ===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnologmsgy Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 #include "coverage_iter.h"
27 #include "common.h"
28
29 #include <klib/text.h>
30 #include <klib/log.h>
31 #include <klib/printf.h>
32 #include <klib/out.h>
33
34 #include <kfs/directory.h>
35 #include <kfs/file.h>
36
37 #include <vdb/manager.h>
38 #include <vdb/database.h>
39 #include <vdb/table.h>
40 #include <vdb/cursor.h>
41
42 /* ----------------------------------------------------------------------------------------------- */
43
44 #define COVERAGE_ITER_N_COLS 3
45
46 typedef struct simple_coverage_iter
47 {
48 /* common for both modes: the name of the source */
49 const char * source;
50
51 /* for the CSRA-MODE */
52 const VDatabase * db;
53 const VCursor *curs;
54 uint32_t idx[ COVERAGE_ITER_N_COLS ];
55 row_range to_process;
56 uint64_t rows_processed;
57 int64_t current_row;
58 uint64_t position;
59 uint32_t last_len;
60 } simple_coverage_iter;
61
62
63 /* release an reference-iterator */
simple_coverage_iter_release(struct simple_coverage_iter * self)64 rc_t simple_coverage_iter_release( struct simple_coverage_iter * self )
65 {
66 rc_t rc = 0;
67 if ( self == NULL )
68 rc = RC( rcApp, rcNoTarg, rcReleasing, rcParam, rcNull );
69 else
70 {
71 if ( self->curs != NULL )
72 {
73 rc = VCursorRelease( self->curs );
74 if ( rc != 0 )
75 log_err( "error (%R) releasing VCursor for %s", rc, self->source );
76 }
77 if ( self->db != NULL )
78 {
79 rc = VDatabaseRelease( self->db );
80 if ( rc != 0 )
81 log_err( "error (%R) releasing VDatabase for %s", rc, self->source );
82
83 }
84 free( ( void * ) self );
85 }
86 return rc;
87 }
88
simple_coverage_iter_initialize(struct simple_coverage_iter * self,size_t cache_capacity,const RefT * ref)89 static rc_t simple_coverage_iter_initialize( struct simple_coverage_iter * self,
90 size_t cache_capacity,
91 const RefT * ref )
92 {
93 const VDBManager * mgr;
94 rc_t rc = VDBManagerMakeRead( &mgr, NULL );
95 if ( rc != 0 )
96 log_err( "coverage_iter_initialize.initialize.VDBManagerMakeRead() %R", rc );
97 else
98 {
99 rc = VDBManagerOpenDBRead( mgr, &self->db, NULL, "%s", self->source );
100 if ( rc != 0 )
101 log_err( "coverage_iter_initialize.initialize.VDBManagerOpenDBRead( '%s' ) %R", self->source, rc );
102 else
103 {
104 const VTable * tbl;
105 rc = VDatabaseOpenTableRead( self->db, &tbl, "%s", "REFERENCE" );
106 if ( rc != 0 )
107 log_err( "coverage_iter_initialize.initialize.VDatabaseOpenTableRead( '%s'.REFERENCE ) %R", self->source, rc );
108 else
109 {
110 if ( cache_capacity > 0 )
111 rc = VTableCreateCachedCursorRead( tbl, &self->curs, cache_capacity );
112 else
113 rc = VTableCreateCursorRead( tbl, &self->curs );
114 if ( rc != 0 )
115 log_err( "coverage_iter_initialize.initialize.TableCreateCursorRead( '%s'.REFERENCE ) %R", self->source, rc );
116 else
117 {
118 rc = add_cols_to_cursor( self->curs, self->idx, "REFERENCE", self->source,
119 COVERAGE_ITER_N_COLS,
120 "PRIMARY_ALIGNMENT_IDS",
121 "SECONDARY_ALIGNMENT_IDS",
122 "SEQ_LEN" );
123 if ( rc == 0 )
124 {
125 self->to_process.first_row = ref->start_row_id;
126 self->to_process.row_count = ref->count;
127 self->current_row = ref->start_row_id;
128 }
129 }
130 VTableRelease( tbl );
131 }
132 }
133 VDBManagerRelease( mgr );
134 }
135 return rc;
136 }
137
138 /* construct an reference-iterator from an accession */
simple_coverage_iter_make(struct simple_coverage_iter ** self,const char * src,size_t cache_capacity,const RefT * ref)139 rc_t simple_coverage_iter_make( struct simple_coverage_iter ** self,
140 const char * src,
141 size_t cache_capacity,
142 const RefT * ref )
143 {
144 rc_t rc = 0;
145 if ( self == NULL || src == NULL || ref == NULL )
146 {
147 rc = RC( rcApp, rcNoTarg, rcAllocating, rcParam, rcNull );
148 log_err( "coverage_iter.make() given a NULL-ptr" );
149 }
150 else
151 {
152 simple_coverage_iter * o = calloc( 1, sizeof *o );
153 *self = NULL;
154 if ( o == NULL )
155 {
156 rc = RC( rcApp, rcNoTarg, rcAllocating, rcMemory, rcExhausted );
157 log_err( "coverage_iter_initialize.make() memory exhausted" );
158 }
159 else
160 {
161 o->source = src;
162 rc = simple_coverage_iter_initialize( o, cache_capacity, ref );
163 }
164
165 if ( rc == 0 )
166 *self = o;
167 else
168 simple_coverage_iter_release( o );
169 }
170 return rc;
171 }
172
173
fill_simple_coverage_row(struct simple_coverage_iter * self,int64_t row_id,SimpleCoverageT * ct)174 static rc_t fill_simple_coverage_row( struct simple_coverage_iter * self,
175 int64_t row_id,
176 SimpleCoverageT * ct )
177 {
178 uint32_t elem_bits, boff, row_len;
179 const VCursor * curs = self->curs;
180 uint32_t * idx = self->idx;
181 int64_t * pp64;
182 uint64_t * pp32;
183
184 /* get count of PRIM-ID's */
185 rc_t rc = VCursorCellDataDirect( curs, row_id, idx[ 0 ], &elem_bits, ( const void ** )&pp64, &boff, &row_len );
186 if ( rc != 0 )
187 log_err( "cannot read '%s'.REFERENCE.SEQ_ID[ %ld ] %R", self->source, row_id, rc );
188 else
189 {
190 ct->prim = row_len;
191 ct->prim_ids = pp64;
192 }
193
194 if ( rc == 0 )
195 {
196 /* get count of SEC-ID's */
197 rc = VCursorCellDataDirect( curs, row_id, idx[ 1 ], &elem_bits, ( const void ** )&pp64, &boff, &row_len );
198 if ( rc != 0 )
199 log_err( "cannot read '%s'.REFERENCE.SEQ_LEN[ %ld ] %R", self->source, row_id, rc );
200 else
201 {
202 ct->sec = row_len;
203 ct->sec_ids = pp64;
204 }
205 }
206 if ( rc == 0 )
207 {
208 /* get SEQ_LEN */
209 rc = VCursorCellDataDirect( curs, row_id, idx[ 2 ], &elem_bits, ( const void ** )&pp32, &boff, &row_len );
210 if ( rc != 0 )
211 log_err( "cannot read '%s'.REFERENCE.SEQ_LEN[ %ld ] %R", self->source, row_id, rc );
212 else
213 ct->len = pp32[ 0 ];
214 }
215
216 return rc;
217 }
218
219
220 /* get the next reference from the iter */
simple_coverage_iter_get(struct simple_coverage_iter * self,SimpleCoverageT * ct)221 bool simple_coverage_iter_get( struct simple_coverage_iter * self,
222 SimpleCoverageT * ct )
223 {
224 bool res = ( self != NULL && ct != NULL );
225 if ( res )
226 {
227 res = ( self->rows_processed < self->to_process.row_count );
228 if ( res )
229 {
230 self->position += self->last_len;
231 res = ( fill_simple_coverage_row( self, self->current_row, ct ) == 0 );
232 if ( res )
233 {
234 self->current_row++;
235 self->rows_processed++;
236 self->last_len = ct->len;
237 }
238 }
239 }
240 return res;
241 }
242
243
simple_coverage_iter_get_capped(struct simple_coverage_iter * self,SimpleCoverageT * ct,uint32_t min,uint32_t max)244 bool simple_coverage_iter_get_capped( struct simple_coverage_iter * self,
245 SimpleCoverageT * ct,
246 uint32_t min,
247 uint32_t max )
248 {
249 bool res = false;
250 bool running = true;
251 while ( running )
252 {
253 res = simple_coverage_iter_get( self, ct );
254 if ( res )
255 {
256 running = !( ct->prim >= min && ct->prim <= max );
257 if ( !running )
258 {
259 ct->ref_row_id = self->current_row - 1;
260 ct->start_pos = self->position;
261 }
262 }
263 else
264 running = false;
265 }
266 return res;
267 }
268
269
270 /* ========================================================================================= */
271
272 #define ALIG_ITER_N_COLS 2
273
274 typedef struct simple_alig_iter
275 {
276 /* common for both modes: the name of the source */
277 const char * source;
278
279 /* for the CSRA-MODE */
280 const VCursor *curs;
281 uint32_t idx[ ALIG_ITER_N_COLS ];
282 } simple_alig_iter;
283
284
simple_alig_iter_release(simple_alig_iter * self)285 static rc_t simple_alig_iter_release( simple_alig_iter * self )
286 {
287 rc_t rc = 0;
288 if ( self == NULL )
289 rc = RC( rcApp, rcNoTarg, rcReleasing, rcParam, rcNull );
290 else
291 {
292 if ( self->curs != NULL )
293 {
294 rc = VCursorRelease( self->curs );
295 if ( rc != 0 )
296 log_err( "error (%R) releasing VCursor for %s", rc, self->source );
297 }
298 free( ( void * ) self );
299 }
300 return rc;
301 }
302
303
simple_alig_iter_initialize(simple_alig_iter * self,const VDatabase * db,size_t cache_capacity)304 static rc_t simple_alig_iter_initialize( simple_alig_iter * self,
305 const VDatabase * db,
306 size_t cache_capacity )
307 {
308 const VTable * tbl;
309 rc_t rc = VDatabaseOpenTableRead( db, &tbl, "%s", "PRIMARY_ALIGNMENT" );
310 if ( rc != 0 )
311 log_err( "simple_alig_iter_initialize.VDatabaseOpenTableRead( '%s'.PRIMARY_ALIGNMENT ) %R", self->source, rc );
312 else
313 {
314 if ( cache_capacity > 0 )
315 rc = VTableCreateCachedCursorRead( tbl, &self->curs, cache_capacity );
316 else
317 rc = VTableCreateCursorRead( tbl, &self->curs );
318 if ( rc != 0 )
319 log_err( "simple_alig_iter_initialize.TableCreateCursorRead( '%s'.REFERENCE ) %R", self->source, rc );
320 else
321 {
322 rc = add_cols_to_cursor( self->curs, self->idx, "PRIMARY_ALIGNMENT", self->source,
323 ALIG_ITER_N_COLS,
324 "REF_POS",
325 "REF_LEN" );
326 }
327 VTableRelease( tbl );
328 }
329 return rc;
330 }
331
simple_alig_iter_make(simple_alig_iter ** self,const VDatabase * db,const char * src,size_t cache_capacity)332 static rc_t simple_alig_iter_make( simple_alig_iter ** self,
333 const VDatabase * db,
334 const char * src,
335 size_t cache_capacity )
336 {
337 rc_t rc = 0;
338 if ( self == NULL || src == NULL || db == NULL )
339 {
340 rc = RC( rcApp, rcNoTarg, rcAllocating, rcParam, rcNull );
341 log_err( "simple_alig_iter_make() given a NULL-ptr" );
342 }
343 else
344 {
345 simple_alig_iter * o = calloc( 1, sizeof *o );
346 *self = NULL;
347 if ( o == NULL )
348 {
349 rc = RC( rcApp, rcNoTarg, rcAllocating, rcMemory, rcExhausted );
350 log_err( "simple_alig_iter_make() memory exhausted" );
351 }
352 else
353 {
354 o->source = src;
355 rc = simple_alig_iter_initialize( o, db, cache_capacity );
356 }
357
358 if ( rc == 0 )
359 *self = o;
360 else
361 simple_alig_iter_release( o );
362 }
363 return rc;
364 }
365
366
simple_alig_iter_read(const simple_alig_iter * self,int64_t row_id,uint32_t * start,uint32_t * len)367 static rc_t simple_alig_iter_read( const simple_alig_iter * self,
368 int64_t row_id,
369 uint32_t * start,
370 uint32_t * len )
371 {
372 uint32_t elem_bits, boff, row_len;
373 const VCursor * curs = self->curs;
374 const uint32_t * idx = self->idx;
375 uint64_t * pp32;
376
377 /* get REF_POS */
378 rc_t rc = VCursorCellDataDirect( curs, row_id, idx[ 0 ], &elem_bits, ( const void ** )&pp32, &boff, &row_len );
379 if ( rc != 0 )
380 log_err( "cannot read '%s'.REFERENCE.SEQ_ID[ %ld ] %R", self->source, row_id, rc );
381 else
382 *start = pp32[ 0 ];
383
384 if ( rc == 0 )
385 {
386 /* row_len of HAS_MISMATCH */
387 rc = VCursorCellDataDirect( curs, row_id, idx[ 1 ], &elem_bits, ( const void ** )&pp32, &boff, &row_len );
388 if ( rc != 0 )
389 log_err( "cannot read '%s'.REFERENCE.SEQ_LEN[ %ld ] %R", self->source, row_id, rc );
390 else
391 *len = pp32[ 0 ];
392 }
393 return rc;
394
395 }
396
397
detailed_coverage_make(const char * src,size_t cache_capacity,const slice * slice,DetailedCoverage * dcoverage)398 rc_t detailed_coverage_make( const char * src,
399 size_t cache_capacity,
400 const slice * slice,
401 DetailedCoverage * dcoverage )
402 {
403 rc_t rc = 0;
404 if ( src == NULL || slice == NULL || dcoverage == NULL )
405 {
406 rc = RC( rcApp, rcNoTarg, rcAllocating, rcParam, rcNull );
407 log_err( "detailed_coverage_make() given a NULL-ptr" );
408 }
409 else
410 {
411 dcoverage->coverage = calloc( slice->count, sizeof *( dcoverage->coverage ) );
412 if ( dcoverage->coverage == NULL )
413 {
414 rc = RC( rcApp, rcNoTarg, rcAllocating, rcMemory, rcExhausted );
415 log_err( "detailed_coverage_make() memory exhausted" );
416 }
417 else
418 {
419 RefT * ref;
420 rc = ref_iter_find( &ref, src, cache_capacity, slice->refname );
421 if ( rc == 0 )
422 {
423 simple_coverage_iter * c_iter;
424
425 dcoverage->start_pos = slice->start;
426 dcoverage->len = slice->count;
427
428 /* adjust *ref to restrict it to the given slice... */
429 if ( slice->count > 0 )
430 {
431 int64_t start_offset = slice->start / ref->block_size;
432 int64_t end_offset = slice->end / ref->block_size;
433 if ( start_offset > 0 ) start_offset--;
434 ref->start_row_id += start_offset;
435 ref->count = ( end_offset - start_offset ) + 1;
436 }
437
438 rc = simple_coverage_iter_make( &c_iter, src, cache_capacity, ref );
439 if ( rc == 0 )
440 {
441 simple_alig_iter * a_iter;
442
443 rc = simple_alig_iter_make( &a_iter, c_iter->db, src, cache_capacity );
444 if ( rc == 0 )
445 {
446 SimpleCoverageT ct;
447 while( rc == 0 &&
448 simple_coverage_iter_get( c_iter, &ct ) )
449 {
450 uint32_t idx;
451 for ( idx = 0; rc == 0 && idx < ct.prim; ++idx )
452 {
453 uint32_t ref_pos, ref_len;
454 rc = simple_alig_iter_read( a_iter, ct.prim_ids[ idx ], &ref_pos, &ref_len );
455 if ( rc == 0 )
456 {
457 if ( ( ( ref_pos + ref_len - 1 ) >= slice->start ) &&
458 ref_pos < slice->end )
459 {
460 int32_t rel_start = ( ref_pos - ( uint32_t )slice->start );
461 int32_t i;
462
463 if ( rel_start < 0 )
464 {
465 ref_len += rel_start;
466 rel_start = 0;
467 }
468 for ( i = rel_start; i < ref_len && i < slice->count; ++i )
469 {
470 dcoverage->coverage[ i ]++;
471 }
472 }
473 }
474 }
475 }
476 simple_alig_iter_release( a_iter );
477 }
478 simple_coverage_iter_release( c_iter );
479 }
480 RefT_release( ref );
481 }
482 }
483 }
484 return rc;
485 }
486
487
detailed_coverage_release(DetailedCoverage * self)488 rc_t detailed_coverage_release( DetailedCoverage * self )
489 {
490 rc_t rc = 0;
491 if ( self == NULL )
492 rc = RC( rcApp, rcNoTarg, rcReleasing, rcParam, rcNull );
493 else
494 {
495 if ( self->coverage != NULL )
496 {
497 free( ( void * ) self->coverage );
498 self->coverage = NULL;
499 }
500 }
501 return rc;
502
503 }