1 /* ===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnologmsgy Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 #include "coverage_iter.h"
27 #include "common.h"
28 
29 #include <klib/text.h>
30 #include <klib/log.h>
31 #include <klib/printf.h>
32 #include <klib/out.h>
33 
34 #include <kfs/directory.h>
35 #include <kfs/file.h>
36 
37 #include <vdb/manager.h>
38 #include <vdb/database.h>
39 #include <vdb/table.h>
40 #include <vdb/cursor.h>
41 
42 /* ----------------------------------------------------------------------------------------------- */
43 
44 #define COVERAGE_ITER_N_COLS 3
45 
46 typedef struct simple_coverage_iter
47 {
48     /* common for both modes: the name of the source */
49     const char * source;
50 
51     /* for the CSRA-MODE */
52     const VDatabase * db;
53     const VCursor *curs;
54     uint32_t idx[ COVERAGE_ITER_N_COLS ];
55     row_range to_process;
56     uint64_t rows_processed;
57     int64_t current_row;
58     uint64_t position;
59     uint32_t last_len;
60 } simple_coverage_iter;
61 
62 
63 /* release an reference-iterator */
simple_coverage_iter_release(struct simple_coverage_iter * self)64 rc_t simple_coverage_iter_release( struct simple_coverage_iter * self )
65 {
66     rc_t rc = 0;
67     if ( self == NULL )
68         rc = RC( rcApp, rcNoTarg, rcReleasing, rcParam, rcNull );
69     else
70     {
71         if ( self->curs != NULL )
72         {
73             rc = VCursorRelease( self->curs );
74             if ( rc != 0 )
75                 log_err( "error (%R) releasing VCursor for %s", rc, self->source );
76         }
77         if ( self->db != NULL )
78         {
79             rc = VDatabaseRelease( self->db );
80             if ( rc != 0 )
81                 log_err( "error (%R) releasing VDatabase for %s", rc, self->source );
82 
83         }
84         free( ( void * ) self );
85     }
86     return rc;
87 }
88 
simple_coverage_iter_initialize(struct simple_coverage_iter * self,size_t cache_capacity,const RefT * ref)89 static rc_t simple_coverage_iter_initialize( struct simple_coverage_iter * self,
90                                              size_t cache_capacity,
91                                              const RefT * ref )
92 {
93     const VDBManager * mgr;
94     rc_t rc = VDBManagerMakeRead( &mgr, NULL );
95     if ( rc != 0 )
96         log_err( "coverage_iter_initialize.initialize.VDBManagerMakeRead() %R", rc );
97     else
98     {
99         rc = VDBManagerOpenDBRead( mgr, &self->db, NULL, "%s", self->source );
100         if ( rc != 0 )
101             log_err( "coverage_iter_initialize.initialize.VDBManagerOpenDBRead( '%s' ) %R", self->source, rc );
102         else
103         {
104             const VTable * tbl;
105             rc = VDatabaseOpenTableRead( self->db, &tbl, "%s", "REFERENCE" );
106             if ( rc != 0 )
107                 log_err( "coverage_iter_initialize.initialize.VDatabaseOpenTableRead( '%s'.REFERENCE ) %R", self->source, rc );
108             else
109             {
110                 if ( cache_capacity > 0 )
111                     rc = VTableCreateCachedCursorRead( tbl, &self->curs, cache_capacity );
112                 else
113                     rc = VTableCreateCursorRead( tbl, &self->curs );
114                 if ( rc != 0 )
115                     log_err( "coverage_iter_initialize.initialize.TableCreateCursorRead( '%s'.REFERENCE ) %R", self->source, rc );
116                 else
117                 {
118                     rc = add_cols_to_cursor( self->curs, self->idx, "REFERENCE", self->source,
119                                              COVERAGE_ITER_N_COLS,
120                                              "PRIMARY_ALIGNMENT_IDS",
121                                              "SECONDARY_ALIGNMENT_IDS",
122                                              "SEQ_LEN" );
123                     if ( rc == 0 )
124                     {
125                         self->to_process.first_row = ref->start_row_id;
126                         self->to_process.row_count = ref->count;
127                         self->current_row = ref->start_row_id;
128                     }
129                 }
130                 VTableRelease( tbl );
131             }
132         }
133         VDBManagerRelease( mgr );
134     }
135     return rc;
136 }
137 
138 /* construct an reference-iterator from an accession */
simple_coverage_iter_make(struct simple_coverage_iter ** self,const char * src,size_t cache_capacity,const RefT * ref)139 rc_t simple_coverage_iter_make( struct simple_coverage_iter ** self,
140                                 const char * src,
141                                 size_t cache_capacity,
142                                 const RefT * ref )
143 {
144     rc_t rc = 0;
145     if ( self == NULL || src == NULL || ref == NULL )
146     {
147         rc = RC( rcApp, rcNoTarg, rcAllocating, rcParam, rcNull );
148         log_err( "coverage_iter.make() given a NULL-ptr" );
149     }
150     else
151     {
152         simple_coverage_iter * o = calloc( 1, sizeof *o );
153         *self = NULL;
154         if ( o == NULL )
155         {
156             rc = RC( rcApp, rcNoTarg, rcAllocating, rcMemory, rcExhausted );
157             log_err( "coverage_iter_initialize.make() memory exhausted" );
158         }
159         else
160         {
161             o->source = src;
162             rc = simple_coverage_iter_initialize( o, cache_capacity, ref );
163         }
164 
165         if ( rc == 0 )
166             *self = o;
167         else
168             simple_coverage_iter_release( o );
169     }
170     return rc;
171 }
172 
173 
fill_simple_coverage_row(struct simple_coverage_iter * self,int64_t row_id,SimpleCoverageT * ct)174 static rc_t fill_simple_coverage_row( struct simple_coverage_iter * self,
175                                       int64_t row_id,
176                                       SimpleCoverageT * ct )
177 {
178     uint32_t elem_bits, boff, row_len;
179     const VCursor * curs = self->curs;
180     uint32_t * idx = self->idx;
181     int64_t * pp64;
182     uint64_t * pp32;
183 
184     /* get count of PRIM-ID's */
185     rc_t rc = VCursorCellDataDirect( curs, row_id, idx[ 0 ], &elem_bits, ( const void ** )&pp64, &boff, &row_len );
186     if ( rc != 0 )
187         log_err( "cannot read '%s'.REFERENCE.SEQ_ID[ %ld ] %R", self->source, row_id, rc );
188     else
189     {
190         ct->prim = row_len;
191         ct->prim_ids = pp64;
192     }
193 
194     if ( rc == 0 )
195     {
196         /* get count of SEC-ID's */
197         rc = VCursorCellDataDirect( curs, row_id, idx[ 1 ], &elem_bits, ( const void ** )&pp64, &boff, &row_len );
198         if ( rc != 0 )
199             log_err( "cannot read '%s'.REFERENCE.SEQ_LEN[ %ld ] %R", self->source, row_id, rc );
200         else
201         {
202             ct->sec = row_len;
203             ct->sec_ids = pp64;
204         }
205     }
206     if ( rc == 0 )
207     {
208         /* get SEQ_LEN */
209         rc = VCursorCellDataDirect( curs, row_id, idx[ 2 ], &elem_bits, ( const void ** )&pp32, &boff, &row_len );
210         if ( rc != 0 )
211             log_err( "cannot read '%s'.REFERENCE.SEQ_LEN[ %ld ] %R", self->source, row_id, rc );
212         else
213             ct->len = pp32[ 0 ];
214     }
215 
216     return rc;
217 }
218 
219 
220 /* get the next reference from the iter */
simple_coverage_iter_get(struct simple_coverage_iter * self,SimpleCoverageT * ct)221 bool simple_coverage_iter_get( struct simple_coverage_iter * self,
222                                SimpleCoverageT * ct )
223 {
224     bool res = ( self != NULL && ct != NULL  );
225     if ( res )
226     {
227         res = ( self->rows_processed < self->to_process.row_count );
228         if ( res )
229         {
230             self->position += self->last_len;
231             res = ( fill_simple_coverage_row( self, self->current_row, ct ) == 0 );
232             if ( res )
233             {
234                 self->current_row++;
235                 self->rows_processed++;
236                 self->last_len = ct->len;
237             }
238         }
239     }
240     return res;
241 }
242 
243 
simple_coverage_iter_get_capped(struct simple_coverage_iter * self,SimpleCoverageT * ct,uint32_t min,uint32_t max)244 bool simple_coverage_iter_get_capped( struct simple_coverage_iter * self,
245                                       SimpleCoverageT * ct,
246                                       uint32_t min,
247                                       uint32_t max )
248 {
249     bool res = false;
250     bool running = true;
251     while ( running )
252     {
253         res = simple_coverage_iter_get( self, ct );
254         if ( res )
255         {
256             running = !( ct->prim >= min && ct->prim <= max );
257             if ( !running )
258             {
259                 ct->ref_row_id = self->current_row - 1;
260                 ct->start_pos = self->position;
261             }
262         }
263         else
264             running = false;
265     }
266     return res;
267 }
268 
269 
270 /* ========================================================================================= */
271 
272 #define ALIG_ITER_N_COLS 2
273 
274 typedef struct simple_alig_iter
275 {
276     /* common for both modes: the name of the source */
277     const char * source;
278 
279     /* for the CSRA-MODE */
280     const VCursor *curs;
281     uint32_t idx[ ALIG_ITER_N_COLS ];
282 } simple_alig_iter;
283 
284 
simple_alig_iter_release(simple_alig_iter * self)285 static rc_t simple_alig_iter_release( simple_alig_iter * self )
286 {
287     rc_t rc = 0;
288     if ( self == NULL )
289         rc = RC( rcApp, rcNoTarg, rcReleasing, rcParam, rcNull );
290     else
291     {
292         if ( self->curs != NULL )
293         {
294             rc = VCursorRelease( self->curs );
295             if ( rc != 0 )
296                 log_err( "error (%R) releasing VCursor for %s", rc, self->source );
297         }
298         free( ( void * ) self );
299     }
300     return rc;
301 }
302 
303 
simple_alig_iter_initialize(simple_alig_iter * self,const VDatabase * db,size_t cache_capacity)304 static rc_t simple_alig_iter_initialize( simple_alig_iter * self,
305                                          const VDatabase * db,
306                                          size_t cache_capacity )
307 {
308     const VTable * tbl;
309     rc_t rc = VDatabaseOpenTableRead( db, &tbl, "%s", "PRIMARY_ALIGNMENT" );
310     if ( rc != 0 )
311         log_err( "simple_alig_iter_initialize.VDatabaseOpenTableRead( '%s'.PRIMARY_ALIGNMENT ) %R", self->source, rc );
312     else
313     {
314         if ( cache_capacity > 0 )
315             rc = VTableCreateCachedCursorRead( tbl, &self->curs, cache_capacity );
316         else
317             rc = VTableCreateCursorRead( tbl, &self->curs );
318         if ( rc != 0 )
319             log_err( "simple_alig_iter_initialize.TableCreateCursorRead( '%s'.REFERENCE ) %R", self->source, rc );
320         else
321         {
322             rc = add_cols_to_cursor( self->curs, self->idx, "PRIMARY_ALIGNMENT", self->source,
323                                       ALIG_ITER_N_COLS,
324                                      "REF_POS",
325                                      "REF_LEN" );
326         }
327         VTableRelease( tbl );
328     }
329     return rc;
330 }
331 
simple_alig_iter_make(simple_alig_iter ** self,const VDatabase * db,const char * src,size_t cache_capacity)332 static rc_t simple_alig_iter_make( simple_alig_iter ** self,
333                                     const VDatabase * db,
334                                     const char * src,
335                                     size_t cache_capacity )
336 {
337     rc_t rc = 0;
338     if ( self == NULL || src == NULL || db == NULL )
339     {
340         rc = RC( rcApp, rcNoTarg, rcAllocating, rcParam, rcNull );
341         log_err( "simple_alig_iter_make() given a NULL-ptr" );
342     }
343     else
344     {
345         simple_alig_iter * o = calloc( 1, sizeof *o );
346         *self = NULL;
347         if ( o == NULL )
348         {
349             rc = RC( rcApp, rcNoTarg, rcAllocating, rcMemory, rcExhausted );
350             log_err( "simple_alig_iter_make() memory exhausted" );
351         }
352         else
353         {
354             o->source = src;
355             rc = simple_alig_iter_initialize( o, db, cache_capacity );
356         }
357 
358         if ( rc == 0 )
359             *self = o;
360         else
361             simple_alig_iter_release( o );
362     }
363     return rc;
364 }
365 
366 
simple_alig_iter_read(const simple_alig_iter * self,int64_t row_id,uint32_t * start,uint32_t * len)367 static rc_t simple_alig_iter_read( const simple_alig_iter * self,
368                                    int64_t row_id,
369                                    uint32_t * start,
370                                    uint32_t * len )
371 {
372     uint32_t elem_bits, boff, row_len;
373     const VCursor * curs = self->curs;
374     const uint32_t * idx = self->idx;
375     uint64_t * pp32;
376 
377     /* get REF_POS */
378     rc_t rc = VCursorCellDataDirect( curs, row_id, idx[ 0 ], &elem_bits, ( const void ** )&pp32, &boff, &row_len );
379     if ( rc != 0 )
380         log_err( "cannot read '%s'.REFERENCE.SEQ_ID[ %ld ] %R", self->source, row_id, rc );
381     else
382         *start = pp32[ 0 ];
383 
384     if ( rc == 0 )
385     {
386         /* row_len of HAS_MISMATCH */
387         rc = VCursorCellDataDirect( curs, row_id, idx[ 1 ], &elem_bits, ( const void ** )&pp32, &boff, &row_len );
388         if ( rc != 0 )
389             log_err( "cannot read '%s'.REFERENCE.SEQ_LEN[ %ld ] %R", self->source, row_id, rc );
390         else
391             *len = pp32[ 0 ];
392     }
393     return rc;
394 
395 }
396 
397 
detailed_coverage_make(const char * src,size_t cache_capacity,const slice * slice,DetailedCoverage * dcoverage)398 rc_t detailed_coverage_make( const char * src,
399                              size_t cache_capacity,
400                              const slice * slice,
401                              DetailedCoverage * dcoverage )
402 {
403     rc_t rc = 0;
404     if ( src == NULL || slice == NULL || dcoverage == NULL )
405     {
406         rc = RC( rcApp, rcNoTarg, rcAllocating, rcParam, rcNull );
407         log_err( "detailed_coverage_make() given a NULL-ptr" );
408     }
409     else
410     {
411         dcoverage->coverage = calloc( slice->count, sizeof *( dcoverage->coverage ) );
412         if ( dcoverage->coverage == NULL )
413         {
414             rc = RC( rcApp, rcNoTarg, rcAllocating, rcMemory, rcExhausted );
415             log_err( "detailed_coverage_make() memory exhausted" );
416         }
417         else
418         {
419             RefT * ref;
420             rc = ref_iter_find( &ref, src, cache_capacity, slice->refname );
421             if ( rc == 0 )
422             {
423                 simple_coverage_iter * c_iter;
424 
425                 dcoverage->start_pos = slice->start;
426                 dcoverage->len = slice->count;
427 
428                 /* adjust *ref to restrict it to the given slice... */
429                 if ( slice->count > 0 )
430                 {
431                     int64_t start_offset = slice->start / ref->block_size;
432                     int64_t end_offset = slice->end / ref->block_size;
433                     if ( start_offset > 0 ) start_offset--;
434                     ref->start_row_id += start_offset;
435                     ref->count = ( end_offset - start_offset ) + 1;
436                 }
437 
438                 rc = simple_coverage_iter_make( &c_iter, src, cache_capacity, ref );
439                 if ( rc == 0 )
440                 {
441                     simple_alig_iter * a_iter;
442 
443                     rc = simple_alig_iter_make( &a_iter, c_iter->db, src, cache_capacity );
444                     if ( rc == 0 )
445                     {
446                         SimpleCoverageT ct;
447                         while( rc == 0 &&
448                                 simple_coverage_iter_get( c_iter, &ct ) )
449                         {
450                             uint32_t idx;
451                             for ( idx = 0; rc == 0 && idx < ct.prim; ++idx )
452                             {
453                                 uint32_t ref_pos, ref_len;
454                                 rc = simple_alig_iter_read( a_iter, ct.prim_ids[ idx ], &ref_pos, &ref_len );
455                                 if ( rc == 0 )
456                                 {
457                                     if ( ( ( ref_pos + ref_len - 1 ) >= slice->start ) &&
458                                           ref_pos < slice->end )
459                                     {
460                                         int32_t rel_start = ( ref_pos - ( uint32_t )slice->start );
461                                         int32_t i;
462 
463                                         if ( rel_start < 0 )
464                                         {
465                                             ref_len += rel_start;
466                                             rel_start = 0;
467                                         }
468                                         for ( i = rel_start; i < ref_len && i < slice->count; ++i )
469                                         {
470                                             dcoverage->coverage[ i ]++;
471                                         }
472                                     }
473                                 }
474                             }
475                         }
476                         simple_alig_iter_release( a_iter );
477                     }
478                     simple_coverage_iter_release( c_iter );
479                 }
480                 RefT_release( ref );
481             }
482         }
483     }
484     return rc;
485 }
486 
487 
detailed_coverage_release(DetailedCoverage * self)488 rc_t detailed_coverage_release( DetailedCoverage * self )
489 {
490     rc_t rc = 0;
491     if ( self == NULL )
492         rc = RC( rcApp, rcNoTarg, rcReleasing, rcParam, rcNull );
493     else
494     {
495         if ( self->coverage != NULL )
496         {
497             free( ( void * ) self->coverage );
498             self->coverage = NULL;
499         }
500     }
501     return rc;
502 
503 }