1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 
27 #ifndef _h_align_iterator_
28 #define _h_align_iterator_
29 
30 #ifndef _h_align_extern_
31 #include <align/extern.h>
32 #endif
33 
34 #ifndef _h_klib_container_
35 #include <klib/container.h>
36 #endif
37 
38 #ifndef _h_insdc_insdc_
39 #include <insdc/insdc.h>
40 #endif
41 
42 #ifndef _h_vdb_database_
43 #include <vdb/database.h>
44 #endif
45 
46 #ifdef __cplusplus
47 extern "C" {
48 #endif
49 
50 
51 /*--------------------------------------------------------------------------
52  * forwards - struct types not defined in this header; used below by pointer only
53  */
54 struct VCursor;
55 struct AlignMgr;
56 struct ReferenceObj;
57 
58 
59 /*--------------------------------------------------------------------------
60  * AlignmentIterator
61  *  walk across a single alignment in reference space
62  *  ( opaque - the struct body is not given in this header ) */
63 typedef struct AlignmentIterator AlignmentIterator;
64 
65 
66 /* Make
67  *  create an encapsulation of alignment
68  *
69  *  "iter" [ OUT ] - return parameter for new iterator
70  *
71  *  "copy" [ IN ] - when "true" the data will be copied.
72  *  otherwise, pointers will be expected to refer to data
73  *  with a lifetime >= that of the iterator being created.
74  *
75  *  "ref_pos" [ IN ] and "ref_len" [ IN ] - projection onto reference
76  *  "read" [ IN ] and "read_len" [ IN ] - full sequence of alignment
77  *
78  *  "has_mismatch" [ IN ] - describes comparison result of each base
79  *  in "read" against the alignment.
80  *  "has_ref_offset" [ IN ] - describes positions of reference offsets
81  *  "ref_offset" [ IN ] and "ref_offset_len" [ IN ] - packed array of
82  *  offsets of position against reference.
83  *
84  *  "ref_window_start" [ IN ] and "ref_window_len" [ IN ] - NOTE(review): not
85  *  described originally; presumably the reference window of interest - confirm
86  */
87 ALIGN_EXTERN rc_t CC AlignMgrMakeAlignmentIterator ( struct AlignMgr const *self,
88     AlignmentIterator **iter,
89     bool copy,
90     INSDC_coord_zero ref_pos,
91     INSDC_coord_len ref_len,
92     const INSDC_4na_bin *read,
93     INSDC_coord_len read_len,
94     const bool *has_mismatch,
95     const bool *has_ref_offset,
96     const int32_t *ref_offset,
97     uint32_t ref_offset_len,
98     INSDC_coord_zero ref_window_start,
99     INSDC_coord_len ref_window_len );
100 
101 
102 /* AddRef
103  * Release
104  *  both take a const "self" and report status as rc_t ( presumably reference counting - confirm ) */
105 ALIGN_EXTERN rc_t CC AlignmentIteratorAddRef ( const AlignmentIterator *self );
106 ALIGN_EXTERN rc_t CC AlignmentIteratorRelease ( const AlignmentIterator *self );
107 
108 
109 /* Next
110  *  advance position by 1 in reference space
111  *  must be called once before the first element can be examined
112  *
113  *  returns an rc_t whose RCState is rcDone when iteration is done
114  */
115 ALIGN_EXTERN rc_t CC AlignmentIteratorNext ( AlignmentIterator *self );
116 
117 
118 /* State
119  *  returns bitmap of state bits and event code at the current position
120  *  will return invalid before initial Next message or after final
121  *  "seq_pos" [ OUT ] - NOTE(review): not described; presumably position within the read - confirm
122  *  bits [ 0..7 ] :
123  *    { 0..15 }             = 4na mismatch (NACMGRSVTWYHKDBN)
124  *  bit [ 8 ]               = match if ! 0              ( align_iter_match )
125  *  bit [ 9 ]               = skip if ! 0               ( align_iter_skip )
126  *  bit [ 10 ]              = have insert if ! 0        ( align_iter_insert )
127  *  bit [ 11 ]              = have delete if ! 0        ( align_iter_delete )
128  *  bit [ 12 ]              = first base if ! 0         ( align_iter_first )
129  *  bit [ 13 ]              = last base if ! 0          ( align_iter_last )
130  *  bit [ 31 ]              = iterator position is invalid if ! 0
131  *                            NB - converts state word to negative
132  */
133 enum
134 {
135     align_iter_match      = ( 1 <<  8 ),
136     align_iter_skip       = ( 1 <<  9 ),
137 
138     align_iter_insert     = ( 1 << 10 ),
139     align_iter_delete     = ( 1 << 11 ),
140     align_iter_first      = ( 1 << 12 ),
141     align_iter_last       = ( 1 << 13 ),
142 
143     align_iter_invalid    = ( int ) ( 1U << 31 )   /* bit 31 - shifted as unsigned, then cast to int */
144 };
145 
146 ALIGN_EXTERN int32_t CC AlignmentIteratorState ( const AlignmentIterator *self,
147                                                  INSDC_coord_zero *seq_pos );
148 
149 
150 /* Position
151  *  return current position of iterator relative to reference
152  *  "pos" [ OUT ] - return parameter for that position; the function result is an rc_t */
153 ALIGN_EXTERN rc_t CC AlignmentIteratorPosition ( const AlignmentIterator *self,
154                                                  INSDC_coord_zero *pos );
155 
156 
157 /* BasesInserted
158  *  return the number of bases inserted at the current position and a pointer to their values
159  *
160  *  "bases" [ OUT, NULL OKAY ] - optional return parameter for a pointer to the inserted bases
161  *
162  *  returns count of bases inserted at current position
163  */
164 ALIGN_EXTERN uint32_t CC AlignmentIteratorBasesInserted
165     ( const AlignmentIterator *self, const INSDC_4na_bin **bases );
166 
167 
168 /* BasesDeleted
169  *  return the number of bases deleted at the current position
170  *  also returns the location on the reference where the delete starts
171  *
172  *  "pos" [ OUT ] - return parameter for the location on the reference where the
173  *  delete starts; the delete continues for the number of bases given by the function return
174  *
175  *  returns count of bases deleted at current position
176  */
177 ALIGN_EXTERN uint32_t CC AlignmentIteratorBasesDeleted
178     ( const AlignmentIterator *self, INSDC_coord_zero *pos );
179 
180 
181 /*--------------------------------------------------------------------------
182  * PlacementRecord
183  *  record describing a placement; returned by the NextRecordAt functions, freed via PlacementRecordWhack
184  */
185 typedef struct PlacementRecord PlacementRecord;
186 struct PlacementRecord
187 {
188     DLNode n;   /* NOTE(review): undocumented DLNode member; presumably list linkage - confirm */
189 
190     /* row id of alignment record */
191     int64_t id;
192 
193     /* object representing reference sequence */
194     struct ReferenceObj const *ref;
195 
196     /* placement position and length on reference */
197     INSDC_coord_zero pos;
198     INSDC_coord_len len;
199 
200     /* mapping quality of alignment */
201     int32_t mapq;
202 
203     /* spot group: "spot_group_len" is the length of "spot_group" ( NUL termination and ownership not stated - confirm ) */
204     uint32_t spot_group_len;
205     char * spot_group;
206 };
207 
208 
209 /* Cast
210  *  cast to an extended object
211  *
212  *  "ext" [ IN ] - selects the extended object level
213  *  must be one of the enum values below: placementRecordExtension0 or placementRecordExtension1
214  */
215 
216 enum { placementRecordExtension0, placementRecordExtension1 };
217 
218 ALIGN_EXTERN void* CC PlacementRecordCast ( const PlacementRecord *self, uint32_t ext );
219 
220 /* get_ext_data_ptr - NOTE(review): undocumented; takes the same arguments as Cast - confirm what pointer is returned */
221 ALIGN_EXTERN void* CC PlacementRecord_get_ext_data_ptr ( const PlacementRecord *self, uint32_t ext );
222 
223 
224 /* Whack
225  *  destroys PlacementRecord and any associated extensions
226  *  NB - accepts a const pointer, so const records can be passed without a cast */
227 ALIGN_EXTERN void CC PlacementRecordWhack ( const PlacementRecord *self );
228 
229 
230 /* structure of function pointers for creating extensions
231    all function pointers are optional ( NULL OKAY ); all but "destroy" also take a "placement_ctx" pointer */
232 typedef struct PlacementRecordExtendFuncs PlacementRecordExtendFuncs;
233 struct PlacementRecordExtendFuncs
234 {
235     /* opaque pointer handed to each function as its "data" argument */
236     void *data;
237 
238     /* destructor for the extension object "obj" */
239     void ( CC * destroy ) ( void *obj, void *data );
240 
241     /* constructor - receives the extension object "obj", its placement, a cursor and the reference window */
242     rc_t ( CC * populate ) ( void *obj, const PlacementRecord *placement,
243         struct VCursor const *curs, INSDC_coord_zero ref_window_start,
244         INSDC_coord_len ref_window_len, void *data, void * placement_ctx );
245 
246     /* filter predicate on a candidate placement row - not a size calculation;
247        NOTE(review): presumably a "false" result rejects the placement - confirm */
248     bool ( CC * filter ) ( struct VCursor const *curs, int64_t row_id,
249         const PlacementRecord *placement, INSDC_coord_zero ref_window_start,
250         INSDC_coord_len ref_window_len, void *data, void * placement_ctx );
251 
252     /* variable allocation size calculation, result delivered through "size"
253        when non-NULL, takes precedence over "fixed_size" */
254     rc_t ( CC * alloc_size ) ( struct VCursor const *curs, int64_t row_id, size_t * size, void *data, void * placement_ctx );
255 
256     /* fixed allocation size
257        ignored if "alloc_size" is non-NULL,
258        must be non-zero otherwise */
259     size_t fixed_size;
260 };
261 
262 
263 /* external functions for extension of a placement record
264    to include ( construct ) an AlignmentIterator */
265 ALIGN_EXTERN void CC AlignIteratorRecordDestroy ( void *obj, void *data );
266 ALIGN_EXTERN rc_t CC AlignIteratorRecordPopulate ( void *obj,
267     const PlacementRecord *placement, struct VCursor const *curs,
268     INSDC_coord_zero ref_window_start, INSDC_coord_len ref_window_len, void *data ); /* NOTE(review): no "placement_ctx", unlike PlacementRecordExtendFuncs.populate */
269 ALIGN_EXTERN rc_t CC AlignIteratorRecordSize ( struct VCursor const *curs, int64_t row_id, size_t * size, void *data ); /* NOTE(review): no "placement_ctx", unlike PlacementRecordExtendFuncs.alloc_size */
270 
271 
272 /*--------------------------------------------------------------------------
273  * PlacementIterator
274  *  walk across placements from an alignment db within a reference window
275  *  ( opaque - the struct body is not given in this header ) */
276 typedef struct PlacementIterator PlacementIterator;
277 
278 
279 /* Make
280  *  create a placement iterator
281  *
282  *  "iter" [ OUT ] - return parameter for iterator
283  *
284  *  "ref_obj" [ IN, NULL OKAY ] - optional parameter giving an object
285  *  representing the reference sequence for this iterator. it will be
286  *  inserted into each PlacementRecord (see above) and made available to
287  *  outer code.
288  *
289  *  "ref_pos" [ IN ] and "ref_len" [ IN ] - window onto reference
290  *
291  *  "min_mapq" [ IN ] - minimum map quality value
292  *
293  *  "ref_cur" [ IN ] - read-only cursor on REFERENCE table
294  *  will be modified as necessary to contain requisite columns
295  *  will be opened by iterator.
296  *
297  *  "align_cur" [ IN ] - read-only cursor on PRIMARY_ALIGNMENT or SECONDARY_ALIGNMENT
298  *  table ( see "ids" ). will be modified as necessary to contain
299  *  requisite columns. will be opened by iterator.
300  *
301  *  "ids" [ IN ] - an enum describing which column of alignment ids should
302  *  be used when reading the reference ( presumably via "ref_cur" - confirm )
303  *
304  *  "ext_0" [ IN, NULL OKAY ] and "ext_1" [ IN, NULL OKAY ] - optional pointers
305  *  to blocks describing how to extend the basic placement record
306  *
307  *  "spot_group" [ IN, NULL OKAY ]
308  *      != NULL, non empty string ... produce all alignments with this string as
309  *                  spot-group ( no matter what the "real" spot-group of the
310  *                  alignment is )
311  *
312  *      != NULL, empty string ... produce all alignments with the "real" spot-group
313  *                  read from the column "SPOT_GROUP"
314  *
315  *      == NULL, ... produce all alignments with no spot-group assigned ( the user
316  *                  does not wish the data to be read, the alignment to be binned )
317  */
318 
319 typedef uint8_t align_id_src;   /* type of the "ids" parameter; holds one of the enum values below */
320 enum { primary_align_ids, secondary_align_ids, evidence_align_ids };
321 
322 ALIGN_EXTERN rc_t CC AlignMgrMakePlacementIterator ( struct AlignMgr const *self,
323     PlacementIterator **iter, struct ReferenceObj const *ref_obj,
324     INSDC_coord_zero ref_pos, INSDC_coord_len ref_len, int32_t min_mapq,
325     struct VCursor const *ref_cur, struct VCursor const *align_cur, align_id_src ids,
326     const PlacementRecordExtendFuncs *ext_0, const PlacementRecordExtendFuncs *ext_1,
327     const char * spot_group );
328 
329 
330 /* AddRef
331  * Release
332  *  both take a const "self" and report status as rc_t ( presumably reference counting - confirm ) */
333 ALIGN_EXTERN rc_t CC PlacementIteratorAddRef ( const PlacementIterator *self );
334 ALIGN_EXTERN rc_t CC PlacementIteratorRelease ( const PlacementIterator *self );
335 
336 
337 /* RefWindow
338  *  returns the reference identification string and iteration window
339  *  "idstr" [ OUT ], "pos" [ OUT ] and "len" [ OUT ] - return parameters for the string and the window */
340 ALIGN_EXTERN rc_t CC PlacementIteratorRefWindow ( const PlacementIterator *self,
341     const char **idstr, INSDC_coord_zero *pos, INSDC_coord_len *len );
342 
343 
344 /* RefObj
345  *  returns the ReferenceObj that was used to create this placement iterator
346  *  "refobj" [ OUT ] - return parameter for that object ( reference ownership not stated - confirm ) */
347 ALIGN_EXTERN rc_t CC PlacementIteratorRefObj( const PlacementIterator * self,
348     struct ReferenceObj const ** refobj );
349 
350 
351 /* NextAvailPos
352  *  check the next available position having one or more placements
353  *
354  *  "pos" [ OUT ] - next position on reference having one or more placements
355  *  may return negative position, indicating an alignment that wraps around
356  *  a circular reference, and starts in negative space after linearization.
357  *
358  *  "len" [ OUT, NULL OKAY ] - optional return parameter for length of
359  *  placement at that position
360  *
361  *  returns non-zero rc when window is done ( NOTE(review): originally "when no window is done" - confirm )
362  *  (rcRange, rcDone)
363  */
364 ALIGN_EXTERN rc_t CC PlacementIteratorNextAvailPos ( const PlacementIterator *self,
365     INSDC_coord_zero *pos, INSDC_coord_len *len );
366 
367 
368 /* NextRecordAt
369  *  retrieve a placement at the requested position
370  *
371  *  "pos" [ IN ] - required position of the placement,
372  *  as obtained from "NextAvailPos"
373  *
374  *  "rec" [ OUT ] - return parameter for the record;
375  *  the record must be freed via PlacementRecordWhack
376  *
377  *  returns non-zero rc when no more placements are available
378  *  (rcOffset, rcDone)
379  */
380 ALIGN_EXTERN rc_t CC PlacementIteratorNextRecordAt ( PlacementIterator *self,
381     INSDC_coord_zero pos, const PlacementRecord **rec );
382 
383 
384 /* NextIdAt
385  *  retrieve a row id at the requested position
386  *
387  *  "pos" [ IN ] - required position of the placement,
388  *  as obtained from "NextAvailPos"
389  *
390  *  "row_id" [ OUT ] - returned row-id, within domain of align cursor
391  *
392  *  "len" [ OUT, NULL OKAY ] - optional return parameter for length of
393  *  placement on reference
394  *
395  *  returns non-zero rc when no more placements are available
396  *  (rcOffset, rcDone)
397  */
398 ALIGN_EXTERN rc_t CC PlacementIteratorNextIdAt ( PlacementIterator *self,
399     INSDC_coord_zero pos, int64_t *row_id, INSDC_coord_len *len );
400 
401 
402 /*--------------------------------------------------------------------------
403  * PlacementSetIterator
404  *  walk across placements supplied by the PlacementIterators added via AddPlacementIterator
405  *  ( opaque - the struct body is not given in this header ) */
406 typedef struct PlacementSetIterator PlacementSetIterator;
407 
408 
409 /* Make
410  *  create a placement set iterator
411  *
412  *  "iter" [ OUT ] - return parameter for iterator
413  *
414  *  NB - takes no reference window; placement iterators are supplied later via AddPlacementIterator
415  */
416 ALIGN_EXTERN rc_t CC AlignMgrMakePlacementSetIterator ( struct AlignMgr const *self,
417     PlacementSetIterator **iter );
418 
419 
420 /* AddPlacementIterator
421  *  adds a placement iterator
422  *  used to provide ordered placements within window
423  *  "pi" [ IN ] - iterator to add ( NOTE(review): whether the set takes over the caller's reference is not stated - confirm ) */
424 ALIGN_EXTERN rc_t CC PlacementSetIteratorAddPlacementIterator
425     ( PlacementSetIterator *self, PlacementIterator *pi );
426 
427 
428 /* AddRef
429  * Release
430  *  both take a const "self" and report status as rc_t ( presumably reference counting - confirm ) */
431 ALIGN_EXTERN rc_t CC PlacementSetIteratorAddRef ( const PlacementSetIterator *self );
432 ALIGN_EXTERN rc_t CC PlacementSetIteratorRelease ( const PlacementSetIterator *self );
433 
434 /* NextReference - NOTE(review): undocumented; presumably advances to the next reference and reports its window and object - confirm */
435 ALIGN_EXTERN rc_t CC PlacementSetIteratorNextReference ( PlacementSetIterator *self,
436     INSDC_coord_zero *first_pos, INSDC_coord_len *len, struct ReferenceObj const ** refobj );
437 /* NextWindow - NOTE(review): undocumented; presumably advances to the next window on the current reference - confirm */
438 ALIGN_EXTERN rc_t CC PlacementSetIteratorNextWindow ( PlacementSetIterator *self,
439     INSDC_coord_zero *first_pos, INSDC_coord_len *len );
440 
441 /* NextAvailPos
442  *  check the next available position having one or more placements
443  *
444  *  "pos" [ OUT ] - next position on reference having one or more placements
445  *  may return negative position, indicating an alignment that wraps around
446  *  a circular reference, and starts in negative space after linearization.
447  *
448  *  "len" [ OUT, NULL OKAY ] - optional return parameter for length of
449  *  placement at that position
450  *
451  *  returns non-zero rc when no more placements are available
452  *  TBD - the specific rc value is not yet defined
453  */
454 ALIGN_EXTERN rc_t CC PlacementSetIteratorNextAvailPos ( const PlacementSetIterator *self,
455     INSDC_coord_zero *pos, INSDC_coord_len *len );
456 
457 
458 /* NextRecordAt
459  *  retrieve a placement at the requested position
460  *
461  *  "pos" [ IN ] - required position of the placement,
462  *  as obtained from "NextAvailPos"
463  *
464  *  "rec" [ OUT ] - return parameter for the record;
465  *  the record must be freed via PlacementRecordWhack
466  */
467 ALIGN_EXTERN rc_t CC PlacementSetIteratorNextRecordAt ( PlacementSetIterator *self,
468     INSDC_coord_zero pos, const PlacementRecord **rec );
469 
470 
471 /* NextIdAt
472  *  retrieve a row id at the requested position
473  *
474  *  "pos" [ IN ] - required position of the placement,
475  *  as obtained from "NextAvailPos"
476  *
477  *  "row_id" [ OUT ] - returned row-id, within domain of align cursor
478  *
479  *  "len" [ OUT, NULL OKAY ] - optional return parameter for length of
480  *  placement on reference
481  */
482 ALIGN_EXTERN rc_t CC PlacementSetIteratorNextIdAt ( PlacementSetIterator *self,
483     INSDC_coord_zero pos, int64_t *row_id, INSDC_coord_len *len );
484 
485 
486 /*--------------------------------------------------------------------------
487  * ReferenceIterator
488  *  walk across placements from an alignment db within a reference window
489  *  ( opaque - the struct body is not given in this header ) */
490 typedef struct ReferenceIterator ReferenceIterator;
491 
492 
493 /* Make
494  *  create a reference iterator
495  *
496  *  "iter" [ OUT ] - return parameter for iterator
497  *
498  *  "ext_1" [ IN, NULL OKAY ] - optional pointer to a block describing how
499  *  to extend the align-iterator record ( see PlacementRecordExtendFuncs )
500  *
501  *  "min_mapq" [ IN ] - minimum map quality value
502  */
503 ALIGN_EXTERN rc_t CC AlignMgrMakeReferenceIterator ( struct AlignMgr const *self,
504     ReferenceIterator **iter, const PlacementRecordExtendFuncs *ext_1, int32_t min_mapq );
505 
506 
507 /* AddRef
508  * Release
509  *  both take a const "self" and report status as rc_t ( presumably reference counting - confirm ) */
510 ALIGN_EXTERN rc_t CC ReferenceIteratorAddRef ( const ReferenceIterator *self );
511 ALIGN_EXTERN rc_t CC ReferenceIteratorRelease ( const ReferenceIterator *self );
512 
513 
514 /* AddPlacementIterator
515  *  adds a placement iterator
516  *  used to provide ordered placements within window
517  *  NB - the declaration below is compiled out by "#if 0" */
518 #if 0
519 ALIGN_EXTERN rc_t CC ReferenceIteratorAddPlacementIterator
520     ( ReferenceIterator *self, PlacementIterator *pi );
521 #endif
522 
523 /* AddPlacements
524  *  adds a source for placements (file/table)
525  *  used to provide ordered placements within window
526  *  "placement_ctx" - opaque pointer ( NOTE(review): presumably handed to the extension callbacks - confirm ) */
527 ALIGN_EXTERN rc_t CC ReferenceIteratorAddPlacements ( ReferenceIterator *self,
528     struct ReferenceObj const *ref_obj, INSDC_coord_zero ref_pos, INSDC_coord_len ref_len,
529     struct VCursor const *ref, struct VCursor const *align, align_id_src ids,
530     const char * spot_group, void * placement_ctx );
531 
532 
533 /* NextReference
534  *  advance to the next reference
535  *  "first_pos", "len" and "refobj" are pointer parameters ( presumably outputs describing that reference - confirm ) */
536 ALIGN_EXTERN rc_t CC ReferenceIteratorNextReference ( ReferenceIterator *self,
537     INSDC_coord_zero *first_pos, INSDC_coord_len *len, struct ReferenceObj const ** refobj );
538 
539 /* NextWindow
540  *  advance to the next window on the reference
541  *  "first_pos" and "len" are pointer parameters ( presumably outputs describing that window - confirm ) */
542 ALIGN_EXTERN rc_t CC ReferenceIteratorNextWindow ( ReferenceIterator *self,
543     INSDC_coord_zero *first_pos, INSDC_coord_len *len );
544 
545 /* NextSpotGroup
546  *  advance to the next spot_group on the reference
547  *  "name" and "len" are pointer parameters ( presumably the spot group name and its length - confirm ) */
548 ALIGN_EXTERN rc_t CC ReferenceIteratorNextSpotGroup ( ReferenceIterator *self,
549     const char ** name, size_t * len );
550 
551 
552 /* NextPos
553  *  advance to the next position on current reference
554  *  resets internal iterator on placements at that position
555  *  "skip_empty" [ IN ] - NOTE(review): undocumented; presumably skips positions without placements - confirm */
556 ALIGN_EXTERN rc_t CC ReferenceIteratorNextPos ( ReferenceIterator *self, bool skip_empty );
557 
558 
559 /* Position
560  *  return current position on the reference
561  *  "depth" and "base" - NOTE(review): undocumented; presumably placement count and reference base at "pos" - confirm */
562 ALIGN_EXTERN rc_t CC ReferenceIteratorPosition ( const ReferenceIterator *self,
563     INSDC_coord_zero *pos, uint32_t * depth, INSDC_4na_bin * base );
564 
565 
566 /* NextPlacement
567  *  advance internal iterator to next placement in list
568  *  delivers a pointer to the next placement object at current position through "rec"
569  *  ( the function result itself is an rc_t ) */
570 ALIGN_EXTERN rc_t CC ReferenceIteratorNextPlacement ( ReferenceIterator *self,
571     const PlacementRecord **rec );
572 
573 
574 /* State
575  *  return state of current placement at current position
576  *  ( same int32_t result and "seq_pos" parameter as AlignmentIteratorState; presumably the same bit layout - confirm ) */
577 ALIGN_EXTERN int32_t CC ReferenceIteratorState ( const ReferenceIterator *self, INSDC_coord_zero *seq_pos );
578 
579 
580 /* BasesInserted
581  *  return the number of bases inserted at the current position and a pointer to their values
582  *
583  *  "bases" [ OUT, NULL OKAY ] - optional return parameter for a pointer to the inserted bases
584  *
585  *  returns count of bases inserted at current position
586  */
587 ALIGN_EXTERN uint32_t CC ReferenceIteratorBasesInserted ( const ReferenceIterator *self,
588     const INSDC_4na_bin **bases );
589 
590 
591 /* BasesDeleted
592  *  return the number of bases deleted at the current position
593  *  also returns the location on the reference where the delete starts
594  *
595  *  "pos" [ OUT ] - return parameter for the location on the reference where the
596  *  delete starts; the delete continues for the number of bases given by the function return
597  *
598  *  "bases" [ OUT, NULL OKAY ] - optional return parameter for a pointer to the deleted bases
599  *
600  *  returns count of bases deleted at current position
601  */
602 ALIGN_EXTERN uint32_t CC ReferenceIteratorBasesDeleted ( const ReferenceIterator *self,
603     INSDC_coord_zero *pos, const INSDC_4na_bin **bases );
604 
605 
606 #ifdef __cplusplus
607 }
608 #endif
609 
610 #endif /* _h_align_iterator_ */
611