1 /*===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 #include <vdb/extern.h>
27 
28 #include <vdb/xform.h>
29 #include <vdb/database.h>
30 #include <vdb/table.h>
31 #include <vdb/cursor.h>
32 #include <vdb/vdb-priv.h>
33 #include <insdc/insdc.h>
34 #include <klib/data-buffer.h>
35 #include <klib/rc.h>
36 #include <klib/debug.h>
37 #include <sysalloc.h>
38 
39 #include <bitstr.h>
40 
41 #include "ref-tbl.h"
42 
43 #include <stdlib.h>
44 #include <string.h>
45 #include <assert.h>
46 
47 
48 #ifdef _DEBUGGING
49 #define SUB_DEBUG(msg) DBGMSG(DBG_SRA,DBG_FLAG(DBG_SRA_SUB),msg)
50 #else
51 #define SUB_DEBUG(msg)
52 #endif
53 
54 
55 typedef struct RefTableSubSelect RefTableSubSelect;
56 struct RefTableSubSelect
57 {
58     rc_t (*func)(struct RefTableSubSelect* self, int64_t ref_row_id,
59                  INSDC_coord_zero offset, INSDC_coord_len ref_len,
60                  uint32_t ref_ploidy, VRowResult* rslt);
61     const VCursor *curs;
62     uint32_t out_idx;
63     union {
64         struct {
65             uint32_t circular_idx;
66             uint32_t name_idx;
67             uint32_t name_range_idx;
68             uint32_t seq_len_idx;
69             uint32_t max_seq_len_idx;
70             uint32_t cmp_read_idx;
71             /* set once upon 1st call */
72             /* set once per each call, if changed from previous */
73             char* name;
74             int64_t start_id;
75             int64_t stop_id;
76             uint32_t name_len;
77             uint32_t max_seq_len;
78             INSDC_coord_len seq_len;
79             bool circular;
80             bool local;
81         } ref;
82         struct {
83             uint32_t ref_id_idx;
84             uint32_t ref_start_idx;
85             uint32_t ref_len_idx;
86             uint32_t read_start_idx;
87             uint32_t read_len_idx;
88             struct RefTableSubSelect* parent;
89         } mod;
90     } u;
91 };
92 
93 /*
94   ref_ploidy != 0 means that offset here is relative to ref_row_id, so it can be
95     negative or positive and can extend between rows within same refseq
96   ref_ploidy means that offset is normal REF_START
97  */
98 
99 static
REFERENCE_TABLE_sub_select(RefTableSubSelect * self,int64_t ref_row_id,INSDC_coord_zero offset,INSDC_coord_len ref_len,uint32_t ref_ploidy,VRowResult * rslt)100 rc_t CC REFERENCE_TABLE_sub_select( RefTableSubSelect* self, int64_t ref_row_id,
101                                     INSDC_coord_zero offset, INSDC_coord_len ref_len,
102                                     uint32_t ref_ploidy, VRowResult* rslt )
103 {
104     rc_t rc = 0;
105     INSDC_coord_len num_read;
106 
107     if ( ref_row_id < self->u.ref.start_id || ref_row_id > self->u.ref.stop_id )
108     {
109         /* update cached ref data if ref has changed */
110         const char* n;
111         uint32_t n_len;
112         struct {
113             int64_t start_id;
114             int64_t stop_id;
115         } *out;
116 
117         SUB_DEBUG( ( "SUB.Rd in 'ref-tbl-sub-select.c' (REF) at #%lu offset %lu\n", ref_row_id, offset ) );
118 
119         rc = VCursorCellDataDirect( self->curs, ref_row_id, self->u.ref.name_idx, NULL, ( const void** )&n, NULL, &n_len );
120         if ( rc == 0 )
121         {
122             rc = VCursorParamsSet( ( const struct VCursorParams * )(self->curs), "QUERY_SEQ_NAME", "%.*s", n_len, n );
123             if ( rc == 0 )
124             {
125                 rc = VCursorCellDataDirect( self->curs, ref_row_id, self->u.ref.name_range_idx, NULL, (const void**)&out, NULL, NULL );
126                 if ( rc == 0 )
127                 {
128                     if ( self->u.ref.name_len < n_len )
129                     {
130                         void* p = realloc( self->u.ref.name, n_len );
131                         if ( p == NULL )
132                             rc = RC( rcXF, rcFunction, rcSelecting, rcMemory, rcExhausted );
133                         else
134                             self->u.ref.name = ( char* )p;
135                     }
136 
137                     if ( rc == 0 )
138                     {
139                         const bool* c;
140                         INSDC_coord_len* sl;
141                         uint32_t* m;
142                         uint32_t cmp_read_len = 0;
143                         int64_t row;
144 
145                         memmove( self->u.ref.name, n, n_len );
146                         self->u.ref.name_len = n_len;
147                         self->u.ref.start_id = out->start_id;
148                         self->u.ref.stop_id = out->stop_id;
149                         for ( row = out->start_id; row <= out->stop_id && cmp_read_len == 0; ++row )
150                         {
151                             uint32_t tmp_len = 0;
152                             void const *dummy = NULL;
153 
154                             rc = VCursorCellDataDirect( self->curs, row, self->u.ref.cmp_read_idx, NULL, &dummy, NULL, &tmp_len );
155                             if ( rc != 0 ) break;
156                             cmp_read_len += tmp_len;
157                         }
158 
159                         if ( rc == 0 )
160                         {
161                             rc = VCursorCellDataDirect( self->curs, self->u.ref.stop_id, self->u.ref.circular_idx, NULL, (const void**)&c, NULL, NULL );
162                             if ( rc == 0 )
163                             {
164                                 rc = VCursorCellDataDirect( self->curs, self->u.ref.stop_id, self->u.ref.seq_len_idx, NULL, (const void**)&sl, NULL, NULL );
165                                 if ( rc == 0 )
166                                 {
167                                     rc = VCursorCellDataDirect( self->curs, self->u.ref.stop_id, self->u.ref.max_seq_len_idx, NULL, (const void**)&m, NULL, NULL);
168                                     if ( rc == 0 )
169                                     {
170                                         self->u.ref.circular = c[ 0 ] || cmp_read_len != 0;
171                                         self->u.ref.seq_len = m[ 0 ] * (INSDC_coord_len)( self->u.ref.stop_id - self->u.ref.start_id ) + sl[0];
172                                         self->u.ref.max_seq_len = m[ 0 ];
173                                     }
174                                 }
175                             }
176                         }
177                     }
178                 }
179             }
180         }
181     }
182 
183     if ( rc == 0 && ref_ploidy != 0 )
184     {
185         /* convert offset to normal from start of ref relative to current row */
186         offset += self->u.ref.max_seq_len * (INSDC_coord_len)( ref_row_id - self->u.ref.start_id );
187 
188         if ( self->u.ref.circular )
189         {
190             /* make offset positive starting from refseq actual start */
191             if ( offset < 0 )
192                 offset = self->u.ref.seq_len - ( ( -offset ) % self->u.ref.seq_len );
193             else
194                 offset %= self->u.ref.seq_len;
195         }
196         else if ( offset < 0 || offset >= (INSDC_coord_zero)self->u.ref.seq_len )
197             rc = RC( rcXF, rcFunction, rcSelecting, rcData, rcCorrupt );
198 
199         ref_row_id = self->u.ref.start_id + offset / self->u.ref.max_seq_len;
200         offset %= self->u.ref.max_seq_len;
201     }
202 
203     /* read the data */
204     for ( num_read = 0; rc == 0 && num_read < ref_len && ref_row_id <= self->u.ref.stop_id; offset = 0 )
205     {
206         uint32_t bits;
207         const void* output;
208         uint32_t boff;
209         uint32_t row_len;
210 
211         rc = VCursorCellDataDirect( self->curs, ref_row_id, self->out_idx, &bits, &output, &boff, &row_len );
212         if ( rc == 0 )
213         {
214             /* row_len MUST be > offset */
215             if ( row_len <= (uint32_t)offset )
216                 rc = RC(rcXF, rcFunction, rcSelecting, rcData, rcCorrupt);
217             else
218             {
219                 row_len -= offset;
220                 if ( ref_len < row_len + num_read )
221                     row_len = ref_len - num_read;
222 
223                 /* copy data */
224                 bitcpy( rslt->data->base, rslt->elem_count * bits, output, offset * bits + boff, row_len * bits );
225                 rslt->elem_count += row_len;
226                 num_read += row_len;
227 
228                 if ( ++ref_row_id > self->u.ref.stop_id && self->u.ref.circular )
229                     ref_row_id = self->u.ref.start_id;
230             }
231         }
232     }
233 
234     /* detect incomplete read */
235     if ( rc == 0 && num_read == 0 )
236         rc = RC( rcXF, rcFunction, rcSelecting, rcTransfer, rcIncomplete );
237 
238     return rc;
239 }
240 
241 static
ALIGN_CMN_TABLE_sub_select(RefTableSubSelect * self,int64_t ref_row_id,INSDC_coord_zero offset,INSDC_coord_len ref_len,uint32_t ref_ploidy,VRowResult * rslt)242 rc_t CC ALIGN_CMN_TABLE_sub_select(RefTableSubSelect* self, int64_t ref_row_id,
243                                    INSDC_coord_zero offset, INSDC_coord_len ref_len,
244                                    uint32_t ref_ploidy, VRowResult* rslt)
245 {
246 	rc_t rc=0;
247     INSDC_coord_len num_read = 0;
248     const int64_t* al_ref_id = NULL;
249     int64_t al_ref_id_value;
250     const INSDC_coord_zero* al_ref_start = NULL;
251 
252     SUB_DEBUG( ( "SUB.Rd in 'ref-tbl-sub-select.c' (ALIGN) at #%lu offset %lu\n", ref_row_id, offset ) );
253 
254     if ( offset < 0 )
255     {
256         rc = VCursorCellDataDirect( self->curs, ref_row_id, self->u.mod.ref_id_idx, NULL, (void const **)&al_ref_id, NULL, NULL );
257         if ( rc == 0 )
258         {
259             rc = VCursorCellDataDirect( self->curs, ref_row_id, self->u.mod.ref_start_idx, NULL, (void const **)&al_ref_start, NULL, NULL);
260             if ( rc == 0 )
261             {
262                 memmove ( &al_ref_id_value, al_ref_id, sizeof *al_ref_id );
263                 if ( -offset > (INSDC_coord_zero)ref_len )
264                 {
265                     /* requested chunk is to the left and is not using allele data */
266                     rc = RC( rcXF, rcFunction, rcSelecting, rcData, rcCorrupt );
267                 }
268                 else
269                 {
270                     rc = self->u.mod.parent->func( self->u.mod.parent, al_ref_id_value, offset + al_ref_start[0],
271                                                     -offset, ref_ploidy, rslt );
272                     if ( rc == 0 )
273                     {
274                         /* read left portion of underlying reference */
275                         num_read += (INSDC_coord_len)rslt->elem_count;
276                         offset = 0;
277                     }
278                 }
279             }
280         }
281     }
282 
283     if ( rc == 0 && num_read < ref_len )
284     {
285         /* copy self */
286         void const* output;
287         uint32_t bits, boff, rs_len, rl_len;
288         const INSDC_coord_zero* rs;
289         const INSDC_coord_len* rl;
290 
291         rc = VCursorCellDataDirect( self->curs, ref_row_id, self->out_idx, &bits, &output, &boff, NULL );
292         if ( rc == 0 )
293         {
294             rc = VCursorCellDataDirect( self->curs, ref_row_id, self->u.mod.read_start_idx, NULL, (void const **)&rs, NULL, &rs_len );
295             if ( rc == 0 )
296             {
297                 rc = VCursorCellDataDirect( self->curs, ref_row_id, self->u.mod.read_len_idx, NULL, (void const **)&rl, NULL, &rl_len );
298                 if ( rc == 0 )
299                 {
300                     assert( rs_len == rl_len );
301                     assert( ref_ploidy > 0 && ref_ploidy <= rl_len );
302                     if ( offset > (INSDC_coord_zero)rl[ ref_ploidy - 1 ] )
303                     {
304                         /* requested chunk starts beyond allele */
305                         rc = RC( rcXF, rcFunction, rcSelecting, rcData, rcCorrupt );
306                     }
307                     else
308                     {
309                         INSDC_coord_len left = ref_len - num_read;
310                         if ( ( rl[ ref_ploidy - 1 ] - offset ) < left )
311                         {
312                             left = rl[ ref_ploidy - 1 ] - offset;
313                         }
314                         bitcpy( rslt->data->base, rslt->elem_count * bits, output, ( rs[ref_ploidy - 1] + offset ) * bits + boff, left * bits );
315                         num_read += left;
316                         rslt->elem_count += left;
317                     }
318                 }
319             }
320         }
321     }
322 
323     if ( rc == 0 && num_read < ref_len )
324     {
325         const INSDC_coord_len* al_ref_len;
326         /* copy right portion of underlying reference */
327         if ( al_ref_id == NULL || al_ref_start == NULL )
328         {
329             rc = VCursorCellDataDirect( self->curs, ref_row_id, self->u.mod.ref_id_idx, NULL, (void const **)&al_ref_id, NULL, NULL );
330             if ( rc == 0 )
331                 rc = VCursorCellDataDirect( self->curs, ref_row_id, self->u.mod.ref_start_idx, NULL, (void const **)&al_ref_start, NULL, NULL );
332         }
333 
334         if ( rc == 0 )
335             rc = VCursorCellDataDirect( self->curs, ref_row_id, self->u.mod.ref_len_idx, NULL, (void const **)&al_ref_len, NULL, NULL );
336 
337         memmove ( & al_ref_id_value, al_ref_id, sizeof *al_ref_id );
338         if ( rc == 0 )
339             rc = self->u.mod.parent->func( self->u.mod.parent, al_ref_id_value, al_ref_start[0] + al_ref_len[0],
340                                             ref_len - num_read, ref_ploidy, rslt );
341 
342     }
343     return rc;
344 }
345 
346 
347 static
RefTableSubSelect_Whack(void * obj)348 void CC RefTableSubSelect_Whack ( void *obj )
349 {
350     if ( obj != NULL )
351     {
352         RefTableSubSelect* self = ( RefTableSubSelect* )obj;
353         VCursorRelease( self->curs );
354         if ( self->func != REFERENCE_TABLE_sub_select )
355         {
356             RefTableSubSelect_Whack( self->u.mod.parent );
357         }
358         else
359         {
360             free( self->u.ref.name );
361         }
362         free( self );
363     }
364 }
365 
366 /* normal way to do it */
367 #define IS_ADDED(c, i, n) ((rc = VCursorAddColumn(c, i, "%s", n)) == 0 || \
368                            (GetRCObject(rc) == (enum RCObject)rcColumn && GetRCState(rc) == rcExists))
369 
370 static
RefTableSubSelect_Make(RefTableSubSelect ** objp,const VTable * tbl,const VCursor * native_curs,const char * out_col_name)371 rc_t RefTableSubSelect_Make( RefTableSubSelect **objp, const VTable *tbl, const VCursor *native_curs, const char* out_col_name )
372 {
373     rc_t rc;
374 
375     /* create the object */
376     RefTableSubSelect* obj = ( RefTableSubSelect* )calloc( 1, sizeof( *obj ) );
377     if ( obj == NULL )
378     {
379         rc = RC( rcXF, rcFunction, rcConstructing, rcMemory, rcExhausted );
380     }
381     else
382     {
383         const VTable *reftbl = NULL;
384 
385         SUB_DEBUG( ( "SUB.Make in 'ref-tbl-sub-select.c' col=%s\n", out_col_name ) );
386 
387         /* open the reference table cursor*/
388         rc = AlignRefTableCursor( tbl, native_curs, &obj->curs, &reftbl );
389         if ( rc == 0 )
390         {
391             /* add columns to cursor */
392             if ( IS_ADDED( obj->curs, &obj->u.ref.circular_idx, "CIRCULAR" ) )
393             {
394                 /* normal REFERENCE table */
395                 if ( IS_ADDED( obj->curs, &obj->u.ref.name_idx, "(utf8)NAME" ) )
396                 {
397                     if ( IS_ADDED( obj->curs, &obj->u.ref.name_range_idx, "NAME_RANGE" ) )
398                     {
399                         if ( IS_ADDED( obj->curs, &obj->u.ref.seq_len_idx, "SEQ_LEN" ) )
400                         {
401                             if ( IS_ADDED( obj->curs, &obj->u.ref.max_seq_len_idx, "MAX_SEQ_LEN" ) )
402                             {
403                                 if ( IS_ADDED( obj->curs, &obj->u.ref.cmp_read_idx, "CMP_READ" ) )
404                                 {
405                                     obj->func = REFERENCE_TABLE_sub_select;
406                                     rc = 0;
407                                 }
408                             }
409                         }
410                     }
411                 }
412             }
413             else if ( GetRCObject( rc ) == ( enum RCObject )rcColumn && GetRCState( rc ) == rcNotFound )
414             {
415                 /* try as align_cmn */
416                 rc = RefTableSubSelect_Make( &obj->u.mod.parent, reftbl, native_curs, out_col_name );
417                 if ( rc == 0 )
418                 {
419                     if ( IS_ADDED( obj->curs, &obj->u.mod.ref_id_idx, "REF_ID" ) )
420                     {
421                         if ( IS_ADDED( obj->curs, &obj->u.mod.ref_start_idx, "REF_START" ) )
422                         {
423                             if ( IS_ADDED( obj->curs, &obj->u.mod.ref_len_idx, "REF_LEN" ) )
424                             {
425                                 if ( IS_ADDED( obj->curs, &obj->u.mod.read_start_idx, "READ_START" ) )
426                                 {
427                                     if ( IS_ADDED( obj->curs, &obj->u.mod.read_len_idx, "READ_LEN" ) )
428                                     {
429                                         obj->func = ALIGN_CMN_TABLE_sub_select;
430                                         rc = 0;
431                                     }
432                                 }
433                             }
434                         }
435                     }
436                 }
437             }
438 
439             if ( rc == 0 )
440             {
441                 if ( IS_ADDED( obj->curs, &obj->out_idx,  out_col_name ) )
442                     rc = 0;
443             }
444 
445             if ( rc == 0 )
446             {
447                 *objp = obj;
448                 VTableRelease( reftbl );
449                 return 0;
450             }
451 
452             VCursorRelease( obj->curs );
453         }
454         VTableRelease( reftbl );
455         free( obj );
456     }
457     return rc;
458 }
459 
460 static
reftbl_sub_select(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])461 rc_t CC reftbl_sub_select ( void *data, const VXformInfo *info,
462                             int64_t row_id, VRowResult *rslt, uint32_t argc, const VRowData argv[] )
463 {
464     rc_t rc;
465     RefTableSubSelect* self = ( RefTableSubSelect* )data;
466     const int64_t* ref_id = ( const int64_t* )argv[ 0 ].u.data.base;
467     const INSDC_coord_zero* ref_start = ( const INSDC_coord_zero* )argv[ 1 ].u.data.base;
468     const INSDC_coord_len* ref_len = ( const INSDC_coord_len* )argv[ 2 ].u.data.base;
469     const uint32_t* ref_ploidy = NULL;
470 
471     if ( argc > 3 )
472     {
473         ref_ploidy = ( const uint32_t* )argv[ 3 ].u.data.base;
474         ref_ploidy += argv[3].u.data.first_elem;
475         assert( argv[ 3 ].u.data.elem_bits == sizeof( *ref_ploidy ) * 8 );
476     }
477     assert( argv[ 0 ].u.data.elem_bits == sizeof( *ref_id ) * 8 );
478     assert( argv[ 1 ].u.data.elem_bits == sizeof( *ref_start ) * 8 );
479     assert( argv[ 2 ].u.data.elem_bits == sizeof( *ref_len ) * 8 );
480 
481     ref_id += argv[ 0 ].u.data.first_elem;
482     ref_start += argv[ 1 ].u.data.first_elem;
483     ref_len += argv[ 2 ].u.data.first_elem;
484 
485     /* get the memory for output row */
486     rslt->data->elem_bits = rslt->elem_bits;
487     rc = KDataBufferResize( rslt->data, ref_len[ 0 ] );
488     if ( rc == 0 )
489     {
490         /* must set it to 0 here - functions above accumulate */
491         rslt->elem_count = 0;
492         if ( ref_len[ 0 ] > 0 )
493         {
494             int64_t ref_id_val;
495             memmove( &ref_id_val, ref_id, sizeof ref_id_val );
496             rc = self->func( self, ref_id_val, ref_start[ 0 ], ref_len[ 0 ], ref_ploidy ? ref_ploidy[ 0 ] : 0, rslt );
497         }
498     }
499 
500     return rc;
501 }
502 
503 
504 VTRANSFACT_IMPL ( ALIGN_ref_sub_select, 1, 0, 0 ) ( const void *self, const VXfactInfo *info,
505     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
506 {
507     RefTableSubSelect *fself;
508     rc_t rc = RefTableSubSelect_Make( &fself, info -> tbl, ( const VCursor* )info->parms, "(INSDC:4na:bin)READ" );
509     if ( rc == 0 )
510     {
511         rslt -> self = fself;
512         rslt -> whack = RefTableSubSelect_Whack;
513         rslt -> u . ndf = reftbl_sub_select;
514         rslt -> variant = vftRow;
515     }
516     return rc;
517 }
518 
519 
520 VTRANSFACT_IMPL ( NCBI_align_ref_sub_select_preserve_qual, 1, 0, 0 ) ( const void *self, const VXfactInfo *info,
521                                                    VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
522 {
523     RefTableSubSelect *fself;
524     rc_t rc = RefTableSubSelect_Make( & fself, info -> tbl, (const VCursor*)info->parms , "(bool)PRESERVE_QUAL" );
525     if ( rc == 0 )
526     {
527         rslt -> self = fself;
528         rslt -> whack = RefTableSubSelect_Whack;
529         rslt -> u . ndf = reftbl_sub_select;
530         rslt -> variant = vftRow;
531     }
532     return rc;
533 }
534