1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 #include <vdb/extern.h>
27
28 #include <vdb/xform.h>
29 #include <vdb/database.h>
30 #include <vdb/table.h>
31 #include <vdb/cursor.h>
32 #include <vdb/vdb-priv.h>
33 #include <insdc/insdc.h>
34 #include <klib/data-buffer.h>
35 #include <klib/rc.h>
36 #include <klib/debug.h>
37 #include <sysalloc.h>
38
39 #include <bitstr.h>
40
41 #include "ref-tbl.h"
42
43 #include <stdlib.h>
44 #include <string.h>
45 #include <assert.h>
46
47
48 #ifdef _DEBUGGING
49 #define SUB_DEBUG(msg) DBGMSG(DBG_SRA,DBG_FLAG(DBG_SRA_SUB),msg)
50 #else
51 #define SUB_DEBUG(msg)
52 #endif
53
54
55 typedef struct RefTableSubSelect RefTableSubSelect;
56 struct RefTableSubSelect
57 {
58 rc_t (*func)(struct RefTableSubSelect* self, int64_t ref_row_id,
59 INSDC_coord_zero offset, INSDC_coord_len ref_len,
60 uint32_t ref_ploidy, VRowResult* rslt);
61 const VCursor *curs;
62 uint32_t out_idx;
63 union {
64 struct {
65 uint32_t circular_idx;
66 uint32_t name_idx;
67 uint32_t name_range_idx;
68 uint32_t seq_len_idx;
69 uint32_t max_seq_len_idx;
70 uint32_t cmp_read_idx;
71 /* set once upon 1st call */
72 /* set once per each call, if changed from previous */
73 char* name;
74 int64_t start_id;
75 int64_t stop_id;
76 uint32_t name_len;
77 uint32_t max_seq_len;
78 INSDC_coord_len seq_len;
79 bool circular;
80 bool local;
81 } ref;
82 struct {
83 uint32_t ref_id_idx;
84 uint32_t ref_start_idx;
85 uint32_t ref_len_idx;
86 uint32_t read_start_idx;
87 uint32_t read_len_idx;
88 struct RefTableSubSelect* parent;
89 } mod;
90 } u;
91 };
92
93 /*
94 ref_ploidy != 0 means that offset here is relative to ref_row_id, so it can be
95 negative or positive and can extend between rows within same refseq
96 ref_ploidy means that offset is normal REF_START
97 */
98
99 static
REFERENCE_TABLE_sub_select(RefTableSubSelect * self,int64_t ref_row_id,INSDC_coord_zero offset,INSDC_coord_len ref_len,uint32_t ref_ploidy,VRowResult * rslt)100 rc_t CC REFERENCE_TABLE_sub_select( RefTableSubSelect* self, int64_t ref_row_id,
101 INSDC_coord_zero offset, INSDC_coord_len ref_len,
102 uint32_t ref_ploidy, VRowResult* rslt )
103 {
104 rc_t rc = 0;
105 INSDC_coord_len num_read;
106
107 if ( ref_row_id < self->u.ref.start_id || ref_row_id > self->u.ref.stop_id )
108 {
109 /* update cached ref data if ref has changed */
110 const char* n;
111 uint32_t n_len;
112 struct {
113 int64_t start_id;
114 int64_t stop_id;
115 } *out;
116
117 SUB_DEBUG( ( "SUB.Rd in 'ref-tbl-sub-select.c' (REF) at #%lu offset %lu\n", ref_row_id, offset ) );
118
119 rc = VCursorCellDataDirect( self->curs, ref_row_id, self->u.ref.name_idx, NULL, ( const void** )&n, NULL, &n_len );
120 if ( rc == 0 )
121 {
122 rc = VCursorParamsSet( ( const struct VCursorParams * )(self->curs), "QUERY_SEQ_NAME", "%.*s", n_len, n );
123 if ( rc == 0 )
124 {
125 rc = VCursorCellDataDirect( self->curs, ref_row_id, self->u.ref.name_range_idx, NULL, (const void**)&out, NULL, NULL );
126 if ( rc == 0 )
127 {
128 if ( self->u.ref.name_len < n_len )
129 {
130 void* p = realloc( self->u.ref.name, n_len );
131 if ( p == NULL )
132 rc = RC( rcXF, rcFunction, rcSelecting, rcMemory, rcExhausted );
133 else
134 self->u.ref.name = ( char* )p;
135 }
136
137 if ( rc == 0 )
138 {
139 const bool* c;
140 INSDC_coord_len* sl;
141 uint32_t* m;
142 uint32_t cmp_read_len = 0;
143 int64_t row;
144
145 memmove( self->u.ref.name, n, n_len );
146 self->u.ref.name_len = n_len;
147 self->u.ref.start_id = out->start_id;
148 self->u.ref.stop_id = out->stop_id;
149 for ( row = out->start_id; row <= out->stop_id && cmp_read_len == 0; ++row )
150 {
151 uint32_t tmp_len = 0;
152 void const *dummy = NULL;
153
154 rc = VCursorCellDataDirect( self->curs, row, self->u.ref.cmp_read_idx, NULL, &dummy, NULL, &tmp_len );
155 if ( rc != 0 ) break;
156 cmp_read_len += tmp_len;
157 }
158
159 if ( rc == 0 )
160 {
161 rc = VCursorCellDataDirect( self->curs, self->u.ref.stop_id, self->u.ref.circular_idx, NULL, (const void**)&c, NULL, NULL );
162 if ( rc == 0 )
163 {
164 rc = VCursorCellDataDirect( self->curs, self->u.ref.stop_id, self->u.ref.seq_len_idx, NULL, (const void**)&sl, NULL, NULL );
165 if ( rc == 0 )
166 {
167 rc = VCursorCellDataDirect( self->curs, self->u.ref.stop_id, self->u.ref.max_seq_len_idx, NULL, (const void**)&m, NULL, NULL);
168 if ( rc == 0 )
169 {
170 self->u.ref.circular = c[ 0 ] || cmp_read_len != 0;
171 self->u.ref.seq_len = m[ 0 ] * (INSDC_coord_len)( self->u.ref.stop_id - self->u.ref.start_id ) + sl[0];
172 self->u.ref.max_seq_len = m[ 0 ];
173 }
174 }
175 }
176 }
177 }
178 }
179 }
180 }
181 }
182
183 if ( rc == 0 && ref_ploidy != 0 )
184 {
185 /* convert offset to normal from start of ref relative to current row */
186 offset += self->u.ref.max_seq_len * (INSDC_coord_len)( ref_row_id - self->u.ref.start_id );
187
188 if ( self->u.ref.circular )
189 {
190 /* make offset positive starting from refseq actual start */
191 if ( offset < 0 )
192 offset = self->u.ref.seq_len - ( ( -offset ) % self->u.ref.seq_len );
193 else
194 offset %= self->u.ref.seq_len;
195 }
196 else if ( offset < 0 || offset >= (INSDC_coord_zero)self->u.ref.seq_len )
197 rc = RC( rcXF, rcFunction, rcSelecting, rcData, rcCorrupt );
198
199 ref_row_id = self->u.ref.start_id + offset / self->u.ref.max_seq_len;
200 offset %= self->u.ref.max_seq_len;
201 }
202
203 /* read the data */
204 for ( num_read = 0; rc == 0 && num_read < ref_len && ref_row_id <= self->u.ref.stop_id; offset = 0 )
205 {
206 uint32_t bits;
207 const void* output;
208 uint32_t boff;
209 uint32_t row_len;
210
211 rc = VCursorCellDataDirect( self->curs, ref_row_id, self->out_idx, &bits, &output, &boff, &row_len );
212 if ( rc == 0 )
213 {
214 /* row_len MUST be > offset */
215 if ( row_len <= (uint32_t)offset )
216 rc = RC(rcXF, rcFunction, rcSelecting, rcData, rcCorrupt);
217 else
218 {
219 row_len -= offset;
220 if ( ref_len < row_len + num_read )
221 row_len = ref_len - num_read;
222
223 /* copy data */
224 bitcpy( rslt->data->base, rslt->elem_count * bits, output, offset * bits + boff, row_len * bits );
225 rslt->elem_count += row_len;
226 num_read += row_len;
227
228 if ( ++ref_row_id > self->u.ref.stop_id && self->u.ref.circular )
229 ref_row_id = self->u.ref.start_id;
230 }
231 }
232 }
233
234 /* detect incomplete read */
235 if ( rc == 0 && num_read == 0 )
236 rc = RC( rcXF, rcFunction, rcSelecting, rcTransfer, rcIncomplete );
237
238 return rc;
239 }
240
241 static
ALIGN_CMN_TABLE_sub_select(RefTableSubSelect * self,int64_t ref_row_id,INSDC_coord_zero offset,INSDC_coord_len ref_len,uint32_t ref_ploidy,VRowResult * rslt)242 rc_t CC ALIGN_CMN_TABLE_sub_select(RefTableSubSelect* self, int64_t ref_row_id,
243 INSDC_coord_zero offset, INSDC_coord_len ref_len,
244 uint32_t ref_ploidy, VRowResult* rslt)
245 {
246 rc_t rc=0;
247 INSDC_coord_len num_read = 0;
248 const int64_t* al_ref_id = NULL;
249 int64_t al_ref_id_value;
250 const INSDC_coord_zero* al_ref_start = NULL;
251
252 SUB_DEBUG( ( "SUB.Rd in 'ref-tbl-sub-select.c' (ALIGN) at #%lu offset %lu\n", ref_row_id, offset ) );
253
254 if ( offset < 0 )
255 {
256 rc = VCursorCellDataDirect( self->curs, ref_row_id, self->u.mod.ref_id_idx, NULL, (void const **)&al_ref_id, NULL, NULL );
257 if ( rc == 0 )
258 {
259 rc = VCursorCellDataDirect( self->curs, ref_row_id, self->u.mod.ref_start_idx, NULL, (void const **)&al_ref_start, NULL, NULL);
260 if ( rc == 0 )
261 {
262 memmove ( &al_ref_id_value, al_ref_id, sizeof *al_ref_id );
263 if ( -offset > (INSDC_coord_zero)ref_len )
264 {
265 /* requested chunk is to the left and is not using allele data */
266 rc = RC( rcXF, rcFunction, rcSelecting, rcData, rcCorrupt );
267 }
268 else
269 {
270 rc = self->u.mod.parent->func( self->u.mod.parent, al_ref_id_value, offset + al_ref_start[0],
271 -offset, ref_ploidy, rslt );
272 if ( rc == 0 )
273 {
274 /* read left portion of underlying reference */
275 num_read += (INSDC_coord_len)rslt->elem_count;
276 offset = 0;
277 }
278 }
279 }
280 }
281 }
282
283 if ( rc == 0 && num_read < ref_len )
284 {
285 /* copy self */
286 void const* output;
287 uint32_t bits, boff, rs_len, rl_len;
288 const INSDC_coord_zero* rs;
289 const INSDC_coord_len* rl;
290
291 rc = VCursorCellDataDirect( self->curs, ref_row_id, self->out_idx, &bits, &output, &boff, NULL );
292 if ( rc == 0 )
293 {
294 rc = VCursorCellDataDirect( self->curs, ref_row_id, self->u.mod.read_start_idx, NULL, (void const **)&rs, NULL, &rs_len );
295 if ( rc == 0 )
296 {
297 rc = VCursorCellDataDirect( self->curs, ref_row_id, self->u.mod.read_len_idx, NULL, (void const **)&rl, NULL, &rl_len );
298 if ( rc == 0 )
299 {
300 assert( rs_len == rl_len );
301 assert( ref_ploidy > 0 && ref_ploidy <= rl_len );
302 if ( offset > (INSDC_coord_zero)rl[ ref_ploidy - 1 ] )
303 {
304 /* requested chunk starts beyond allele */
305 rc = RC( rcXF, rcFunction, rcSelecting, rcData, rcCorrupt );
306 }
307 else
308 {
309 INSDC_coord_len left = ref_len - num_read;
310 if ( ( rl[ ref_ploidy - 1 ] - offset ) < left )
311 {
312 left = rl[ ref_ploidy - 1 ] - offset;
313 }
314 bitcpy( rslt->data->base, rslt->elem_count * bits, output, ( rs[ref_ploidy - 1] + offset ) * bits + boff, left * bits );
315 num_read += left;
316 rslt->elem_count += left;
317 }
318 }
319 }
320 }
321 }
322
323 if ( rc == 0 && num_read < ref_len )
324 {
325 const INSDC_coord_len* al_ref_len;
326 /* copy right portion of underlying reference */
327 if ( al_ref_id == NULL || al_ref_start == NULL )
328 {
329 rc = VCursorCellDataDirect( self->curs, ref_row_id, self->u.mod.ref_id_idx, NULL, (void const **)&al_ref_id, NULL, NULL );
330 if ( rc == 0 )
331 rc = VCursorCellDataDirect( self->curs, ref_row_id, self->u.mod.ref_start_idx, NULL, (void const **)&al_ref_start, NULL, NULL );
332 }
333
334 if ( rc == 0 )
335 rc = VCursorCellDataDirect( self->curs, ref_row_id, self->u.mod.ref_len_idx, NULL, (void const **)&al_ref_len, NULL, NULL );
336
337 memmove ( & al_ref_id_value, al_ref_id, sizeof *al_ref_id );
338 if ( rc == 0 )
339 rc = self->u.mod.parent->func( self->u.mod.parent, al_ref_id_value, al_ref_start[0] + al_ref_len[0],
340 ref_len - num_read, ref_ploidy, rslt );
341
342 }
343 return rc;
344 }
345
346
347 static
RefTableSubSelect_Whack(void * obj)348 void CC RefTableSubSelect_Whack ( void *obj )
349 {
350 if ( obj != NULL )
351 {
352 RefTableSubSelect* self = ( RefTableSubSelect* )obj;
353 VCursorRelease( self->curs );
354 if ( self->func != REFERENCE_TABLE_sub_select )
355 {
356 RefTableSubSelect_Whack( self->u.mod.parent );
357 }
358 else
359 {
360 free( self->u.ref.name );
361 }
362 free( self );
363 }
364 }
365
366 /* normal way to do it */
367 #define IS_ADDED(c, i, n) ((rc = VCursorAddColumn(c, i, "%s", n)) == 0 || \
368 (GetRCObject(rc) == (enum RCObject)rcColumn && GetRCState(rc) == rcExists))
369
370 static
RefTableSubSelect_Make(RefTableSubSelect ** objp,const VTable * tbl,const VCursor * native_curs,const char * out_col_name)371 rc_t RefTableSubSelect_Make( RefTableSubSelect **objp, const VTable *tbl, const VCursor *native_curs, const char* out_col_name )
372 {
373 rc_t rc;
374
375 /* create the object */
376 RefTableSubSelect* obj = ( RefTableSubSelect* )calloc( 1, sizeof( *obj ) );
377 if ( obj == NULL )
378 {
379 rc = RC( rcXF, rcFunction, rcConstructing, rcMemory, rcExhausted );
380 }
381 else
382 {
383 const VTable *reftbl = NULL;
384
385 SUB_DEBUG( ( "SUB.Make in 'ref-tbl-sub-select.c' col=%s\n", out_col_name ) );
386
387 /* open the reference table cursor*/
388 rc = AlignRefTableCursor( tbl, native_curs, &obj->curs, &reftbl );
389 if ( rc == 0 )
390 {
391 /* add columns to cursor */
392 if ( IS_ADDED( obj->curs, &obj->u.ref.circular_idx, "CIRCULAR" ) )
393 {
394 /* normal REFERENCE table */
395 if ( IS_ADDED( obj->curs, &obj->u.ref.name_idx, "(utf8)NAME" ) )
396 {
397 if ( IS_ADDED( obj->curs, &obj->u.ref.name_range_idx, "NAME_RANGE" ) )
398 {
399 if ( IS_ADDED( obj->curs, &obj->u.ref.seq_len_idx, "SEQ_LEN" ) )
400 {
401 if ( IS_ADDED( obj->curs, &obj->u.ref.max_seq_len_idx, "MAX_SEQ_LEN" ) )
402 {
403 if ( IS_ADDED( obj->curs, &obj->u.ref.cmp_read_idx, "CMP_READ" ) )
404 {
405 obj->func = REFERENCE_TABLE_sub_select;
406 rc = 0;
407 }
408 }
409 }
410 }
411 }
412 }
413 else if ( GetRCObject( rc ) == ( enum RCObject )rcColumn && GetRCState( rc ) == rcNotFound )
414 {
415 /* try as align_cmn */
416 rc = RefTableSubSelect_Make( &obj->u.mod.parent, reftbl, native_curs, out_col_name );
417 if ( rc == 0 )
418 {
419 if ( IS_ADDED( obj->curs, &obj->u.mod.ref_id_idx, "REF_ID" ) )
420 {
421 if ( IS_ADDED( obj->curs, &obj->u.mod.ref_start_idx, "REF_START" ) )
422 {
423 if ( IS_ADDED( obj->curs, &obj->u.mod.ref_len_idx, "REF_LEN" ) )
424 {
425 if ( IS_ADDED( obj->curs, &obj->u.mod.read_start_idx, "READ_START" ) )
426 {
427 if ( IS_ADDED( obj->curs, &obj->u.mod.read_len_idx, "READ_LEN" ) )
428 {
429 obj->func = ALIGN_CMN_TABLE_sub_select;
430 rc = 0;
431 }
432 }
433 }
434 }
435 }
436 }
437 }
438
439 if ( rc == 0 )
440 {
441 if ( IS_ADDED( obj->curs, &obj->out_idx, out_col_name ) )
442 rc = 0;
443 }
444
445 if ( rc == 0 )
446 {
447 *objp = obj;
448 VTableRelease( reftbl );
449 return 0;
450 }
451
452 VCursorRelease( obj->curs );
453 }
454 VTableRelease( reftbl );
455 free( obj );
456 }
457 return rc;
458 }
459
460 static
reftbl_sub_select(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])461 rc_t CC reftbl_sub_select ( void *data, const VXformInfo *info,
462 int64_t row_id, VRowResult *rslt, uint32_t argc, const VRowData argv[] )
463 {
464 rc_t rc;
465 RefTableSubSelect* self = ( RefTableSubSelect* )data;
466 const int64_t* ref_id = ( const int64_t* )argv[ 0 ].u.data.base;
467 const INSDC_coord_zero* ref_start = ( const INSDC_coord_zero* )argv[ 1 ].u.data.base;
468 const INSDC_coord_len* ref_len = ( const INSDC_coord_len* )argv[ 2 ].u.data.base;
469 const uint32_t* ref_ploidy = NULL;
470
471 if ( argc > 3 )
472 {
473 ref_ploidy = ( const uint32_t* )argv[ 3 ].u.data.base;
474 ref_ploidy += argv[3].u.data.first_elem;
475 assert( argv[ 3 ].u.data.elem_bits == sizeof( *ref_ploidy ) * 8 );
476 }
477 assert( argv[ 0 ].u.data.elem_bits == sizeof( *ref_id ) * 8 );
478 assert( argv[ 1 ].u.data.elem_bits == sizeof( *ref_start ) * 8 );
479 assert( argv[ 2 ].u.data.elem_bits == sizeof( *ref_len ) * 8 );
480
481 ref_id += argv[ 0 ].u.data.first_elem;
482 ref_start += argv[ 1 ].u.data.first_elem;
483 ref_len += argv[ 2 ].u.data.first_elem;
484
485 /* get the memory for output row */
486 rslt->data->elem_bits = rslt->elem_bits;
487 rc = KDataBufferResize( rslt->data, ref_len[ 0 ] );
488 if ( rc == 0 )
489 {
490 /* must set it to 0 here - functions above accumulate */
491 rslt->elem_count = 0;
492 if ( ref_len[ 0 ] > 0 )
493 {
494 int64_t ref_id_val;
495 memmove( &ref_id_val, ref_id, sizeof ref_id_val );
496 rc = self->func( self, ref_id_val, ref_start[ 0 ], ref_len[ 0 ], ref_ploidy ? ref_ploidy[ 0 ] : 0, rslt );
497 }
498 }
499
500 return rc;
501 }
502
503
504 VTRANSFACT_IMPL ( ALIGN_ref_sub_select, 1, 0, 0 ) ( const void *self, const VXfactInfo *info,
505 VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
506 {
507 RefTableSubSelect *fself;
508 rc_t rc = RefTableSubSelect_Make( &fself, info -> tbl, ( const VCursor* )info->parms, "(INSDC:4na:bin)READ" );
509 if ( rc == 0 )
510 {
511 rslt -> self = fself;
512 rslt -> whack = RefTableSubSelect_Whack;
513 rslt -> u . ndf = reftbl_sub_select;
514 rslt -> variant = vftRow;
515 }
516 return rc;
517 }
518
519
520 VTRANSFACT_IMPL ( NCBI_align_ref_sub_select_preserve_qual, 1, 0, 0 ) ( const void *self, const VXfactInfo *info,
521 VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
522 {
523 RefTableSubSelect *fself;
524 rc_t rc = RefTableSubSelect_Make( & fself, info -> tbl, (const VCursor*)info->parms , "(bool)PRESERVE_QUAL" );
525 if ( rc == 0 )
526 {
527 rslt -> self = fself;
528 rslt -> whack = RefTableSubSelect_Whack;
529 rslt -> u . ndf = reftbl_sub_select;
530 rslt -> variant = vftRow;
531 }
532 return rc;
533 }
534