1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #include "CSRA1_Read.h"
28
29 #include "NGS_String.h"
30 #include "NGS_Cursor.h"
31 #include "NGS_Id.h"
32
33 #include <kfc/ctx.h>
34 #include <kfc/rsrc.h>
35 #include <kfc/except.h>
36 #include <kfc/xc.h>
37 #include <klib/text.h>
38 #include <klib/printf.h>
39 #include <klib/refcount.h>
40 #include <vdb/cursor.h>
41 #include <vdb/schema.h>
42 #include <vdb/vdb-priv.h>
43 #include <insdc/insdc.h>
44
45 #include <stddef.h>
46 #include <assert.h>
47
48 #include <sysalloc.h>
49
50 #ifndef min
51 # define min(a,b) ( (a) < (b) ? (a) : (b) )
52 #endif
53
54 static bool CSRA1_ReadIteratorNext ( CSRA1_Read * self, ctx_t ctx );
55
56 /*--------------------------------------------------------------------------
57 * CSRA1_Read
58 */
59 struct CSRA1_Read
60 {
61 SRA_Read dad;
62 };
63
64 static bool CSRA1_FragmentIsAligned ( CSRA1_Read * self, ctx_t ctx );
65 static bool CSRA1_ReadFragIsAligned ( CSRA1_Read * self, ctx_t ctx, uint32_t frag_idx );
66
67 static NGS_Read_vt CSRA1_Read_vt_inst =
68 {
69 {
70 {
71 /* NGS_Refcount */
72 SRA_ReadWhack
73 },
74
75 /* NGS_Fragment */
76 SRA_FragmentGetId,
77 SRA_FragmentGetSequence,
78 SRA_FragmentGetQualities,
79 SRA_FragmentIsPaired,
80 CSRA1_FragmentIsAligned,
81 SRA_FragmentNext
82 },
83
84 /* NGS_Read */
85 SRA_ReadGetId,
86 SRA_ReadGetName,
87 SRA_ReadGetReadGroup,
88 SRA_ReadGetCategory,
89 SRA_ReadGetSequence,
90 SRA_ReadGetQualities,
91 SRA_ReadNumFragments,
92 CSRA1_ReadFragIsAligned,
93 CSRA1_ReadIteratorNext,
94 SRA_ReadIteratorGetCount,
95 };
96
97 /* Init
98 */
99 static
CSRA1_ReadInit(ctx_t ctx,SRA_Read * self,const char * clsname,const char * instname,const NGS_String * run_name)100 void CSRA1_ReadInit ( ctx_t ctx, SRA_Read * self, const char *clsname, const char *instname, const NGS_String * run_name )
101 {
102 FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
103
104 if ( self == NULL )
105 INTERNAL_ERROR ( xcParamNull, "bad object reference" );
106 else
107 {
108 TRY ( NGS_ReadInit ( ctx, & self -> dad, & CSRA1_Read_vt_inst, clsname, instname ) )
109 {
110 TRY ( self -> run_name = NGS_StringDuplicate ( run_name, ctx ) )
111 {
112 self -> wants_full = true;
113 self -> wants_partial = true;
114 self -> wants_unaligned = true;
115 }
116 }
117 }
118 }
119
120 /* Release
121 * release reference
122 */
CSRA1_ReadRelease(const CSRA1_Read * self,ctx_t ctx)123 void CSRA1_ReadRelease ( const CSRA1_Read * self, ctx_t ctx )
124 {
125 if ( self != NULL )
126 {
127 NGS_ReadRelease ( & self -> dad . dad, ctx );
128 }
129 }
130
131 /*--------------------------------------------------------------------------
132 * NGS_ReadIterator
133 */
134
135 static
CSRA1_ReadIteratorInit(ctx_t ctx,CSRA1_Read * cself,const char * clsname,const char * instname,const NGS_String * run_name,bool wants_full,bool wants_partial,bool wants_unaligned)136 void CSRA1_ReadIteratorInit ( ctx_t ctx,
137 CSRA1_Read * cself,
138 const char *clsname,
139 const char *instname,
140 const NGS_String * run_name,
141 bool wants_full,
142 bool wants_partial,
143 bool wants_unaligned )
144 {
145 FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
146
147 if ( cself == NULL )
148 INTERNAL_ERROR ( xcParamNull, "bad object reference" );
149 else
150 {
151 SRA_Read * self = & cself -> dad;
152 TRY ( NGS_ReadIteratorInit ( ctx, & self -> dad, & CSRA1_Read_vt_inst, clsname, instname ) )
153 {
154 TRY ( self -> run_name = NGS_StringDuplicate ( run_name, ctx ) )
155 {
156 self -> wants_full = wants_full;
157 self -> wants_partial = wants_partial;
158 self -> wants_unaligned = wants_unaligned;
159 }
160 }
161 }
162 }
163
164 /* Make
165 */
CSRA1_ReadIteratorMake(ctx_t ctx,const NGS_Cursor * curs,const NGS_String * run_name,bool wants_full,bool wants_partial,bool wants_unaligned)166 NGS_Read * CSRA1_ReadIteratorMake ( ctx_t ctx,
167 const NGS_Cursor * curs,
168 const NGS_String * run_name,
169 bool wants_full,
170 bool wants_partial,
171 bool wants_unaligned )
172 {
173 FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
174
175 CSRA1_Read * cref;
176 SRA_Read * ref;
177
178 assert ( curs != NULL );
179
180 cref = calloc ( 1, sizeof * cref );
181 if ( cref == NULL )
182 SYSTEM_ERROR ( xcNoMemory, "allocating CSRA1_ReadIterator on '%.*s'", NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ) );
183 else
184 {
185 #if _DEBUGGING
186 char instname [ 256 ];
187 string_printf ( instname, sizeof instname, NULL, "%.*s", NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ) );
188 instname [ sizeof instname - 1 ] = 0;
189 #else
190 const char *instname = "";
191 #endif
192 TRY ( CSRA1_ReadIteratorInit ( ctx, cref, "CSRA1_ReadIterator", instname, run_name, wants_full, wants_partial, wants_unaligned ) )
193 {
194 ref = & cref -> dad;
195
196 ref -> curs = NGS_CursorDuplicate ( curs, ctx );
197 TRY ( NGS_CursorGetRowRange ( ref -> curs, ctx, & ref -> cur_row, & ref -> row_count ) )
198 {
199 ref -> row_max = ref -> cur_row + ref -> row_count;
200 return & ref -> dad;
201 }
202 CSRA1_ReadRelease ( cref, ctx );
203 return NULL;
204 }
205
206 free ( cref );
207 }
208
209 return NULL;
210 }
211
212 /* MakeRange
213 */
CSRA1_ReadIteratorMakeRange(ctx_t ctx,const NGS_Cursor * curs,const NGS_String * run_name,uint64_t first,uint64_t count,bool wants_full,bool wants_partial,bool wants_unaligned)214 NGS_Read * CSRA1_ReadIteratorMakeRange ( ctx_t ctx,
215 const NGS_Cursor * curs,
216 const NGS_String * run_name,
217 uint64_t first,
218 uint64_t count,
219 bool wants_full,
220 bool wants_partial,
221 bool wants_unaligned )
222 {
223 FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
224
225 CSRA1_Read * cref;
226 SRA_Read * ref;
227
228 assert ( curs != NULL );
229
230 cref = calloc ( 1, sizeof * ref );
231 if ( cref == NULL )
232 SYSTEM_ERROR ( xcNoMemory, "allocating CSRA1_ReadIterator on '%.*s'", NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ) );
233 else
234 {
235 #if _DEBUGGING
236 char instname [ 256 ];
237 string_printf ( instname, sizeof instname, NULL, "%.*s", NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ) );
238 instname [ sizeof instname - 1 ] = 0;
239 #else
240 const char *instname = "";
241 #endif
242 TRY ( CSRA1_ReadIteratorInit ( ctx, cref, "CSRA1_ReadIterator", instname, run_name, wants_full, wants_partial, wants_unaligned ) )
243 {
244 ref = & cref -> dad;
245
246 ref -> curs = NGS_CursorDuplicate ( curs, ctx );
247 TRY ( NGS_CursorGetRowRange ( ref -> curs, ctx, & ref -> cur_row, & ref -> row_count ) )
248 {
249 ref -> row_max = min ( first + count, ref -> cur_row + ref -> row_count );
250 ref -> cur_row = first;
251 return & ref -> dad;
252 }
253 CSRA1_ReadRelease ( cref, ctx );
254 return NULL;
255 }
256
257 free ( cref );
258 }
259
260 return NULL;
261 }
262
263 /* MakeReadGroup
264 */
CSRA1_ReadIteratorMakeReadGroup(ctx_t ctx,const NGS_Cursor * curs,const NGS_String * run_name,const NGS_String * group_name,uint64_t first,uint64_t count,bool wants_full,bool wants_partial,bool wants_unaligned)265 NGS_Read * CSRA1_ReadIteratorMakeReadGroup ( ctx_t ctx,
266 const NGS_Cursor * curs,
267 const NGS_String * run_name,
268 const NGS_String * group_name,
269 uint64_t first,
270 uint64_t count,
271 bool wants_full,
272 bool wants_partial,
273 bool wants_unaligned )
274 {
275 FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
276
277 TRY ( CSRA1_Read * cref = (CSRA1_Read*) CSRA1_ReadIteratorMakeRange ( ctx,
278 curs,
279 run_name,
280 first,
281 count,
282 wants_full,
283 wants_partial,
284 wants_unaligned ) )
285 {
286 SRA_Read * ref = & cref -> dad;
287 TRY ( ref -> group_name = NGS_StringDuplicate ( group_name, ctx ) )
288 {
289 return & ref -> dad;
290 }
291
292 CSRA1_ReadRelease ( cref, ctx );
293 }
294 return NULL;
295 }
296
CSRA1_ReadMake(ctx_t ctx,const NGS_Cursor * curs,int64_t readId,const struct NGS_String * run_name)297 NGS_Read * CSRA1_ReadMake ( ctx_t ctx, const NGS_Cursor * curs, int64_t readId, const struct NGS_String * run_name )
298 {
299 FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
300
301 CSRA1_Read * cref;
302 SRA_Read * ref;
303
304 assert ( curs != NULL );
305
306 cref = calloc ( 1, sizeof * cref );
307 if ( cref == NULL )
308 SYSTEM_ERROR ( xcNoMemory, "allocating SRA_Read(%lu) on '%.*s'", readId, NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ) );
309 else
310 {
311 #if _DEBUGGING
312 char instname [ 256 ];
313 string_printf ( instname, sizeof instname, NULL, "%.*s(%lu)", NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ), readId );
314 instname [ sizeof instname - 1 ] = 0;
315 #else
316 const char *instname = "";
317 #endif
318 ref = & cref -> dad;
319
320 TRY ( CSRA1_ReadInit ( ctx, ref, "CSRA1_Read", instname, run_name ) )
321 {
322 uint64_t row_count = NGS_CursorGetRowCount ( curs, ctx );
323
324 /* validate the requested rowId and seek to it */
325 if ( readId <= 0 || (uint64_t)readId > row_count )
326 {
327 INTERNAL_ERROR ( xcCursorAccessFailed, "rowId ( %li ) out of range for %.*s", readId, NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ) );
328 }
329 else
330 {
331 ref -> curs = NGS_CursorDuplicate ( curs, ctx );
332 ref -> cur_row = readId;
333 TRY ( SRA_ReadIteratorInitFragment ( cref , ctx ) )
334 {
335 ref -> row_max = readId + 1;
336 ref -> row_count = 1;
337 ref -> seen_first = true;
338 return & ref -> dad;
339 }
340 }
341
342 CSRA1_ReadRelease ( cref, ctx );
343 return NULL;
344 }
345 free ( cref );
346 }
347
348 return NULL;
349 }
350
CSRA1_FragmentIsAligned(CSRA1_Read * cself,ctx_t ctx)351 bool CSRA1_FragmentIsAligned ( CSRA1_Read * cself, ctx_t ctx )
352 {
353 FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcAccessing );
354 const SRA_Read * self;
355
356 assert ( cself != NULL );
357
358 self = & cself -> dad;
359
360 if ( ! self -> seen_first )
361 {
362 USER_ERROR ( xcIteratorUninitialized, "Read accessed before a call to nextRead()" );
363 return false;
364 }
365
366 if ( self -> cur_row >= self -> row_max )
367 {
368 USER_ERROR ( xcCursorExhausted, "No more rows available" );
369 return false;
370 }
371
372 if ( ! self -> seen_first_frag )
373 {
374 USER_ERROR ( xcIteratorUninitialized, "Fragment accessed before a call to nextFragment()" );
375 return false;
376 }
377
378 if ( self -> frag_idx >= self -> frag_max )
379 {
380 USER_ERROR ( xcCursorExhausted, "No more fragments available" );
381 return false;
382 }
383
384 {
385 const void * base;
386 uint32_t elem_bits, boff, row_len;
387 ON_FAIL ( NGS_CursorCellDataDirect ( self -> curs, ctx, self -> cur_row, seq_PRIMARY_ALIGNMENT_ID, & elem_bits, & base, & boff, & row_len ) )
388 {
389 CLEAR();
390 return false;
391 }
392
393 {
394 const int64_t * orig = base;
395 assert(elem_bits == 64);
396 assert(boff == 0);
397
398 return orig[self -> frag_idx] != 0;
399 }
400 }
401
402 }
403
CSRA1_ReadFragIsAligned(CSRA1_Read * cself,ctx_t ctx,uint32_t frag_idx)404 bool CSRA1_ReadFragIsAligned ( CSRA1_Read * cself, ctx_t ctx, uint32_t frag_idx )
405 {
406 FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcAccessing );
407 const SRA_Read * self;
408
409 assert ( cself != NULL );
410
411 self = & cself -> dad;
412
413 if ( ! self -> seen_first )
414 {
415 USER_ERROR ( xcIteratorUninitialized, "Read accessed before a call to nextRead()" );
416 return false;
417 }
418
419 if ( self -> cur_row >= self -> row_max )
420 {
421 USER_ERROR ( xcCursorExhausted, "No more rows available" );
422 return false;
423 }
424
425 if ( frag_idx >= self -> bio_frags )
426 {
427 USER_ERROR ( xcIntegerOutOfBounds, "bad fragment index" );
428 return false;
429 }
430
431 {
432 const void * base;
433 uint32_t elem_bits, boff, row_len;
434 TRY ( NGS_CursorCellDataDirect ( self -> curs, ctx, self -> cur_row, seq_PRIMARY_ALIGNMENT_ID, & elem_bits, & base, & boff, & row_len ) )
435 {
436 uint32_t idx, bidx;
437 const int64_t * orig = base;
438 assert ( base != NULL );
439 assert ( elem_bits == 64 );
440 assert ( boff == 0 );
441 assert ( row_len == self -> frag_max );
442
443 /* technically, we do not expect technical reads (fragments) within CSRA1,
444 but it is correct to check for this possibility
445 same applies to 0-length biological fragments (VDB-3132)
446 */
447 if ( self -> bio_frags == self -> frag_max )
448 return orig [ frag_idx ] != 0;
449
450 for ( idx = bidx = 0; idx < row_len; ++ idx )
451 {
452 if ( ( self -> READ_TYPE [ idx ] & READ_TYPE_BIOLOGICAL ) != 0 && self -> READ_LEN [ idx ] != 0 )
453 {
454 if ( bidx == frag_idx )
455 return orig [ idx ] != 0;
456
457 ++ bidx;
458 }
459 }
460 }
461 }
462
463 CLEAR();
464 return false;
465 }
466
467 bool
CSRA1_ReadIteratorNext(CSRA1_Read * cself,ctx_t ctx)468 CSRA1_ReadIteratorNext ( CSRA1_Read * cself, ctx_t ctx )
469 {
470 FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcAccessing );
471 SRA_Read * self;
472
473 assert ( cself != NULL );
474
475 self = & cself -> dad;
476
477 if ( self -> wants_full )
478 {
479 return SRA_ReadIteratorNext ( cself, ctx );
480 }
481
482 /* to iterate over partially aligned and unaligned reads, use column CMP_READ */
483
484 self -> seen_first_frag = false;
485 self -> seen_last_frag = false;
486
487 self -> cur_frag = 0;
488 self -> bio_frags = 0;
489 self -> frag_idx = 0;
490 self -> frag_max = 0;
491 self -> frag_start = 0;
492 self -> frag_len = 0;
493
494 self -> READ_TYPE = NULL;
495 self -> READ_LEN = NULL;
496
497 if ( self -> seen_first )
498 { /* move to next row */
499 ++ self -> cur_row;
500 }
501 else
502 {
503 self -> seen_first = true;
504 }
505
506 while ( self -> cur_row < self -> row_max )
507 {
508 enum NGS_ReadCategory cat;
509 NGS_String * read;
510 bool aligned;
511 ON_FAIL ( read = NGS_CursorGetString ( self -> curs, ctx, self -> cur_row, seq_CMP_READ ) )
512 return false;
513
514 aligned = NGS_StringSize ( read, ctx ) == 0;
515 NGS_StringRelease( read, ctx );
516 if ( aligned )
517 { // aligned read - skip
518 ++ self -> cur_row;
519 continue;
520 }
521
522 /* work the category filter, we know wants_full is false */
523 ON_FAIL ( cat = SRA_ReadGetCategory ( cself, ctx ) )
524 return false;
525
526 if ( ( cat == NGS_ReadCategory_partiallyAligned && ! self -> wants_partial )
527 ||
528 ( cat == NGS_ReadCategory_unaligned && ! self -> wants_unaligned ) )
529 {
530 ++ self -> cur_row;
531 continue;
532 }
533
534 /* work the read group filter if required */
535 if ( self -> group_name != NULL )
536 {
537 uint32_t size;
538
539 ON_FAIL ( NGS_String* group = NGS_CursorGetString ( self -> curs, ctx, self -> cur_row, seq_GROUP ) )
540 return false;
541
542 size = ( uint32_t ) NGS_StringSize ( group, ctx );
543 if ( string_cmp ( NGS_StringData ( self -> group_name, ctx ),
544 NGS_StringSize ( self -> group_name, ctx ),
545 NGS_StringData ( group, ctx ),
546 size,
547 size ) != 0 )
548 {
549 NGS_StringRelease ( group, ctx );
550 ++ self -> cur_row;
551 continue;
552 }
553 NGS_StringRelease ( group, ctx );
554 }
555
556 TRY ( SRA_ReadIteratorInitFragment ( cself, ctx ) )
557 {
558 return true;
559 }
560
561 break;
562 }
563
564 return false;
565 }
566