1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #include "pl-consensus.h"
28 #include <sysalloc.h>
29
30 #include <stdlib.h>
31 #include <stdio.h>
32 #include <string.h>
33
34
35 const char * consensus_tab_names[] =
36 {
37 /* base-space */
38 "READ",
39 "QUALITY",
40 "NREADS",
41 "READ_TYPE",
42 "READ_START",
43 "READ_LEN",
44 "(INSDC:SRA:platform_id)PLATFORM",
45 "READ_FILTER",
46
47 /* consensus-space */
48 "HOLE_NUMBER",
49 "HOLE_STATUS",
50 "HOLE_XY",
51 "NUM_PASSES",
52 "INSERTION_QV",
53 "DELETION_QV",
54 "DELETION_TAG",
55 "SUBSTITUTION_QV",
56 "SUBSTITUTION_TAG"
57 };
58
59
check_Consensus_totalcount(BaseCalls_cmn * tab,const uint64_t expected)60 static bool check_Consensus_totalcount( BaseCalls_cmn *tab, const uint64_t expected )
61 {
62 bool res = check_table_count( &tab->Basecall, "Basecall", expected );
63 if ( res )
64 res = check_table_count( &tab->QualityValue, "QualityValue", expected );
65
66 if ( res )
67 {
68 if ( tab->DeletionQV.extents != NULL )
69 res = check_table_count( &tab->DeletionQV, "DeletionQV", expected );
70
71 if ( tab->DeletionTag.extents != NULL )
72 res = check_table_count( &tab->DeletionTag, "DeletionTag", expected );
73
74 if ( tab->InsertionQV.extents != NULL )
75 res = check_table_count( &tab->InsertionQV, "InsertionQV", expected );
76
77 if ( tab->SubstitutionQV.extents != NULL )
78 res = check_table_count( &tab->SubstitutionQV, "SubstitutionQV", expected );
79
80 if ( tab->SubstitutionTag.extents != NULL )
81 res = check_table_count( &tab->SubstitutionTag, "SubstitutionTag", expected );
82 }
83
84 return res;
85 }
86
87
consensus_load_zero_bases(VCursor * cursor,const uint32_t * col_idx)88 static rc_t consensus_load_zero_bases( VCursor *cursor, const uint32_t *col_idx )
89 {
90 uint32_t dummy_src;
91 INSDC_SRA_read_filter filter = SRA_READ_FILTER_CRITERIA;
92
93 rc_t rc = vdb_write_value( cursor, col_idx[ consensus_tab_READ ],
94 &dummy_src, BASECALL_BITSIZE, 0, "consensus.Basecall" );
95 if ( rc == 0 )
96 rc = vdb_write_value( cursor, col_idx[ consensus_tab_QUALITY ],
97 &dummy_src, QUALITY_VALUE_BITSIZE, 0, "QualityValue" );
98 if ( rc == 0 )
99 rc = vdb_write_value( cursor, col_idx[ consensus_tab_INSERTION_QV ],
100 &dummy_src, INSERTION_QV_BITSIZE, 0, "consensus.InsertionQV" );
101 if ( rc == 0 )
102 rc = vdb_write_value( cursor, col_idx[ consensus_tab_DELETION_QV ],
103 &dummy_src, DELETION_QV_BITSIZE, 0, "consensus.DeletionQV" );
104 if ( rc == 0 )
105 rc = vdb_write_value( cursor, col_idx[ consensus_tab_DELETION_TAG ],
106 &dummy_src, DELETION_TAG_BITSIZE, 0, "consensus.DeletionTag" );
107 if ( rc == 0 )
108 rc = vdb_write_value( cursor, col_idx[ consensus_tab_SUBSTITUTION_QV ],
109 &dummy_src, SUBSTITUTION_QV_BITZISE, 0, "consensus.SubstitutionQV" );
110 if ( rc == 0 )
111 rc = vdb_write_value( cursor, col_idx[ consensus_tab_SUBSTITUTION_TAG ],
112 &dummy_src, SUBSTITUTION_TAG_BITSIZE, 0, "consensus.SubstitutionTag" );
113 if ( rc == 0 )
114 rc = vdb_write_value( cursor, col_idx[ consensus_tab_READ_FILTER ],
115 &filter, sizeof filter * 8, 1, "consensus.READ_FILTER" );
116 return rc;
117 }
118
119
consensus_load_spot_bases(VCursor * cursor,BaseCalls_cmn * tab,const uint32_t * col_idx,zmw_row * spot)120 static rc_t consensus_load_spot_bases( VCursor *cursor, BaseCalls_cmn *tab,
121 const uint32_t *col_idx, zmw_row * spot )
122 {
123 rc_t rc = 0;
124 uint32_t column_idx, dummy_src;
125
126 /* we make a buffer to store NumEvent 8-bit-values
127 (that is so far the biggest value we have to read per DNA-BASE) */
128 char * buffer = malloc( spot->NumEvent );
129 if ( buffer == NULL )
130 {
131 rc = RC( rcExe, rcNoTarg, rcAllocating, rcMemory, rcExhausted );
132 PLOGERR( klogErr, ( klogErr, rc, "cannot allocate $(numbytes) to read seq-data",
133 "numbytes=%u", spot->NumEvent ) );
134 }
135 if ( rc == 0 )
136 rc = transfer_bits( cursor, col_idx[ consensus_tab_READ ],
137 &tab->Basecall, buffer, spot->offset, spot->NumEvent,
138 BASECALL_BITSIZE, "consensus.Basecall" );
139 if ( rc == 0 )
140 rc = transfer_bits( cursor, col_idx[ consensus_tab_QUALITY ],
141 &tab->QualityValue, buffer, spot->offset, spot->NumEvent,
142 QUALITY_VALUE_BITSIZE, "consensus.QualityValue" );
143
144 if ( rc == 0 )
145 {
146 column_idx = col_idx[ consensus_tab_INSERTION_QV ];
147 if ( tab->InsertionQV.extents != NULL )
148 rc = transfer_bits( cursor, column_idx,
149 &tab->InsertionQV, buffer, spot->offset, spot->NumEvent,
150 INSERTION_QV_BITSIZE, "consensus.InsertionQV" );
151 else
152 rc = vdb_write_value( cursor, column_idx,
153 &dummy_src, INSERTION_QV_BITSIZE, 0, "consensus.InsertionQV" );
154 }
155
156 if ( rc == 0 )
157 {
158 column_idx = col_idx[ consensus_tab_DELETION_QV ];
159 if ( tab->DeletionQV.extents != NULL )
160 rc = transfer_bits( cursor, column_idx,
161 &tab->DeletionQV, buffer, spot->offset, spot->NumEvent,
162 DELETION_QV_BITSIZE, "consensus.DeletionQV" );
163 else
164 rc = vdb_write_value( cursor, column_idx,
165 &dummy_src, DELETION_QV_BITSIZE, 0, "consensus.DeletionQV" );
166 }
167
168 if ( rc == 0 )
169 {
170 column_idx = col_idx[ consensus_tab_DELETION_TAG ];
171 if ( tab->DeletionTag.extents != NULL )
172 rc = transfer_bits( cursor, column_idx,
173 &tab->DeletionTag, buffer, spot->offset, spot->NumEvent,
174 DELETION_TAG_BITSIZE, "consensus.DeletionTag" );
175 else
176 rc = vdb_write_value( cursor, column_idx,
177 &dummy_src, DELETION_TAG_BITSIZE, 0, "consensus.DeletionTag" );
178 }
179
180 if ( rc == 0 )
181 {
182 column_idx = col_idx[ consensus_tab_SUBSTITUTION_QV ];
183 if ( tab->SubstitutionQV.extents != NULL )
184 rc = transfer_bits( cursor, column_idx,
185 &tab->SubstitutionQV, buffer, spot->offset, spot->NumEvent,
186 SUBSTITUTION_QV_BITZISE, "consensus.SubstitutionQV" );
187 else
188 rc = vdb_write_value( cursor, column_idx,
189 &dummy_src, SUBSTITUTION_QV_BITZISE, 0, "consensus.SubstitutionQV" );
190 }
191
192 if ( rc == 0 )
193 {
194 column_idx = col_idx[ consensus_tab_SUBSTITUTION_TAG ];
195 if ( tab->SubstitutionTag.extents != NULL )
196 rc = transfer_bits( cursor, column_idx,
197 &tab->SubstitutionTag, buffer, spot->offset, spot->NumEvent,
198 SUBSTITUTION_TAG_BITSIZE, "consensus.SubstitutionTag" );
199 else
200 rc = vdb_write_value( cursor, column_idx,
201 &dummy_src, SUBSTITUTION_TAG_BITSIZE, 0, "consensus.SubstitutionTag" );
202 }
203
204 if ( buffer != NULL )
205 free( buffer );
206 return rc;
207 }
208
209
consensus_load_spot(VCursor * cursor,const uint32_t * col_idx,region_type_mapping * mapping,zmw_row * spot,void * data)210 static rc_t consensus_load_spot( VCursor *cursor, const uint32_t *col_idx,
211 region_type_mapping *mapping, zmw_row * spot,
212 void * data )
213 {
214 rc_t rc = 0;
215 if ( spot->NumEvent > 0 )
216 {
217 BaseCalls_cmn *tab = (BaseCalls_cmn *)data;
218 rc = VCursorOpenRow( cursor );
219 if ( rc != 0 )
220 PLOGERR( klogErr, ( klogErr, rc, "cannot open consensus-row on spot# $(spotnr)",
221 "spotnr=%u", spot->spot_nr ) );
222
223 if ( rc == 0 )
224 rc = vdb_write_uint32( cursor, col_idx[ consensus_tab_HOLE_NUMBER ],
225 spot->HoleNumber, "consensus.HOLE_NUMBER" );
226 if ( rc == 0 )
227 rc = vdb_write_uint8( cursor, col_idx[ consensus_tab_HOLE_STATUS ],
228 spot->HoleStatus, "consensus.HOLE_STATUS" );
229 if ( rc == 0 )
230 rc = vdb_write_value( cursor, col_idx[ consensus_tab_HOLE_XY ],
231 &spot->HoleXY, HOLE_XY_BITSIZE, 2, "consensus.HOLE_XY" );
232
233 /* has to be read ... from "PulseData/ConsensusBaesCalls/Passes/NumPasses" */
234 if ( rc == 0 )
235 rc = vdb_write_uint32( cursor, col_idx[ consensus_tab_NUM_PASSES ],
236 spot->NumPasses, "consensus.NUM_PASSES" );
237
238 if ( rc == 0 )
239 {
240 if ( spot->NumEvent > 0 )
241 rc = consensus_load_spot_bases( cursor, tab, col_idx, spot );
242 else
243 rc = consensus_load_zero_bases( cursor, col_idx );
244 }
245
246 if ( rc == 0 )
247 rc = vdb_write_uint8( cursor, col_idx[ consensus_tab_NREADS ],
248 1, "consensus.NREADS" );
249 if ( rc == 0 )
250 rc = vdb_write_uint32( cursor, col_idx[ consensus_tab_READ_START ],
251 0, "consensus.READ_START" );
252 if ( rc == 0 )
253 rc = vdb_write_uint32( cursor, col_idx[ consensus_tab_READ_LEN ],
254 spot->NumEvent, "consensus.READ_LEN" );
255 if ( rc == 0 )
256 rc = vdb_write_uint8( cursor, col_idx[ consensus_tab_READ_TYPE ],
257 SRA_READ_TYPE_BIOLOGICAL, "consensus.READ_TYPE" );
258
259 if ( rc == 0 )
260 {
261 rc = VCursorCommitRow( cursor );
262 if ( rc != 0 )
263 PLOGERR( klogErr, ( klogErr, rc, "cannot commit consensus-row on spot# $(spotnr)",
264 "spotnr=%u", spot->spot_nr ) );
265 }
266
267 if ( rc == 0 )
268 {
269 rc = VCursorCloseRow( cursor );
270 if ( rc != 0 )
271 PLOGERR( klogErr, ( klogErr, rc, "cannot close consensus-row on spot# $(spotnr)",
272 "spotnr=%u", spot->spot_nr ) );
273
274 }
275 }
276 return rc;
277 }
278
279
consensus_loader(ld_context * lctx,KDirectory * hdf5_src,VCursor * cursor,const char * table_name)280 static rc_t consensus_loader( ld_context *lctx, KDirectory * hdf5_src, VCursor * cursor, const char * table_name )
281 {
282 uint32_t col_idx[ consensus_tab_count ];
283 rc_t rc = add_columns( cursor, consensus_tab_count, -1, col_idx, consensus_tab_names );
284 if ( rc == 0 )
285 {
286 rc = VCursorOpen( cursor );
287 if ( rc != 0 )
288 LOGERR( klogErr, rc, "cannot open cursor on consensus-table" );
289
290 else
291 {
292 BaseCalls_cmn ConsensusTab;
293 const INSDC_SRA_platform_id platform = SRA_PLATFORM_PACBIO_SMRT;
294
295 rc = VCursorDefault ( cursor, col_idx[ consensus_tab_PLATFORM ],
296 sizeof platform * 8, &platform, 0, 1 );
297 if ( rc != 0 )
298 LOGERR( klogErr, rc, "cannot set cursor-default on consensus-table for platform-column" );
299 else
300 {
301 const INSDC_SRA_read_filter filter = SRA_READ_FILTER_PASS;
302 rc = VCursorDefault ( cursor, col_idx[ consensus_tab_READ_FILTER ],
303 sizeof filter * 8, &filter, 0, 1 );
304 if ( rc != 0 )
305 LOGERR( klogErr, rc, "cannot set cursor-default on consensus-table for read-filter-column" );
306 }
307
308 if ( rc == 0 )
309 rc = open_BaseCalls_cmn( hdf5_src, &ConsensusTab, true,
310 "PulseData/ConsensusBaseCalls", lctx->cache_content, true );
311
312 if ( rc == 0 )
313 {
314 uint64_t total_bases = zmw_total( &ConsensusTab.zmw );
315 uint64_t total_spots = ConsensusTab.zmw.NumEvent.extents[ 0 ];
316
317 KLogLevel tmp_lvl = KLogLevelGet();
318 KLogLevelSet( klogInfo );
319 PLOGMSG( klogInfo, ( klogInfo,
320 "loading consensus-table ( $(bases) bases / $(spots) spots ):",
321 "bases=%lu,spots=%lu", total_bases, total_spots ));
322 KLogLevelSet( tmp_lvl );
323
324 if ( check_Consensus_totalcount( &ConsensusTab, total_bases ) )
325 {
326 rc = zmw_for_each( &ConsensusTab.zmw, &lctx->xml_progress, cursor,
327 lctx->with_progress, col_idx, NULL,
328 true, consensus_load_spot, &ConsensusTab );
329 }
330 else
331 {
332 rc = RC( rcExe, rcNoTarg, rcAllocating, rcParam, rcInvalid );
333 }
334 close_BaseCalls_cmn( &ConsensusTab );
335 }
336 }
337 }
338 return rc;
339 }
340
341
342 /* HDF5-Groups and tables used to load the CONSENSUS-table */
343 static const char * consensus_groups_to_check[] =
344 {
345 "PulseData",
346 "PulseData/ConsensusBaseCalls",
347 "PulseData/ConsensusBaseCalls/ZMW",
348 "PulseData/ConsensusBaseCalls/Passes",
349 NULL
350 };
351
352
353 static const char * consensus_tables_to_check[] =
354 {
355 "PulseData/ConsensusBaseCalls/Basecall",
356 "PulseData/ConsensusBaseCalls/DeletionQV",
357 "PulseData/ConsensusBaseCalls/DeletionTag",
358 "PulseData/ConsensusBaseCalls/InsertionQV",
359 "PulseData/ConsensusBaseCalls/QualityValue",
360 "PulseData/ConsensusBaseCalls/SubstitutionQV",
361 "PulseData/ConsensusBaseCalls/SubstitutionTag",
362 "PulseData/ConsensusBaseCalls/ZMW/HoleNumber",
363 "PulseData/ConsensusBaseCalls/ZMW/HoleStatus",
364 "PulseData/ConsensusBaseCalls/ZMW/HoleXY",
365 "PulseData/ConsensusBaseCalls/ZMW/NumEvent",
366 "PulseData/ConsensusBaseCalls/Passes/NumPasses",
367 NULL
368 };
369
370
371 static const char * consensus_schema_template = "CONSENSUS";
372 static const char * consensus_table_to_create = "CONSENSUS";
373
374
prepare_consensus(VDatabase * database,con_ctx * sctx,ld_context * lctx)375 rc_t prepare_consensus( VDatabase * database, con_ctx * sctx, ld_context *lctx )
376 {
377 rc_t rc = prepare_table( database, &sctx->cursor,
378 consensus_schema_template, consensus_table_to_create ); /* pl-tools.c ... this creates the cursor */
379 if ( rc == 0 )
380 {
381 rc = add_columns( sctx->cursor, consensus_tab_count, -1, sctx->col_idx, consensus_tab_names );
382 if ( rc == 0 )
383 {
384 rc = VCursorOpen( sctx->cursor );
385 if ( rc != 0 )
386 {
387 LOGERR( klogErr, rc, "cannot open cursor on consensus-table" );
388 }
389 else
390 {
391 const INSDC_SRA_platform_id platform = SRA_PLATFORM_PACBIO_SMRT;
392
393 rc = VCursorDefault ( sctx->cursor, sctx->col_idx[ consensus_tab_PLATFORM ],
394 sizeof platform * 8, &platform, 0, 1 );
395 if ( rc != 0 )
396 {
397 LOGERR( klogErr, rc, "cannot set cursor-default on consensus-table for platform-column" );
398 }
399 else
400 {
401 const INSDC_SRA_read_filter filter = SRA_READ_FILTER_PASS;
402 rc = VCursorDefault ( sctx->cursor, sctx->col_idx[ consensus_tab_READ_FILTER ],
403 sizeof filter * 8, &filter, 0, 1 );
404 if ( rc != 0 )
405 {
406 LOGERR( klogErr, rc, "cannot set cursor-default on consensus-table for read-filter-column" );
407 }
408 else
409 {
410 sctx->lctx = lctx;
411 }
412 }
413 }
414 }
415 }
416 return rc;
417 }
418
419
load_consensus_src(con_ctx * sctx,KDirectory * hdf5_src)420 rc_t load_consensus_src( con_ctx * sctx, KDirectory * hdf5_src )
421 {
422 BaseCalls_cmn ConsensusTab;
423
424 rc_t rc = 0;
425 if ( sctx->lctx->check_src_obj )
426 rc = check_src_objects( hdf5_src, consensus_groups_to_check,
427 consensus_tables_to_check, false );
428 if ( rc == 0 )
429 rc = open_BaseCalls_cmn( hdf5_src, &ConsensusTab, true,
430 "PulseData/ConsensusBaseCalls", sctx->lctx->cache_content, true );
431 if ( rc == 0 )
432 {
433 uint64_t total_bases = zmw_total( &ConsensusTab.zmw );
434 uint64_t total_spots = ConsensusTab.zmw.NumEvent.extents[ 0 ];
435
436 KLogLevel tmp_lvl = KLogLevelGet();
437 KLogLevelSet( klogInfo );
438 PLOGMSG( klogInfo, ( klogInfo,
439 "loading consensus-table ( $(bases) bases / $(spots) spots ):",
440 "bases=%lu,spots=%lu", total_bases, total_spots ));
441 KLogLevelSet( tmp_lvl );
442
443 if ( !check_Consensus_totalcount( &ConsensusTab, total_bases ) )
444 rc = RC( rcExe, rcNoTarg, rcAllocating, rcParam, rcInvalid );
445 else
446 rc = zmw_for_each( &ConsensusTab.zmw, &sctx->lctx->xml_progress, sctx->cursor,
447 sctx->lctx->with_progress, sctx->col_idx, NULL,
448 true, consensus_load_spot, &ConsensusTab );
449 close_BaseCalls_cmn( &ConsensusTab );
450 }
451 return rc;
452 }
453
454
finish_consensus(con_ctx * sctx)455 rc_t finish_consensus( con_ctx * sctx )
456 {
457 VCursorRelease( sctx->cursor );
458 return 0;
459 }
460
461
load_consensus(VDatabase * database,KDirectory * hdf5_src,ld_context * lctx)462 rc_t load_consensus( VDatabase * database, KDirectory * hdf5_src, ld_context *lctx )
463 {
464 rc_t rc = 0;
465 if ( lctx->check_src_obj )
466 rc = check_src_objects( hdf5_src, consensus_groups_to_check,
467 consensus_tables_to_check, false );
468 if ( rc == 0 )
469 rc = load_table( database, hdf5_src, lctx, consensus_schema_template,
470 consensus_table_to_create, consensus_loader );
471 return rc;
472 }
473