1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 
27 #include "pl-consensus.h"
28 #include <sysalloc.h>
29 
30 #include <stdlib.h>
31 #include <stdio.h>
32 #include <string.h>
33 
34 
35 const char * consensus_tab_names[] =
36 {
37     /* base-space */
38     "READ",
39     "QUALITY",
40     "NREADS",
41     "READ_TYPE",
42     "READ_START",
43     "READ_LEN",
44     "(INSDC:SRA:platform_id)PLATFORM",
45     "READ_FILTER",
46 
47     /* consensus-space */
48     "HOLE_NUMBER",
49     "HOLE_STATUS",
50     "HOLE_XY",
51     "NUM_PASSES",
52     "INSERTION_QV",
53     "DELETION_QV",
54     "DELETION_TAG",
55     "SUBSTITUTION_QV",
56     "SUBSTITUTION_TAG"
57 };
58 
59 
check_Consensus_totalcount(BaseCalls_cmn * tab,const uint64_t expected)60 static bool check_Consensus_totalcount( BaseCalls_cmn *tab, const uint64_t expected )
61 {
62     bool res = check_table_count( &tab->Basecall, "Basecall", expected );
63     if ( res )
64         res = check_table_count( &tab->QualityValue, "QualityValue", expected );
65 
66 	if ( res )
67 	{
68 		if ( tab->DeletionQV.extents != NULL )
69 			res = check_table_count( &tab->DeletionQV, "DeletionQV", expected );
70 
71 		if ( tab->DeletionTag.extents != NULL )
72 			res = check_table_count( &tab->DeletionTag, "DeletionTag", expected );
73 
74 		if ( tab->InsertionQV.extents != NULL )
75 			res = check_table_count( &tab->InsertionQV, "InsertionQV", expected );
76 
77 		if ( tab->SubstitutionQV.extents != NULL )
78 			res = check_table_count( &tab->SubstitutionQV, "SubstitutionQV", expected );
79 
80 		if ( tab->SubstitutionTag.extents != NULL )
81 			res = check_table_count( &tab->SubstitutionTag, "SubstitutionTag", expected );
82 	}
83 
84     return res;
85 }
86 
87 
consensus_load_zero_bases(VCursor * cursor,const uint32_t * col_idx)88 static rc_t consensus_load_zero_bases( VCursor *cursor, const uint32_t *col_idx )
89 {
90     uint32_t dummy_src;
91     INSDC_SRA_read_filter filter = SRA_READ_FILTER_CRITERIA;
92 
93     rc_t rc = vdb_write_value( cursor, col_idx[ consensus_tab_READ ],
94                                &dummy_src, BASECALL_BITSIZE, 0, "consensus.Basecall" );
95     if ( rc == 0 )
96         rc = vdb_write_value( cursor, col_idx[ consensus_tab_QUALITY ],
97                                &dummy_src, QUALITY_VALUE_BITSIZE, 0, "QualityValue" );
98     if ( rc == 0 )
99         rc = vdb_write_value( cursor, col_idx[ consensus_tab_INSERTION_QV ],
100                                &dummy_src, INSERTION_QV_BITSIZE, 0, "consensus.InsertionQV" );
101     if ( rc == 0 )
102         rc = vdb_write_value( cursor, col_idx[ consensus_tab_DELETION_QV ],
103                                &dummy_src, DELETION_QV_BITSIZE, 0, "consensus.DeletionQV" );
104     if ( rc == 0 )
105         rc = vdb_write_value( cursor, col_idx[ consensus_tab_DELETION_TAG ],
106                                &dummy_src, DELETION_TAG_BITSIZE, 0, "consensus.DeletionTag" );
107     if ( rc == 0 )
108         rc = vdb_write_value( cursor, col_idx[ consensus_tab_SUBSTITUTION_QV ],
109                                &dummy_src, SUBSTITUTION_QV_BITZISE, 0, "consensus.SubstitutionQV" );
110     if ( rc == 0 )
111         rc = vdb_write_value( cursor, col_idx[ consensus_tab_SUBSTITUTION_TAG ],
112                                &dummy_src, SUBSTITUTION_TAG_BITSIZE, 0, "consensus.SubstitutionTag" );
113     if ( rc == 0 )
114         rc = vdb_write_value( cursor, col_idx[ consensus_tab_READ_FILTER ],
115                                &filter, sizeof filter * 8, 1, "consensus.READ_FILTER" );
116     return rc;
117 }
118 
119 
consensus_load_spot_bases(VCursor * cursor,BaseCalls_cmn * tab,const uint32_t * col_idx,zmw_row * spot)120 static rc_t consensus_load_spot_bases( VCursor *cursor, BaseCalls_cmn *tab,
121                                        const uint32_t *col_idx, zmw_row * spot )
122 {
123     rc_t rc = 0;
124 	uint32_t column_idx, dummy_src;
125 
126     /* we make a buffer to store NumEvent 8-bit-values
127       (that is so far the biggest value we have to read per DNA-BASE) */
128     char * buffer = malloc( spot->NumEvent );
129     if ( buffer == NULL )
130     {
131         rc = RC( rcExe, rcNoTarg, rcAllocating, rcMemory, rcExhausted );
132         PLOGERR( klogErr, ( klogErr, rc, "cannot allocate $(numbytes) to read seq-data",
133                             "numbytes=%u", spot->NumEvent ) );
134     }
135     if ( rc == 0 )
136         rc = transfer_bits( cursor, col_idx[ consensus_tab_READ ],
137             &tab->Basecall, buffer, spot->offset, spot->NumEvent,
138             BASECALL_BITSIZE, "consensus.Basecall" );
139     if ( rc == 0 )
140         rc = transfer_bits( cursor, col_idx[ consensus_tab_QUALITY ],
141             &tab->QualityValue, buffer, spot->offset, spot->NumEvent,
142             QUALITY_VALUE_BITSIZE, "consensus.QualityValue" );
143 
144     if ( rc == 0 )
145 	{
146 		column_idx = col_idx[ consensus_tab_INSERTION_QV ];
147 		if ( tab->InsertionQV.extents != NULL )
148 			rc = transfer_bits( cursor, column_idx,
149 				&tab->InsertionQV, buffer, spot->offset, spot->NumEvent,
150 				INSERTION_QV_BITSIZE, "consensus.InsertionQV" );
151 		else
152 			rc = vdb_write_value( cursor, column_idx,
153 					&dummy_src, INSERTION_QV_BITSIZE, 0, "consensus.InsertionQV" );
154 	}
155 
156     if ( rc == 0 )
157 	{
158 		column_idx = col_idx[ consensus_tab_DELETION_QV ];
159 		if ( tab->DeletionQV.extents != NULL )
160 			rc = transfer_bits( cursor, column_idx,
161 				&tab->DeletionQV, buffer, spot->offset, spot->NumEvent,
162 				DELETION_QV_BITSIZE, "consensus.DeletionQV" );
163 		else
164 			rc = vdb_write_value( cursor, column_idx,
165 				   &dummy_src, DELETION_QV_BITSIZE, 0, "consensus.DeletionQV" );
166 	}
167 
168     if ( rc == 0 )
169 	{
170 		column_idx = col_idx[ consensus_tab_DELETION_TAG ];
171 		if ( tab->DeletionTag.extents != NULL )
172 			rc = transfer_bits( cursor, column_idx,
173 				&tab->DeletionTag, buffer, spot->offset, spot->NumEvent,
174 				DELETION_TAG_BITSIZE, "consensus.DeletionTag" );
175 		else
176 			rc = vdb_write_value( cursor, column_idx,
177 				   &dummy_src, DELETION_TAG_BITSIZE, 0, "consensus.DeletionTag" );
178 	}
179 
180     if ( rc == 0 )
181 	{
182 		column_idx = col_idx[ consensus_tab_SUBSTITUTION_QV ];
183 		if ( tab->SubstitutionQV.extents != NULL )
184 			rc = transfer_bits( cursor, column_idx,
185 				&tab->SubstitutionQV, buffer, spot->offset, spot->NumEvent,
186 				SUBSTITUTION_QV_BITZISE, "consensus.SubstitutionQV" );
187 		else
188 			rc = vdb_write_value( cursor, column_idx,
189 					&dummy_src, SUBSTITUTION_QV_BITZISE, 0, "consensus.SubstitutionQV" );
190 	}
191 
192     if ( rc == 0 )
193 	{
194 		column_idx = col_idx[ consensus_tab_SUBSTITUTION_TAG ];
195 		if ( tab->SubstitutionTag.extents != NULL )
196 			rc = transfer_bits( cursor, column_idx,
197 				&tab->SubstitutionTag, buffer, spot->offset, spot->NumEvent,
198 				SUBSTITUTION_TAG_BITSIZE, "consensus.SubstitutionTag" );
199 		else
200 			rc = vdb_write_value( cursor, column_idx,
201 				   &dummy_src, SUBSTITUTION_TAG_BITSIZE, 0, "consensus.SubstitutionTag" );
202 	}
203 
204     if ( buffer != NULL )
205         free( buffer );
206     return rc;
207 }
208 
209 
consensus_load_spot(VCursor * cursor,const uint32_t * col_idx,region_type_mapping * mapping,zmw_row * spot,void * data)210 static rc_t consensus_load_spot( VCursor *cursor, const uint32_t *col_idx,
211                                  region_type_mapping *mapping, zmw_row * spot,
212                                  void * data )
213 {
214 	rc_t rc = 0;
215 	if ( spot->NumEvent > 0 )
216 	{
217 		BaseCalls_cmn *tab = (BaseCalls_cmn *)data;
218 		rc = VCursorOpenRow( cursor );
219 		if ( rc != 0 )
220 			PLOGERR( klogErr, ( klogErr, rc, "cannot open consensus-row on spot# $(spotnr)",
221 								"spotnr=%u", spot->spot_nr ) );
222 
223 		if ( rc == 0 )
224 			rc = vdb_write_uint32( cursor, col_idx[ consensus_tab_HOLE_NUMBER ],
225 								   spot->HoleNumber, "consensus.HOLE_NUMBER" );
226 		if ( rc == 0 )
227 			rc = vdb_write_uint8( cursor, col_idx[ consensus_tab_HOLE_STATUS ],
228 								  spot->HoleStatus, "consensus.HOLE_STATUS" );
229 		if ( rc == 0 )
230 			rc = vdb_write_value( cursor, col_idx[ consensus_tab_HOLE_XY ],
231 								  &spot->HoleXY, HOLE_XY_BITSIZE, 2, "consensus.HOLE_XY" );
232 
233 		/* has to be read ... from "PulseData/ConsensusBaesCalls/Passes/NumPasses" */
234 		if ( rc == 0 )
235 			rc = vdb_write_uint32( cursor, col_idx[ consensus_tab_NUM_PASSES ],
236 								   spot->NumPasses, "consensus.NUM_PASSES" );
237 
238 		if ( rc == 0 )
239 		{
240 			if ( spot->NumEvent > 0 )
241 				rc = consensus_load_spot_bases( cursor, tab, col_idx, spot );
242 			else
243 				rc = consensus_load_zero_bases( cursor, col_idx );
244 		}
245 
246 		if ( rc == 0 )
247 			rc = vdb_write_uint8( cursor, col_idx[ consensus_tab_NREADS ],
248 								  1, "consensus.NREADS" );
249 		if ( rc == 0 )
250 			rc = vdb_write_uint32( cursor, col_idx[ consensus_tab_READ_START ],
251 								   0, "consensus.READ_START" );
252 		if ( rc == 0 )
253 			rc = vdb_write_uint32( cursor, col_idx[ consensus_tab_READ_LEN ],
254 								   spot->NumEvent, "consensus.READ_LEN" );
255 		if ( rc == 0 )
256 			rc = vdb_write_uint8( cursor, col_idx[ consensus_tab_READ_TYPE ],
257 								  SRA_READ_TYPE_BIOLOGICAL, "consensus.READ_TYPE" );
258 
259 		if ( rc == 0 )
260 		{
261 			rc = VCursorCommitRow( cursor );
262 			if ( rc != 0 )
263 				PLOGERR( klogErr, ( klogErr, rc, "cannot commit consensus-row on spot# $(spotnr)",
264 									"spotnr=%u", spot->spot_nr ) );
265 		}
266 
267 		if ( rc == 0 )
268 		{
269 			rc = VCursorCloseRow( cursor );
270 			if ( rc != 0 )
271 				PLOGERR( klogErr, ( klogErr, rc, "cannot close consensus-row on spot# $(spotnr)",
272 									"spotnr=%u", spot->spot_nr ) );
273 
274 		}
275 	}
276     return rc;
277 }
278 
279 
consensus_loader(ld_context * lctx,KDirectory * hdf5_src,VCursor * cursor,const char * table_name)280 static rc_t consensus_loader( ld_context *lctx, KDirectory * hdf5_src, VCursor * cursor, const char * table_name )
281 {
282     uint32_t col_idx[ consensus_tab_count ];
283     rc_t rc = add_columns( cursor, consensus_tab_count, -1, col_idx, consensus_tab_names );
284     if ( rc == 0 )
285     {
286         rc = VCursorOpen( cursor );
287         if ( rc != 0 )
288             LOGERR( klogErr, rc, "cannot open cursor on consensus-table" );
289 
290         else
291         {
292             BaseCalls_cmn ConsensusTab;
293             const INSDC_SRA_platform_id platform = SRA_PLATFORM_PACBIO_SMRT;
294 
295             rc = VCursorDefault ( cursor, col_idx[ consensus_tab_PLATFORM ],
296                                   sizeof platform * 8, &platform, 0, 1 );
297             if ( rc != 0 )
298                 LOGERR( klogErr, rc, "cannot set cursor-default on consensus-table for platform-column" );
299             else
300             {
301                 const INSDC_SRA_read_filter filter = SRA_READ_FILTER_PASS;
302                 rc = VCursorDefault ( cursor, col_idx[ consensus_tab_READ_FILTER ],
303                                   sizeof filter * 8, &filter, 0, 1 );
304                 if ( rc != 0 )
305                     LOGERR( klogErr, rc, "cannot set cursor-default on consensus-table for read-filter-column" );
306             }
307 
308             if ( rc == 0 )
309                 rc = open_BaseCalls_cmn( hdf5_src, &ConsensusTab, true,
310                                      "PulseData/ConsensusBaseCalls", lctx->cache_content, true );
311 
312             if ( rc == 0 )
313             {
314                 uint64_t total_bases = zmw_total( &ConsensusTab.zmw );
315                 uint64_t total_spots = ConsensusTab.zmw.NumEvent.extents[ 0 ];
316 
317                 KLogLevel tmp_lvl = KLogLevelGet();
318                 KLogLevelSet( klogInfo );
319                 PLOGMSG( klogInfo, ( klogInfo,
320                          "loading consensus-table ( $(bases) bases / $(spots) spots ):",
321                          "bases=%lu,spots=%lu", total_bases, total_spots ));
322                 KLogLevelSet( tmp_lvl );
323 
324                 if ( check_Consensus_totalcount( &ConsensusTab, total_bases ) )
325 				{
326                     rc = zmw_for_each( &ConsensusTab.zmw, &lctx->xml_progress, cursor,
327                                        lctx->with_progress, col_idx, NULL,
328                                        true, consensus_load_spot, &ConsensusTab );
329 				}
330                 else
331 				{
332                     rc = RC( rcExe, rcNoTarg, rcAllocating, rcParam, rcInvalid );
333 				}
334                 close_BaseCalls_cmn( &ConsensusTab );
335             }
336         }
337     }
338     return rc;
339 }
340 
341 
342 /* HDF5-Groups and tables used to load the CONSENSUS-table */
343 static const char * consensus_groups_to_check[] =
344 {
345     "PulseData",
346     "PulseData/ConsensusBaseCalls",
347     "PulseData/ConsensusBaseCalls/ZMW",
348     "PulseData/ConsensusBaseCalls/Passes",
349     NULL
350 };
351 
352 
353 static const char * consensus_tables_to_check[] =
354 {
355     "PulseData/ConsensusBaseCalls/Basecall",
356     "PulseData/ConsensusBaseCalls/DeletionQV",
357     "PulseData/ConsensusBaseCalls/DeletionTag",
358     "PulseData/ConsensusBaseCalls/InsertionQV",
359     "PulseData/ConsensusBaseCalls/QualityValue",
360     "PulseData/ConsensusBaseCalls/SubstitutionQV",
361     "PulseData/ConsensusBaseCalls/SubstitutionTag",
362     "PulseData/ConsensusBaseCalls/ZMW/HoleNumber",
363     "PulseData/ConsensusBaseCalls/ZMW/HoleStatus",
364     "PulseData/ConsensusBaseCalls/ZMW/HoleXY",
365     "PulseData/ConsensusBaseCalls/ZMW/NumEvent",
366     "PulseData/ConsensusBaseCalls/Passes/NumPasses",
367     NULL
368 };
369 
370 
371 static const char * consensus_schema_template = "CONSENSUS";
372 static const char * consensus_table_to_create = "CONSENSUS";
373 
374 
prepare_consensus(VDatabase * database,con_ctx * sctx,ld_context * lctx)375 rc_t prepare_consensus( VDatabase * database, con_ctx * sctx, ld_context *lctx )
376 {
377     rc_t rc = prepare_table( database, &sctx->cursor,
378             consensus_schema_template, consensus_table_to_create ); /* pl-tools.c ... this creates the cursor */
379     if ( rc == 0 )
380     {
381         rc = add_columns( sctx->cursor, consensus_tab_count, -1, sctx->col_idx, consensus_tab_names );
382         if ( rc == 0 )
383         {
384             rc = VCursorOpen( sctx->cursor );
385             if ( rc != 0 )
386             {
387                 LOGERR( klogErr, rc, "cannot open cursor on consensus-table" );
388             }
389             else
390             {
391                 const INSDC_SRA_platform_id platform = SRA_PLATFORM_PACBIO_SMRT;
392 
393                 rc = VCursorDefault ( sctx->cursor, sctx->col_idx[ consensus_tab_PLATFORM ],
394                                       sizeof platform * 8, &platform, 0, 1 );
395                 if ( rc != 0 )
396                 {
397                     LOGERR( klogErr, rc, "cannot set cursor-default on consensus-table for platform-column" );
398                 }
399                 else
400                 {
401                     const INSDC_SRA_read_filter filter = SRA_READ_FILTER_PASS;
402                     rc = VCursorDefault ( sctx->cursor, sctx->col_idx[ consensus_tab_READ_FILTER ],
403                                       sizeof filter * 8, &filter, 0, 1 );
404                     if ( rc != 0 )
405                     {
406                         LOGERR( klogErr, rc, "cannot set cursor-default on consensus-table for read-filter-column" );
407                     }
408                     else
409                     {
410                         sctx->lctx = lctx;
411                     }
412                 }
413             }
414         }
415     }
416     return rc;
417 }
418 
419 
load_consensus_src(con_ctx * sctx,KDirectory * hdf5_src)420 rc_t load_consensus_src( con_ctx * sctx, KDirectory * hdf5_src )
421 {
422     BaseCalls_cmn ConsensusTab;
423 
424     rc_t rc = 0;
425     if ( sctx->lctx->check_src_obj )
426         rc = check_src_objects( hdf5_src, consensus_groups_to_check,
427                                 consensus_tables_to_check, false );
428     if ( rc == 0 )
429         rc = open_BaseCalls_cmn( hdf5_src, &ConsensusTab, true,
430                                  "PulseData/ConsensusBaseCalls", sctx->lctx->cache_content, true );
431     if ( rc == 0 )
432     {
433         uint64_t total_bases = zmw_total( &ConsensusTab.zmw );
434         uint64_t total_spots = ConsensusTab.zmw.NumEvent.extents[ 0 ];
435 
436         KLogLevel tmp_lvl = KLogLevelGet();
437         KLogLevelSet( klogInfo );
438         PLOGMSG( klogInfo, ( klogInfo,
439                  "loading consensus-table ( $(bases) bases / $(spots) spots ):",
440                  "bases=%lu,spots=%lu", total_bases, total_spots ));
441         KLogLevelSet( tmp_lvl );
442 
443         if ( !check_Consensus_totalcount( &ConsensusTab, total_bases ) )
444             rc = RC( rcExe, rcNoTarg, rcAllocating, rcParam, rcInvalid );
445         else
446             rc = zmw_for_each( &ConsensusTab.zmw, &sctx->lctx->xml_progress, sctx->cursor,
447                                sctx->lctx->with_progress, sctx->col_idx, NULL,
448                                true, consensus_load_spot, &ConsensusTab );
449         close_BaseCalls_cmn( &ConsensusTab );
450     }
451     return rc;
452 }
453 
454 
finish_consensus(con_ctx * sctx)455 rc_t finish_consensus( con_ctx * sctx )
456 {
457     VCursorRelease( sctx->cursor );
458     return 0;
459 }
460 
461 
load_consensus(VDatabase * database,KDirectory * hdf5_src,ld_context * lctx)462 rc_t load_consensus( VDatabase * database, KDirectory * hdf5_src, ld_context *lctx )
463 {
464     rc_t rc = 0;
465     if ( lctx->check_src_obj )
466         rc = check_src_objects( hdf5_src, consensus_groups_to_check,
467                                 consensus_tables_to_check, false );
468     if ( rc == 0 )
469         rc = load_table( database, hdf5_src, lctx, consensus_schema_template,
470                          consensus_table_to_create, consensus_loader );
471     return rc;
472 }
473