1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 #include <klib/rc.h>
27 #include <klib/log.h>
28 #include <klib/debug.h>
29 #include <klib/container.h>
30 #include <klib/trie.h>
31 #include <sra/types.h>
32 #include <os-native.h>
33
34 #include "experiment-xml.h"
35 #include "debug.h"
36
37 #include <stdlib.h>
38 #include <stdio.h>
39 #include <string.h>
40 #include <ctype.h>
41 #include <assert.h>
42
43 #define MAX_POOL_MEMBER_BASECALLS 64
44 #define MAX_SPOT_DESCRIPTOR_READS (SRALOAD_MAX_READS + 2)
45
46 typedef struct PoolReadSpec_struct {
47 int16_t start;
48 int16_t length;
49 /* chosen basecalls for EXPECTED_BASECALL_TABLE for defined member_name, NULL-terminated array */
50 const ReadSpecXML_read_BASECALL* item[MAX_POOL_MEMBER_BASECALLS];
51 int32_t score;
52 } PoolReadSpec;
53
54 typedef struct PoolMember_struct {
55 BSTNode node;
56 const char *name;
57 PoolReadSpec spec[MAX_SPOT_DESCRIPTOR_READS];
58 } PoolMember;
59
60 struct ExperimentXML {
61 uint32_t spot_length;
62 const PlatformXML* platform;
63 ProcessingXML processing;
64 const ReadSpecXML* reads;
65 BSTree* member_pool;
66 PoolMember* member_null;
67 SLList member_stats;
68 };
69
70 static
PoolMember_Whack(BSTNode * node,void * data)71 void CC PoolMember_Whack(BSTNode* node, void* data)
72 {
73 if( node != NULL ) {
74 const PoolMember* n = (const PoolMember*)node;
75 free((void*)n->name);
76 free(node);
77 }
78 }
79
80 static
PoolMember_StrCmp(const char * s1,const char * s2)81 int64_t CC PoolMember_StrCmp(const char* s1, const char* s2)
82 {
83 if( s1 == NULL && s2 == NULL ) {
84 return 0;
85 } else if( s1 == NULL ) {
86 return -32767;
87 } else if( s2 == NULL ) {
88 return 32768;
89 }
90 return strcmp(s1, s2);
91 }
92
93 static
PoolMember_Cmp(const BSTNode * item,const BSTNode * node)94 int64_t CC PoolMember_Cmp(const BSTNode* item, const BSTNode* node)
95 {
96 const PoolMember* i = (const PoolMember*)item;
97 const PoolMember* n = (const PoolMember*)node;
98
99 return PoolMember_StrCmp(i->name, n->name);
100 }
101
102 static
PoolMember_FindByName(const void * item,const BSTNode * node)103 int64_t CC PoolMember_FindByName(const void* item, const BSTNode* node)
104 {
105 const char* name = (const char*)item;
106 const PoolMember* n = (const PoolMember*)node;
107
108 return PoolMember_StrCmp(name, n->name);
109 }
110
111 #if _DEBUGGING
112 static
PoolMember_Dump(BSTNode * node,void * data)113 void CC PoolMember_Dump( BSTNode *node, void *data )
114 {
115 const PoolMember* n = (const PoolMember*)node;
116 const ExperimentXML* self = (const ExperimentXML*)data;
117 int r;
118
119 DEBUG_MSG(9, ("Member '%s'\n", n->name));
120
121 for(r = 0; r <= self->reads->nreads + 1; r++) {
122 DEBUG_MSG(9, ("Read %d [%hd:%hd]", r + 1, n->spec[r].start, n->spec[r].length));
123 if( self->reads->spec[r].coord_type == rdsp_ExpectedBaseCall_ct ) {
124 DEBUG_MSG(9, (" ebc:'%s'", n->spec[r].item[0]->basecall));
125 } else if( self->reads->spec[r].coord_type == rdsp_ExpectedBaseCallTable_ct ) {
126 int i = -1;
127 while( n->spec[r].item[++i] != NULL ) {
128 DEBUG_MSG(9, (" bc:'%s', tag:'%s';", n->spec[r].item[i]->basecall, n->spec[r].item[i]->read_group_tag));
129 }
130 }
131 DEBUG_MSG(9, ("\n"));
132 }
133 }
134 #endif
135
136 static
PoolMember_Add(ExperimentXML * self,const char * name)137 rc_t PoolMember_Add(ExperimentXML* self, const char* name)
138 {
139 rc_t rc = 0;
140 PoolMember* member = NULL;
141
142 if( self == NULL) {
143 return RC(rcSRA, rcFormatter, rcAllocating, rcSelf, rcNull);
144 }
145 member = calloc(1, sizeof(*member));
146 if( member == NULL ) {
147 return RC(rcSRA, rcFormatter, rcAllocating, rcMemory, rcExhausted);
148 }
149 if( name == NULL ) {
150 if( self->member_null == NULL ) {
151 self->member_null = member;
152 } else {
153 rc = RC(rcSRA, rcFormatter, rcAllocating, rcId, rcDuplicate);
154 }
155 } else {
156 member->name = strdup(name);
157 if( member->name == NULL ) {
158 rc = RC(rcSRA, rcFormatter, rcAllocating, rcMemory, rcExhausted);
159 } else {
160 if( self->member_pool == NULL ) {
161 self->member_pool = calloc(1, sizeof(BSTree));
162 if( self->member_pool == NULL ) {
163 rc = RC(rcSRA, rcFormatter, rcAllocating, rcMemory, rcExhausted);
164 } else {
165 BSTreeInit(self->member_pool);
166 }
167 }
168 if( rc == 0 ) {
169 rc = BSTreeInsertUnique(self->member_pool, &member->node, NULL, PoolMember_Cmp);
170 }
171 }
172 }
173 if( rc == 0 ) {
174 uint32_t i;
175 for(i = 0; i <= self->reads->nreads + 1; i++) {
176 switch(self->reads->spec[i].coord_type) {
177 case rdsp_FIXED_BRACKET_ct:
178 member->spec[i].start = self->reads->spec[i].coord.start_coord - 1;
179 member->spec[i].length = (i == 0 ? 0 : -1);
180 break;
181 case rdsp_RelativeOrder_ct:
182 member->spec[i].start = -1;
183 member->spec[i].length = -1;
184 break;
185
186 case rdsp_BaseCoord_ct:
187 case rdsp_CycleCoord_ct:
188 member->spec[i].start = self->reads->spec[i].coord.start_coord - 1;
189 member->spec[i].length = -1;
190 break;
191
192 case rdsp_ExpectedBaseCall_ct:
193 if( name == NULL || name[0] == '\0' ) {
194 member->spec[i].start = self->reads->spec[i].coord.expected_basecalls.base_coord;
195 member->spec[i].length = self->reads->spec[i].coord.expected_basecalls.default_length;
196 }
197 if( member->spec[i].start == 0 ) {
198 member->spec[i].start = -1;
199 }
200 member->spec[i].item[0] = self->reads->spec[i].coord.expected_basecalls.table;
201 if( member->spec[i].length == 0 ) {
202 member->spec[i].length = strlen(member->spec[i].item[0]->basecall);
203 }
204 break;
205
206 case rdsp_ExpectedBaseCallTable_ct:
207 if( name == NULL || name[0] == '\0' ) {
208 member->spec[i].start = self->reads->spec[i].coord.expected_basecalls.base_coord;
209 member->spec[i].length = self->reads->spec[i].coord.expected_basecalls.default_length;
210 }
211 if( member->spec[i].start == 0 ) {
212 member->spec[i].start = -1;
213 }
214 if( member->spec[i].length == 0 ) {
215 member->spec[i].length = (!self->reads->spec[i].coord.expected_basecalls.pooled || name == NULL) ? -1 : 0;
216 }
217 break;
218 }
219 }
220 } else {
221 PoolMember_Whack(&member->node, NULL);
222 }
223 return rc;
224 }
225
226 static
PoolMember_AddBasecalls(const ExperimentXML * self,const char * member,int readid,const ReadSpecXML_read_BASECALL * basecalls[])227 rc_t PoolMember_AddBasecalls(const ExperimentXML* self, const char* member, int readid, const ReadSpecXML_read_BASECALL* basecalls[])
228 {
229 rc_t rc = 0;
230
231 if( self == NULL || basecalls == NULL ) {
232 rc = RC(rcSRA, rcFormatter, rcParsing, rcParam, rcNull);
233 } else if( self->reads->reads[readid].coord_type != rdsp_ExpectedBaseCallTable_ct ) {
234 rc = RC(rcSRA, rcFormatter, rcParsing, rcId, rcIncorrect);
235 } else {
236 PoolMember* pm = (PoolMember*)BSTreeFind(self->member_pool, member, PoolMember_FindByName);
237 if( pm == NULL ) {
238 rc = RC(rcSRA, rcFormatter, rcResolving, rcData, rcNotFound);
239 PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)'", PLOG_S(m), member));
240 } else {
241 int k = -1, i = -1;
242 while( pm->spec[readid + 1].item[++k] != NULL );
243 while( rc == 0 && basecalls[++i] != NULL ) {
244 if( k >= MAX_POOL_MEMBER_BASECALLS ) {
245 rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcTooLong);
246 PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)' basecalls", PLOG_S(m), member));
247 } else {
248 pm->spec[readid + 1].item[k++] = basecalls[i];
249 }
250 }
251 }
252 }
253 return rc;
254 }
255
parse_POOL(const KXMLNode * EXPERIMENT,ExperimentXML * obj)256 rc_t parse_POOL(const KXMLNode* EXPERIMENT, ExperimentXML* obj)
257 {
258 rc_t rc = 0;
259
260 /* create unspecified member -> NULL */
261 if( (rc = PoolMember_Add(obj, NULL)) == 0 ) {
262 const KXMLNodeset* MEMBERS;
263 if( (rc = KXMLNodeOpenNodesetRead(EXPERIMENT, &MEMBERS, "DESIGN/SAMPLE_DESCRIPTOR/POOL/MEMBER")) == 0 ) {
264 uint32_t i, num_members = 0;
265 if( (rc = KXMLNodesetCount(MEMBERS, &num_members)) == 0 && num_members > 0 ) {
266 /* create default member -> '' */
267 rc = PoolMember_Add(obj, "");
268 for(i = 0; rc == 0 && i < num_members; i++) {
269 const KXMLNode* MEMBER;
270 if( (rc = KXMLNodesetGetNodeRead(MEMBERS, &MEMBER, i)) == 0 ) {
271 char* member_name = NULL;
272 if( (rc = KXMLNodeReadAttrCStr(MEMBER, "member_name", &member_name, NULL)) == 0 ) {
273 const KXMLNodeset* LABELS;
274 if( strcasecmp(member_name, "default") == 0 ) {
275 member_name[0] = '\0';
276 }
277 if( member_name[0] != '\0' ) {
278 if( (rc = PoolMember_Add(obj, member_name)) != 0 ) {
279 PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)'", PLOG_S(m), member_name));
280 }
281 }
282 if( rc == 0 && (rc = KXMLNodeOpenNodesetRead(MEMBER, &LABELS, "READ_LABEL")) == 0 ) {
283 uint32_t l, num_labels = 0;
284 KXMLNodesetCount(LABELS, &num_labels);
285 if( num_labels > 0 ) {
286 const ReadSpecXML_read_BASECALL* basecalls[MAX_POOL_MEMBER_BASECALLS];
287 for(l = 0; rc == 0 && l < num_labels; l++) {
288 const KXMLNode* LABEL;
289 if( (rc = KXMLNodesetGetNodeRead(LABELS, &LABEL, l)) == 0 ) {
290 char* label = NULL, *tag = NULL;
291 if( (rc = KXMLNodeReadCStr(LABEL, &label, NULL)) != 0 || label == NULL || label[0] == '\0' ) {
292 rc = rc ? rc : RC(rcSRA, rcFormatter, rcConstructing, rcData, rcEmpty);
293 PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)' READ_LABEL value", PLOG_S(m), member_name));
294 } else {
295 if( (rc = KXMLNodeReadAttrCStr(LABEL, "read_group_tag", &tag, NULL)) != 0 ) {
296 if( GetRCState(rc) != rcNotFound ) {
297 PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)' READ_LABEL '$(l)' 'read_group_tag' attribute",
298 PLOG_2(PLOG_S(m),PLOG_S(l)), member_name, label));
299 } else {
300 rc = 0;
301 }
302 } else if( tag != NULL && tag[0] == '\0' ) {
303 rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcEmpty);
304 PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)' READ_LABEL '$(l)' 'read_group_tag' attribute",
305 PLOG_2(PLOG_S(m),PLOG_S(l)), member_name, label));
306 }
307 if( rc == 0 ) {
308 /* find read_spec based on label and update basecall table */
309 uint32_t r, read_found = 0;
310 for(r = 0; rc == 0 && r < obj->reads->nreads; r++) {
311 if( obj->reads->reads[r].read_label != NULL && strcmp(label, obj->reads->reads[r].read_label) == 0 ) {
312 read_found++;
313 if( tag != NULL && tag[0] != '\0' ) {
314 if( obj->reads->reads[r].coord_type != rdsp_ExpectedBaseCallTable_ct ) {
315 rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcInconsistent);
316 PLOGERR(klogErr, (klogErr, rc,
317 "MEMBER '$(m)' READ_LABEL '$(l)' READ_SPEC must have EXPECTED_BASECALL_TABLE type",
318 PLOG_2(PLOG_S(m),PLOG_S(l)), member_name, label));
319 } else {
320 uint32_t b, bcall_found = 0;
321 memset(basecalls, 0, sizeof(basecalls));
322 /* find tag in table */
323 for(b = 0; rc == 0 && b < obj->reads->reads[r].coord.expected_basecalls.count; b++) {
324 if( obj->reads->reads[r].coord.expected_basecalls.table[b].read_group_tag != NULL &&
325 strcmp(tag, obj->reads->reads[r].coord.expected_basecalls.table[b].read_group_tag) == 0 ) {
326 if( bcall_found >= MAX_POOL_MEMBER_BASECALLS ) {
327 rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcExcessive);
328 PLOGERR(klogErr, (klogErr, rc,
329 "MEMBER '$(m)' READ_LABEL '$(l)' read_group_tag='$(t)' too many basecalls",
330 PLOG_3(PLOG_S(m),PLOG_S(l),PLOG_S(t)), member_name, label, tag));
331 }
332 basecalls[bcall_found++] = &obj->reads->reads[r].coord.expected_basecalls.table[b];
333 }
334 }
335 if( rc == 0 && bcall_found == 0 ) {
336 rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcNotFound);
337 PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)' READ_LABEL '$(l)' read_group_tag='$(t)' in READ_SPEC",
338 PLOG_3(PLOG_S(m),PLOG_S(l),PLOG_S(t)), member_name, label, tag));
339 } else {
340 rc = PoolMember_AddBasecalls(obj, member_name, r, basecalls);
341 }
342 }
343 }
344 }
345 }
346 if( read_found == 0 ) {
347 rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcNotFound);
348 PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)' READ_LABEL '$(l)' in READ_SPEC", PLOG_2(PLOG_S(l),PLOG_S(m)), label, member_name));
349 } else if( read_found > 1 ) {
350 rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcInconsistent);
351 PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)' READ_LABEL '$(l)' in READ_SPEC", PLOG_2(PLOG_S(l),PLOG_S(m)), label, member_name));
352 }
353 }
354 }
355 free(tag);
356 free(label);
357 KXMLNodeRelease(LABEL);
358 } else {
359 PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)' READ_LABEL #$(i)", PLOG_2(PLOG_S(m),PLOG_U32(i)), member_name, l));
360 }
361 }
362 } else {
363 rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcInvalid);
364 PLOGERR(klogErr, (klogErr, rc, "missing READ_LABEL element(s) in MEMBER '$(m)'", PLOG_S(m), member_name));
365 }
366 KXMLNodesetRelease(LABELS);
367 }
368 free(member_name);
369 } else {
370 PLOGERR(klogErr, (klogErr, rc, "'member_name' in MEMBER node #$(i)", PLOG_U32(i), i));
371 }
372 KXMLNodeRelease(MEMBER);
373 } else {
374 PLOGERR(klogErr, (klogErr, rc, "MEMBER node #$(i)", PLOG_U32(i), i));
375 }
376 }
377 } else if( rc != 0 ) {
378 LOGERR(klogErr, rc, "POOL/MEMBER(s)");
379 }
380 KXMLNodesetRelease(MEMBERS);
381 } else {
382 LOGERR(klogErr, rc, "POOL/MEMBER(s)");
383 }
384 }
385 #if _DEBUGGING
386 PoolMember_Dump(&obj->member_null->node, obj);
387 BSTreeForEach(obj->member_pool, false, PoolMember_Dump, obj);
388 #endif
389 return rc;
390 }
391
392 typedef struct ExpectedTableMatch_struct {
393 /* in */
394 const char* seq;
395 uint32_t len;
396 const char* tag;
397 enum {
398 etm_Forward = 1,
399 etm_Reverse,
400 etm_Both
401 } direction;
402 bool left_adjusted;
403 /* out */
404 AgrepMatch match;
405 ReadSpecXML_read_BASECALL* bc;
406 } ExpectedTableMatch;
407
408 static
Experiment_ExpectedTableMatch(const ReadSpecXML_read_BASECALL_TABLE * table,ExpectedTableMatch * data)409 bool Experiment_ExpectedTableMatch(const ReadSpecXML_read_BASECALL_TABLE* table, ExpectedTableMatch* data)
410 {
411 uint32_t i;
412
413 assert(table && data && data->seq );
414
415 data->match.score = -1;
416 for(i = 0; i < table->count; i++) {
417 AgrepMatch match;
418 ReadSpecXML_read_BASECALL* bc = &table->table[i];
419
420 if( data->tag != NULL && strcmp(data->tag, bc->read_group_tag) != 0 ) {
421 continue;
422 }
423 if( data->direction != etm_Both ) {
424 if( bc->match_edge == match_edge_End ) {
425 if( data->direction == etm_Forward ) {
426 continue;
427 }
428 } else if( data->direction != etm_Forward ) {
429 continue;
430 }
431 }
432 match.score = -1;
433 if( data->left_adjusted ) {
434 if( AgrepFindFirst(bc->agrep, bc->max_mismatch, data->seq, data->len, &match) ) {
435 if( data->direction == etm_Reverse ) {
436 int32_t right_tail = data->len - match.position - strlen(bc->basecall);
437 if( right_tail > bc->max_mismatch - match.score ) {
438 match.score = -1;
439 } else if( match.position > 0 ) {
440 match.score += right_tail;
441 match.length += right_tail;
442 match.position = 0;
443 }
444 } else {
445 if( match.position > bc->max_mismatch - match.score ) {
446 match.score = -1;
447 } else if( match.position > 0 ) {
448 match.score += match.position;
449 match.length += match.position;
450 match.position = 0;
451 }
452 }
453 }
454 } else {
455 AgrepFindBest(bc->agrep, bc->max_mismatch, data->seq, data->len, &match);
456 }
457 if( !(match.score < 0) ) {
458 DEBUG_MSG(3, ("agrep match bc %s, min: %u, max: %u, edge: %s%s, [%d:%d] score %d\n", bc->basecall, bc->min_match, bc->max_mismatch,
459 bc->match_edge == match_edge_Full ? "Full" : (bc->match_edge == match_edge_Start ? "Start" : "End"),
460 data->left_adjusted ? " left adjusted" : "",
461 match.position, match.length, match.score));
462 if( (data->match.score < 0 || data->match.score > match.score) ) {
463 memmove(&data->match, &match, sizeof(data->match));
464 data->bc = bc;
465 }
466 }
467 }
468 return !(data->match.score < 0);
469 }
470
471 static
Experiment_FillRead(const ReadSpecXML_read * spec,PoolReadSpec * pool,const char ** tag,const uint32_t nreads,int16_t start,const char * bases,uint32_t nbases)472 rc_t Experiment_FillRead(const ReadSpecXML_read* spec, PoolReadSpec* pool,
473 const char** tag, const uint32_t nreads, int16_t start, const char* bases, uint32_t nbases)
474 {
475 rc_t rc = 0;
476
477 switch(spec->coord_type) {
478 case rdsp_RelativeOrder_ct:
479 if( pool[-1].length >= 0 ) {
480 if( pool[-1].start >= 0 ) {
481 pool->start = pool[-1].start + pool[-1].length;
482 } else {
483 pool->start = start;
484 }
485 } else {
486 pool->start = -1;
487 }
488 pool->length = -1;
489 if( nreads > 1 ) {
490 if( (rc = Experiment_FillRead(spec + 1, pool + 1, tag + 1, nreads - 1, start, bases, nbases)) != 0 ) {
491 break;
492 }
493 }
494 if( pool->start < 0 ) {
495 pool->start = pool[1].start;
496 }
497 pool->length = pool[1].start - pool->start;
498 break;
499
500 case rdsp_BaseCoord_ct:
501 case rdsp_CycleCoord_ct:
502 pool->start = spec->coord.start_coord - 1;
503 pool->length = -1;
504 if( nreads > 1 ) {
505 if( (rc = Experiment_FillRead(spec + 1, pool + 1, tag + 1, nreads - 1, pool->start, bases, nbases)) != 0 ) {
506 break;
507 }
508 }
509 pool->length = pool[1].start - pool->start;
510 break;
511
512 case rdsp_ExpectedBaseCall_ct:
513 case rdsp_ExpectedBaseCallTable_ct:
514 {{
515 int32_t gap;
516 if( spec->coord.expected_basecalls.base_coord > 0 ) {
517 pool->start = spec->coord.expected_basecalls.base_coord - 1;
518 start = pool->start;
519 } else {
520 pool->start = -1;
521 }
522 pool->length = -1;
523 pool->item[0] = NULL;
524 pool->score = -1;
525 if( spec->coord.expected_basecalls.match_start > 0 ) {
526 uint32_t i, stop = nbases;
527 ExpectedTableMatch data;
528 /* find terminator, if any */
529 for(i = 1; i < nreads; i++) {
530 if( spec[i].coord_type == rdsp_BaseCoord_ct ||
531 spec[i].coord_type == rdsp_CycleCoord_ct ||
532 spec[i].coord_type == rdsp_FIXED_BRACKET_ct ) {
533 stop = spec[i].coord.start_coord - 1;
534 break;
535 }
536 }
537 data.seq = &bases[start];
538 data.len = stop - start;
539 data.tag = *tag;
540 data.direction = etm_Forward;
541 data.left_adjusted = !(pool->start < 0) || !(pool[-1].length < 0);
542 DEBUG_MSG(3, ("agrep find read %u fwd in: '%.*s' = %u\n", nreads, data.len, data.seq, data.len));
543 if( Experiment_ExpectedTableMatch(&spec->coord.expected_basecalls, &data) ) {
544 DEBUG_MSG(3, ("agrep found from %hd match %d:%d - %d '%s' in '%.*s...' \n",
545 start, data.match.position, data.match.length,
546 data.match.score, data.bc->basecall,
547 start + data.match.position + data.match.length * 3 + data.match.score, &bases[start]));
548 pool->start = start + data.match.position;
549 pool->length = data.match.length;
550 pool->item[0] = data.bc;
551 pool->score = data.match.score;
552 start = pool->start + pool->length;
553 } else {
554 DEBUG_MSG(3, ("agrep nothing found\n"));
555 }
556 }
557 if( pool[-1].length >= 0 && pool->length < 0 && spec->coord.expected_basecalls.match_end == 0 ) {
558 pool->length = spec->coord.expected_basecalls.default_length;
559 start += pool->length;
560 }
561 if( nreads > 1 ) {
562 if( (rc = Experiment_FillRead(spec + 1, pool + 1, tag + 1, nreads - 1, start, bases, nbases)) != 0 ) {
563 break;
564 }
565 }
566 if( spec->coord.expected_basecalls.match_end > 0 ) {
567 /* match_end can have a better match even if match_start found smth before
568 normally table has start or end match edge, not both */
569 char r[4096];
570 uint32_t i;
571 ExpectedTableMatch data;
572
573 data.len = pool[1].start - start;
574 DEBUG_MSG(3, ("agrep find read %u rev in: '%.*s' = %u\n", nreads, data.len, &bases[start], data.len));
575 if( data.len > sizeof(r) ) {
576 rc = RC(rcSRA, rcFormatter, rcResolving, rcBuffer, rcInsufficient);
577 break;
578 }
579 for(i = 0; i < data.len; i++) {
580 /* reverse a portion of seq */
581 r[i] = bases[pool[1].start - i - 1];
582 }
583 data.seq = r;
584 data.tag = *tag;
585 data.direction = etm_Reverse;
586 data.left_adjusted = spec[1].coord_type != rdsp_RelativeOrder_ct && pool[1].start > 0;
587 DEBUG_MSG(3, ("agrep find read %u rev in: '%.*s' = %u\n", nreads, data.len, data.seq, data.len));
588 if( Experiment_ExpectedTableMatch(&spec->coord.expected_basecalls, &data) ) {
589 DEBUG_MSG(3, ("agrep found rev from %hd match %d:%d - %d '%s' in '%.*s' \n",
590 start, data.match.position, data.match.length, data.match.score,
591 data.bc->basecall, data.len, &bases[pool[1].start - data.len]));
592 if( pool->length < 0 || data.match.score < pool->score ) {
593 pool->start = pool[1].start - data.match.position - data.match.length;
594 pool->length = data.match.length;
595 pool->item[0] = data.bc;
596 pool->score = data.match.score;
597 }
598 } else {
599 DEBUG_MSG(3, ("agrep nothing found rev\n"));
600 }
601 }
602 if( pool->length < 0 ) {
603 pool->length = spec->coord.expected_basecalls.default_length;
604 }
605 if( pool->start < 0 ) {
606 pool->start = pool[1].start - pool->length;
607 }
608 gap = pool[1].start - (pool->start + pool->length);
609 if( gap > 0 ) {
610 /* right gap */
611 if( spec[1].coord_type == rdsp_RelativeOrder_ct ) {
612 /* adjust right read's start to the tail of current read */
613 pool[1].start -= gap;
614 pool[1].length += gap;
615 gap = 0;
616 } else if( spec[1].coord_type >= rdsp_ExpectedBaseCall_ct &&
617 spec[1].coord.expected_basecalls.base_coord <= 0 && pool[1].item[0] != NULL ) {
618 /* move left read's start to left as much as possible */
619 int32_t left = pool[1].item[0]->max_mismatch - pool[1].score;
620 if( left > 0 ) {
621 if( left > gap ) {
622 left = gap;
623 }
624 pool[1].length += left;
625 pool[1].score += left;
626 pool[1].start -= left;
627 gap -= left;
628 }
629 }
630 if( gap > 0 && pool->item[0] != NULL ) {
631 /* add gap to end of self */
632 int32_t left = pool->item[0]->max_mismatch - pool->score;
633 if( left > 0 ) {
634 if( left > gap ) {
635 left = gap;
636 }
637 pool->length += left;
638 pool->score += left;
639 }
640 }
641 }
642 break;
643 }}
644 default:
645 rc = RC(rcSRA, rcFormatter, rcResolving, rcItem, rcUnexpected);
646 break;
647 }
648 return rc;
649 }
650
651 typedef struct PoolMember_FindReverse_Data_struct {
652 const char* tags[MAX_SPOT_DESCRIPTOR_READS];
653 uint8_t nreads;
654 const PoolMember* member;
655 } PoolMember_FindReverse_Data;
656
657 static
PoolMember_FindReverse(BSTNode * n,void * d)658 bool CC PoolMember_FindReverse(BSTNode* n, void* d)
659 {
660 const PoolMember* node = (const PoolMember*)n;
661 PoolMember_FindReverse_Data* data = (PoolMember_FindReverse_Data*)d;
662 int r, j, found;
663
664 for(r = 1; r <= data->nreads; r++) {
665 /* comparing pointers here because tags contains same pointers */
666 if( node->spec[r].item[0] == NULL && data->tags[r] == NULL ) {
667 continue;
668 }
669 found = 0;
670 for(j = 0; node->spec[r].item[j] != NULL; j++ ) {
671 if( node->spec[r].item[j]->read_group_tag == data->tags[r] ) {
672 found = 1;
673 break;
674 }
675 }
676 if( found == 0 ) {
677 return false;
678 }
679 }
680 data->member = node;
681 return true;
682 }
683
684 static
Experiment_FillSegAgrep(const ExperimentXML * self,PoolReadSpec * pool,const PoolMember * src,uint32_t nbases,const char * bases,const char ** const new_member_name)685 rc_t Experiment_FillSegAgrep(const ExperimentXML* self, PoolReadSpec* pool, const PoolMember* src,
686 uint32_t nbases, const char* bases,
687 const char** const new_member_name)
688 {
689 rc_t rc = 0;
690 uint32_t i;
691 PoolMember_FindReverse_Data data;
692
693 /* close left and right brackets tech reads as spot terminators */
694 pool[0].start = 0;
695 pool[0].length = 0;
696 pool[self->reads->nreads + 1].start = nbases;
697 pool[self->reads->nreads + 1].length = 0;
698
699 /* setup known tags if it is predefined member search */
700 for(i = 0; i <= self->reads->nreads + 1; i++) {
701 data.tags[i] = src->spec[i].item[0] ? src->spec[i].item[0]->read_group_tag : NULL;
702 }
703 /* start from read, not bracket read */
704 rc = Experiment_FillRead(self->reads->reads, &pool[1], &data.tags[1], self->reads->nreads, 0, bases, nbases);
705
706 if( rc != 0 ) {
707 DEBUG_MSG(3, ("ERROR: %R\n", rc));
708 }
709 DEBUG_MSG(3, ("%s: bases %u: '%.*s'\n", new_member_name ? *new_member_name : NULL, nbases, nbases, bases));
710 for(i = 1; i <= self->reads->nreads; i++) {
711 const char* b = NULL;
712 if( pool[i].start >= 0 ) {
713 b = &bases[pool[i].start];
714 }
715 DEBUG_MSG(3, ("Read %u. [%hd:%hd] - '%.*s'",
716 i, pool[i].start, pool[i].length, pool[i].length < 0 ? 0 : pool[i].length, b));
717 if( self->reads->spec[i].coord_type == rdsp_FIXED_BRACKET_ct ) {
718 DEBUG_MSG(3, (" terminator???"));
719 }
720 if( self->reads->spec[i].coord_type >= rdsp_ExpectedBaseCall_ct ) {
721 if( pool[i].item[0] ) {
722 if( pool[i].item[0]->match_edge == match_edge_End ) {
723 DEBUG_MSG(3, (" in reverse"));
724 }
725 DEBUG_MSG(3, (" matches %s", pool[i].item[0]->basecall));
726 if( pool[i].item[0]->read_group_tag ) {
727 DEBUG_MSG(3, (" tagged %s", pool[i].item[0]->read_group_tag));
728 }
729 DEBUG_MSG(3, (" with %d mismatches", pool[i].score));
730 } else {
731 DEBUG_MSG(3, (" no match"));
732 }
733 }
734 DEBUG_MSG(3, ("\n"));
735 }
736
737 if( rc == 0 && new_member_name != NULL ) {
738 data.nreads = self->reads->nreads;
739 data.member = NULL;
740 for(i = 1; i <= self->reads->nreads; i++) {
741 if( self->reads->spec[i].coord_type == rdsp_ExpectedBaseCallTable_ct && pool[i].item[0] != NULL ) {
742 data.tags[i] = pool[i].item[0]->read_group_tag;
743 } else {
744 data.tags[i] = NULL;
745 }
746 }
747 if( BSTreeDoUntil(self->member_pool, false, PoolMember_FindReverse, &data) ) {
748 *new_member_name = data.member->name;
749 DEBUG_MSG(2, ("Assigned member_name '%s'\n", *new_member_name));
750 } else {
751 #if _DEBUGGING
752 DEBUG_MSG(2, ("member_name reverse lookup failed for tags:"));
753 for(i = 1; i <= self->reads->nreads; i++) {
754 DEBUG_MSG(2, (" '%s'", data.tags[i]));
755 }
756 DEBUG_MSG(2, ("\n"));
757 #endif
758 }
759 }
760 return rc;
761 }
762
Experiment_MemberSeg(const ExperimentXML * self,const char * const file_member_name,const char * const data_block_member_name,uint32_t nbases,const char * bases,SRASegment * seg,const char ** const new_member_name)763 rc_t Experiment_MemberSeg(const ExperimentXML* self,
764 const char* const file_member_name, const char* const data_block_member_name,
765 uint32_t nbases, const char* bases,
766 SRASegment* seg, const char** const new_member_name)
767 {
768 rc_t rc = 0;
769 uint32_t i;
770 PoolReadSpec pool[MAX_SPOT_DESCRIPTOR_READS];
771
772 if( self == NULL || bases == NULL || seg == NULL || new_member_name == NULL ) {
773 rc = RC(rcSRA, rcFormatter, rcResolving, rcParam, rcNull);
774 } else if( nbases < 1 ) {
775 rc = RC(rcSRA, rcFormatter, rcResolving, rcData, rcEmpty);
776 }
777
778 if( rc == 0 ) {
779 const char* mm = NULL;
780 switch(self->processing.barcode_rule) {
781 case eBarcodeRule_not_set:
782 mm = data_block_member_name ? data_block_member_name : file_member_name;
783 break;
784
785 case eBarcodeRule_use_file_spot_name:
786 mm = file_member_name;
787 break;
788
789 case eBarcodeRule_use_table_in_experiment:
790 case eBarcodeRule_ignore_barcode:
791 mm = NULL;
792 break;
793 }
794 if( self->member_pool == NULL || self->processing.barcode_rule == eBarcodeRule_ignore_barcode ) {
795 if( (rc = Experiment_FillSegAgrep(self, pool, self->member_null, nbases, bases, NULL)) == 0 ) {
796 *new_member_name = mm;
797 }
798 } else if( mm == NULL ) {
799 *new_member_name = NULL;
800 rc = Experiment_FillSegAgrep(self, pool, self->member_null, nbases, bases, new_member_name);
801 } else {
802 PoolMember* pm = (PoolMember*)BSTreeFind(self->member_pool, mm, PoolMember_FindByName);
803 if( pm == NULL ) {
804 rc = RC(rcSRA, rcFormatter, rcResolving, rcData, rcNotFound);
805 PLOGERR(klogErr, (klogErr, rc, "specified member_name '$(m)'", PLOG_S(m), mm));
806 } else {
807 if( (rc = Experiment_FillSegAgrep(self, pool, pm, nbases, bases, NULL)) == 0 ) {
808 *new_member_name = mm;
809 }
810 }
811 }
812 }
813
814 if( rc == 0 ) {
815 int32_t new_bases = 0;
816
817 for(i = 1; i <= self->reads->nreads; i++) {
818 if( pool[i].start < 0 || pool[i].length < 0 ) {
819 rc = RC(rcSRA, rcFormatter, rcResolving, rcData, rcIncomplete);
820 PLOGERR(klogErr, (klogErr, rc, "Read #$(i) boundaries", PLOG_U32(i), i));
821 } else if( i == 1 ) {
822 if( pool[i].start != 0 ) {
823 rc = RC(rcSRA, rcFormatter, rcResolving, rcData, rcInvalid);
824 PLOGERR(klogErr, (klogErr, rc, "Read #$(i) do not start at 0", PLOG_U32(i), i));
825 }
826 } else {
827 int16_t delta = pool[i].start - pool[i - 1].length - pool[i - 1].start;
828 /* try to close gap by failng back to default length IF:
829 prev read was EXPECTED_BASECALL OR
830 EXPECTED_BASECALL_TABLE and there an BASECALL chosen
831 AND
832 defailt_length != 0 (was set)
833 */
834 if( delta != 0 &&
835 (self->reads->spec[i - 1].coord_type == rdsp_ExpectedBaseCall_ct ||
836 (self->reads->spec[i - 1].coord_type == rdsp_ExpectedBaseCallTable_ct && pool[i - 1].item[0] != NULL)) &&
837 self->reads->spec[i - 1].coord.expected_basecalls.default_length > 0 ) {
838 new_bases -= pool[i - 1].length;
839 pool[i - 1].length = self->reads->spec[i - 1].coord.expected_basecalls.default_length;
840 DEBUG_MSG(2, ("trying to close delta %hi for read %u: length reset default %hi\n", delta, i - 1, pool[i - 1].length));
841 if( pool[i - 1].item[0] != NULL && pool[i - 1].item[0]->read_group_tag != NULL ) {
842 DEBUG_MSG(2, ("member_name '%s' reset to ''\n", *new_member_name));
843 *new_member_name = NULL;
844 }
845 delta = pool[i].start - pool[i - 1].length - pool[i - 1].start;
846 new_bases += pool[i - 1].length;
847 seg[i - 2].len = pool[i - 1].length;
848 }
849 if( delta > 0 ) {
850 rc = RC(rcSRA, rcFormatter, rcResolving, rcData, rcInconsistent);
851 PLOGERR(klogErr, (klogErr, rc, "Gap between reads #$(p) and #$(i)", PLOG_2(PLOG_U32(p),PLOG_U32(i)), i - 1, i));
852 } else if( delta < 0 ) {
853 rc = RC(rcSRA, rcFormatter, rcResolving, rcData, rcInconsistent);
854 PLOGERR(klogErr, (klogErr, rc, "Reads #$(p) and #$(i) overlap", PLOG_2(PLOG_U32(p),PLOG_U32(i)), i - 1, i));
855 }
856 }
857 seg[i - 1].start = pool[i].start;
858 seg[i - 1].len = pool[i].length;
859 new_bases += pool[i].length;
860 }
861 if( rc == 0 && (new_bases < 0 || nbases != new_bases) ) {
862 rc = RC(rcSRA, rcFormatter, rcResolving, rcData, rcInconsistent);
863 PLOGERR(klogErr, (klogErr, rc, "total read length $(c)", PLOG_U32(c), new_bases));
864 }
865 if( rc != 0 ) {
866 char err_buf[8192], *p = err_buf;
867 size_t max = sizeof(err_buf);
868
869 for(i = 1; i <= self->reads->nreads; i++) {
870 int x = snprintf(p, max, "[%hd:%d, len = %hd", pool[i].start, pool[i].start + pool[i].length, pool[i].length);
871 if( x > 0 && x < max ) {
872 max -= x;
873 p += x;
874 if( self->reads->spec[i].coord_type == rdsp_ExpectedBaseCallTable_ct ) {
875 if( pool[i].item[0] != NULL ) {
876 if( pool[i].item[0]->read_group_tag != NULL ) {
877 x = snprintf(p, max, ", matched tag '%s', mismatches %d]", pool[i].item[0]->read_group_tag, pool[i].score);
878 } else {
879 x = snprintf(p, max, ", mismatches %d]", pool[i].score);
880 }
881 } else {
882 x = snprintf(p, max, ", no match]");
883 }
884 } else {
885 x = snprintf(p, max, "]");
886 }
887 }
888 if( x <= 0 || x >= max ) {
889 max = strlen(err_buf);
890 err_buf[max - 4] = '\0';
891 strcat(err_buf, "... ");
892 break;
893 }
894 max -= x;
895 p += x;
896 }
897 PLOGMSG(klogErr, (klogErr, "segments: $(seg) spot length actual: $(actual), calculated $(calc)",
898 "seg=%s,actual=%u,calc=%d", err_buf, nbases, new_bases));
899 }
900 }
901 return rc;
902 }
903
Experiment_MemberSegSimple(const ExperimentXML * self,const char * const file_member_name,const char * const data_block_member_name,const char ** const new_member_name)904 rc_t Experiment_MemberSegSimple(const ExperimentXML* self,
905 const char* const file_member_name, const char* const data_block_member_name,
906 const char** const new_member_name)
907 {
908 rc_t rc = 0;
909 if( self == NULL || new_member_name == NULL ) {
910 rc = RC(rcSRA, rcFormatter, rcResolving, rcParam, rcNull);
911 } else {
912 switch(self->processing.barcode_rule) {
913 case eBarcodeRule_not_set:
914 *new_member_name = data_block_member_name ? data_block_member_name : file_member_name;
915 break;
916
917 case eBarcodeRule_use_file_spot_name:
918 *new_member_name = file_member_name;
919 break;
920
921 case eBarcodeRule_use_table_in_experiment:
922 case eBarcodeRule_ignore_barcode:
923 *new_member_name = NULL;
924 break;
925 }
926 }
927 return rc;
928 }
929
930 static
Experiment_Apply_RUN_ATTRIBUTES(const ExperimentXML * cself,RunAttributes * attr,uint32_t spot_length)931 rc_t Experiment_Apply_RUN_ATTRIBUTES(const ExperimentXML* cself, RunAttributes* attr, uint32_t spot_length)
932 {
933 rc_t rc = 0;
934
935 if( cself == NULL ) {
936 rc = RC(rcSRA, rcFormatter, rcEvaluating, rcSelf, rcNull);
937 } else if( attr != NULL ) {
938 ExperimentXML* obj = (ExperimentXML*)cself;
939
940 obj->processing.barcode_rule = attr->barcode_rule;
941 if( attr->quality_offset != '\0' ) {
942 obj->processing.quality_offset = attr->quality_offset;
943 }
944 if( attr->quality_type != eExperimentQualityType_Undefined ) {
945 obj->processing.quality_type = attr->quality_type;
946 }
947 obj->spot_length = attr->spot_length ? attr->spot_length : spot_length;
948 if( obj->spot_length == 0 &&
949 (obj->platform->id == SRA_PLATFORM_ILLUMINA || obj->platform->id == SRA_PLATFORM_ABSOLID) ) {
950 PLOGMSG(klogWarn, (klogWarn,
951 "SPOT_LENGTH is not found or specified as $(l)", PLOG_U32(l), obj->spot_length));
952 }
953 }
954 return rc;
955 }
956
Experiment_Make(const ExperimentXML ** self,const KXMLDoc * doc,RunAttributes * attr)957 rc_t Experiment_Make(const ExperimentXML** self, const KXMLDoc* doc, RunAttributes* attr)
958 {
959 rc_t rc = 0;
960 ExperimentXML* obj = NULL;
961 const KXMLNodeset* ns = NULL;
962 uint32_t p_spot_length = 0, r_spot_length = 0;
963
964 if( self == NULL || doc == NULL ) {
965 return RC(rcSRA, rcFormatter, rcConstructing, rcParam, rcNull);
966 }
967 obj = calloc(1, sizeof(*obj));
968 if( obj == NULL ) {
969 return RC(rcSRA, rcFormatter, rcConstructing, rcMemory, rcExhausted);
970 }
971 if( (rc = KXMLDocOpenNodesetRead(doc, &ns, "/EXPERIMENT_SET/EXPERIMENT | /EXPERIMENT")) == 0 ) {
972 uint32_t count = 0;
973 if( (rc = KXMLNodesetCount(ns, &count)) == 0 && count != 1 ) {
974 rc = RC(rcSRA, rcFormatter, rcConstructing, rcTag, count ? rcExcessive : rcNotFound);
975 LOGERR(klogErr, rc, "EXPERIMENT");
976 } else {
977 const KXMLNode* EXPERIMENT = NULL;
978 if( (rc = KXMLNodesetGetNodeRead(ns, &EXPERIMENT, 0)) == 0 ) {
979 if( (rc = PlatformXML_Make(&obj->platform, EXPERIMENT, &p_spot_length)) == 0 ) {
980 if( attr->platform != NULL ) {
981 if( obj->platform->id != attr->platform->id ) {
982 rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcInconsistent);
983 LOGERR(klogErr, rc, "EXPERIMENT and RUN platforms differ");
984 } else {
985 PlatformXML_Whack(obj->platform);
986 obj->platform = attr->platform;
987 /* take ownership */
988 attr->platform = NULL;
989 }
990 }
991 if( rc == 0 ) {
992 if( attr->reads == NULL ) {
993 if( (rc = ReadSpecXML_Make(&obj->reads, EXPERIMENT, "DESIGN/SPOT_DESCRIPTOR", &r_spot_length)) != 0 ) {
994 LOGERR(klogErr, rc, "EXPERIMENT/.../READ_SPEC");
995 }
996 } else {
997 obj->reads = attr->reads;
998 /* take ownership */
999 attr->reads = NULL;
1000 }
1001 if( rc == 0 && (rc = parse_POOL(EXPERIMENT, obj)) == 0 ) {
1002 rc = Experiment_Apply_RUN_ATTRIBUTES(obj, attr, r_spot_length ? r_spot_length : p_spot_length);
1003 }
1004 if( rc == 0 && p_spot_length != 0 && r_spot_length != 0 && p_spot_length != r_spot_length ) {
1005 PLOGMSG(klogWarn, (klogWarn,
1006 "in EXPERIMENT sequence length (cycle count) in PLATFORM $(p) differ"
1007 " from SPOT_LENGTH $(l) in SPOT_DECODE_SPEC",
1008 PLOG_2(PLOG_U32(p),PLOG_U32(l)), p_spot_length, r_spot_length));
1009 }
1010 }
1011 } else {
1012 LOGERR(klogErr, rc, "EXPERIMENT/PLATFORM");
1013 }
1014 KXMLNodeRelease(EXPERIMENT);
1015 }
1016 }
1017 KXMLNodesetRelease(ns);
1018 }
1019 if( rc != 0 ) {
1020 *self = NULL;
1021 Experiment_Whack(obj);
1022 } else {
1023 *self = obj;
1024 }
1025 return rc;
1026 }
1027
Experiment_GetPlatform(const ExperimentXML * cself,const PlatformXML ** platform)1028 rc_t Experiment_GetPlatform(const ExperimentXML* cself, const PlatformXML** platform)
1029 {
1030 rc_t rc = 0;
1031 if( cself == NULL || platform == NULL ) {
1032 rc = RC(rcSRA, rcFormatter, rcAccessing, rcParam, rcNull);
1033 } else {
1034 *platform = cself->platform;
1035 }
1036 return rc;
1037 }
1038
Experiment_GetProcessing(const ExperimentXML * cself,const ProcessingXML ** processing)1039 rc_t Experiment_GetProcessing(const ExperimentXML* cself, const ProcessingXML** processing)
1040 {
1041 rc_t rc = 0;
1042 if( cself == NULL || processing == NULL ) {
1043 rc = RC(rcSRA, rcFormatter, rcAccessing, rcParam, rcNull);
1044 } else {
1045 *processing = &cself->processing;
1046 }
1047 return rc;
1048 }
1049
Experiment_GetReadNumber(const ExperimentXML * cself,uint8_t * nreads)1050 rc_t Experiment_GetReadNumber(const ExperimentXML* cself, uint8_t* nreads)
1051 {
1052 rc_t rc = 0;
1053 if( cself == NULL || nreads == NULL ) {
1054 rc = RC(rcSRA, rcFormatter, rcAccessing, rcParam, rcNull);
1055 } else {
1056 *nreads = cself->reads->nreads;
1057 }
1058 return rc;
1059 }
1060
Experiment_GetSpotLength(const ExperimentXML * cself,uint32_t * spot_length)1061 rc_t Experiment_GetSpotLength(const ExperimentXML* cself, uint32_t* spot_length)
1062 {
1063 rc_t rc = 0;
1064 if( cself == NULL || spot_length == NULL ) {
1065 rc = RC(rcSRA, rcFormatter, rcAccessing, rcParam, rcNull);
1066 } else {
1067 *spot_length= cself->spot_length;
1068 }
1069 return rc;
1070 }
1071
Experiment_GetRead(const ExperimentXML * cself,uint8_t read_id,const ReadSpecXML_read ** read_spec)1072 rc_t Experiment_GetRead(const ExperimentXML* cself, uint8_t read_id, const ReadSpecXML_read** read_spec)
1073 {
1074 rc_t rc = 0;
1075 if( cself == NULL || read_spec == NULL ) {
1076 rc = RC(rcSRA, rcFormatter, rcAccessing, rcParam, rcNull);
1077 } else if( read_id >= cself->reads->nreads ) {
1078 rc = RC(rcSRA, rcFormatter, rcAccessing, rcId, rcOutofrange);
1079 } else {
1080 *read_spec = &(cself->reads->reads[read_id]);
1081 }
1082 return rc;
1083 }
1084
1085 typedef struct PoolMember_FindByTag_Data_struct {
1086 /* in */
1087 const char* tag;
1088 uint8_t readid;
1089 /* out */
1090 const char* member_name;
1091 } PoolMember_FindByTag_Data;
1092
1093 static
PoolMember_FindByTag(BSTNode * node,void * data)1094 bool CC PoolMember_FindByTag(BSTNode *node, void *data)
1095 {
1096 PoolMember_FindByTag_Data* d = (PoolMember_FindByTag_Data*)data;
1097 const PoolMember* n = (const PoolMember*)node;
1098 int i = 0;
1099
1100 const ReadSpecXML_read_BASECALL* bc = n->spec[d->readid].item[i];
1101 while( bc != NULL ) {
1102 if( strcmp(bc->read_group_tag, d->tag) == 0 ) {
1103 d->member_name = n->name;
1104 return true;
1105 }
1106 bc = n->spec[d->readid].item[++i];
1107 }
1108 return false;
1109 }
1110
Experiment_FindReadInTable(const ExperimentXML * cself,uint8_t read_id,const char * key,const char ** basecall,const char ** member_name)1111 rc_t Experiment_FindReadInTable(const ExperimentXML* cself, uint8_t read_id, const char* key, const char** basecall, const char** member_name)
1112 {
1113 rc_t rc = 0;
1114 if( cself == NULL || key == NULL || (basecall == NULL && member_name == NULL) ) {
1115 rc = RC(rcSRA, rcFormatter, rcAccessing, rcParam, rcNull);
1116 } else if( read_id >= cself->reads->nreads ) {
1117 rc = RC(rcSRA, rcFormatter, rcAccessing, rcId, rcOutofrange);
1118 } else {
1119 const ReadSpecXML_read* read_spec = &(cself->reads->reads[read_id++]);
1120 if( read_spec->coord_type != rdsp_ExpectedBaseCallTable_ct ) {
1121 rc = RC(rcSRA, rcFormatter, rcAccessing, rcName, rcNotFound);
1122 } else {
1123 ExpectedTableMatch match;
1124
1125 rc = RC(rcSRA, rcFormatter, rcAccessing, rcName, rcNotFound);
1126 /* try to find key as basecall */
1127 match.seq = key;
1128 match.len = strlen(key);
1129 match.tag = NULL;
1130 match.direction = etm_Both;
1131 match.left_adjusted = true;
1132 if( Experiment_ExpectedTableMatch(&read_spec->coord.expected_basecalls, &match) ) {
1133 rc = 0;
1134 if( basecall != NULL ) {
1135 *basecall = match.bc->basecall;
1136 }
1137 if( member_name != NULL ) {
1138 if( cself->member_pool != NULL ) {
1139 PoolMember_FindByTag_Data data;
1140 data.readid = read_id;
1141 data.tag = match.bc->read_group_tag;
1142 if( BSTreeDoUntil(cself->member_pool, false, PoolMember_FindByTag, &data) ) {
1143 *member_name = data.member_name;
1144 } else {
1145 rc = RC(rcSRA, rcFormatter, rcAccessing, rcSelf, rcCorrupt);
1146 }
1147 } else {
1148 *member_name = NULL;
1149 }
1150 }
1151 } else if( cself->member_pool != NULL ) {
1152 /* try to find key as member_name */
1153 PoolMember* pm = (PoolMember*)BSTreeFind(cself->member_pool, key, PoolMember_FindByName);
1154 if( pm != NULL ) {
1155 rc = 0;
1156 if( member_name != NULL ) {
1157 *member_name = pm->name;
1158 }
1159 if( basecall != NULL ) {
1160 if( pm->spec[read_id].item[0] != NULL ) {
1161 *basecall = pm->spec[read_id].item[0]->basecall;
1162 if( pm->spec[read_id].item[1] != NULL ) {
1163 rc = RC(rcSRA, rcFormatter, rcAccessing, rcData, rcAmbiguous);
1164 }
1165 } else {
1166 rc = RC(rcSRA, rcFormatter, rcAccessing, rcData, rcViolated);
1167 }
1168 }
1169 }
1170 }
1171 }
1172 }
1173 return rc;
1174 }
1175
Experiment_HasPool(const ExperimentXML * cself,bool * has_pool)1176 rc_t Experiment_HasPool(const ExperimentXML* cself, bool* has_pool)
1177 {
1178 rc_t rc = 0;
1179 if( cself == NULL || has_pool == NULL ) {
1180 rc = RC(rcSRA, rcFormatter, rcAccessing, rcParam, rcNull);
1181 } else {
1182 *has_pool = cself->member_pool != NULL;
1183 }
1184 return rc;
1185 }
1186
Experiment_ReadSegDefault(const ExperimentXML * self,SRASegment * seg)1187 rc_t Experiment_ReadSegDefault(const ExperimentXML* self, SRASegment* seg)
1188 {
1189 rc_t rc = 0;
1190
1191 /* TBD to do memberseg call based on junk seq of known length */
1192 if( self == NULL || seg == NULL ) {
1193 rc = RC(rcSRA, rcFormatter, rcResolving, rcParam, rcNull);
1194 } else if( self->spot_length == 0 ) {
1195 rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcUnsupported);
1196 } else {
1197 uint32_t i;
1198 int32_t spot_len = self->spot_length;
1199
1200 i = self->reads->nreads;
1201 do {
1202 int16_t len = 0;
1203 --i;
1204 switch(self->reads->reads[i].coord_type) {
1205 case rdsp_RelativeOrder_ct:
1206 if( i == 0 ) {
1207 len = spot_len;
1208 } else {
1209 len = 0;
1210 }
1211 break;
1212 case rdsp_BaseCoord_ct:
1213 case rdsp_CycleCoord_ct:
1214 len = spot_len - (self->reads->reads[i].coord.start_coord - 1);
1215 break;
1216 case rdsp_ExpectedBaseCall_ct:
1217 case rdsp_ExpectedBaseCallTable_ct:
1218 if( self->reads->reads[i].coord.expected_basecalls.base_coord > 0 ) {
1219 len = spot_len - (self->reads->reads[i].coord.expected_basecalls.base_coord - 1);
1220 }
1221 len += self->reads->reads[i].coord.expected_basecalls.default_length;
1222 break;
1223 default:
1224 rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcUnexpected);
1225 LOGERR(klogErr, rc, "read type in default");
1226 break;
1227 }
1228 spot_len -= len;
1229 if( spot_len < 0 || len < 0 ) {
1230 rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcInconsistent);
1231 LOGERR(klogErr, rc, "cumulative read lengths and SEQUENCE_LENGTH");
1232 return rc;
1233 } else {
1234 seg[i].start = spot_len;
1235 seg[i].len = len;
1236 }
1237 } while( i > 0 );
1238 }
1239 return rc;
1240 }
1241
Experiment_Whack(const ExperimentXML * cself)1242 void Experiment_Whack(const ExperimentXML* cself)
1243 {
1244 if( cself != NULL ) {
1245 ExperimentXML* self = (ExperimentXML*)cself;
1246
1247 free(self->processing.quality_scorer);
1248 ReadSpecXML_Whack(self->reads);
1249 BSTreeWhack(self->member_pool, PoolMember_Whack, NULL);
1250 free(self->member_pool);
1251 PoolMember_Whack(&self->member_null->node, NULL);
1252 PlatformXML_Whack(self->platform);
1253 free(self);
1254 }
1255 }
1256