1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 #include <klib/rc.h>
27 #include <klib/log.h>
28 #include <klib/debug.h>
29 #include <klib/container.h>
30 #include <klib/trie.h>
31 #include <sra/types.h>
32 #include <os-native.h>
33 
34 #include "experiment-xml.h"
35 #include "debug.h"
36 
37 #include <stdlib.h>
38 #include <stdio.h>
39 #include <string.h>
40 #include <ctype.h>
41 #include <assert.h>
42 
43 #define MAX_POOL_MEMBER_BASECALLS 64
44 #define MAX_SPOT_DESCRIPTOR_READS (SRALOAD_MAX_READS + 2)
45 
46 typedef struct PoolReadSpec_struct {
47     int16_t start;
48     int16_t length;
49     /* chosen basecalls for EXPECTED_BASECALL_TABLE for defined member_name, NULL-terminated array */
50     const ReadSpecXML_read_BASECALL* item[MAX_POOL_MEMBER_BASECALLS];
51     int32_t score;
52 } PoolReadSpec;
53 
54 typedef struct PoolMember_struct {
55     BSTNode node;
56     const char *name;
57     PoolReadSpec spec[MAX_SPOT_DESCRIPTOR_READS];
58 } PoolMember;
59 
60 struct ExperimentXML {
61     uint32_t spot_length;
62     const PlatformXML* platform;
63     ProcessingXML processing;
64     const ReadSpecXML* reads;
65     BSTree* member_pool;
66     PoolMember* member_null;
67     SLList member_stats;
68 };
69 
70 static
PoolMember_Whack(BSTNode * node,void * data)71 void CC PoolMember_Whack(BSTNode* node, void* data)
72 {
73     if( node != NULL ) {
74         const PoolMember* n = (const PoolMember*)node;
75         free((void*)n->name);
76         free(node);
77     }
78 }
79 
80 static
PoolMember_StrCmp(const char * s1,const char * s2)81 int64_t CC PoolMember_StrCmp(const char* s1, const char* s2)
82 {
83     if( s1 == NULL && s2 == NULL ) {
84         return 0;
85     } else if( s1 == NULL ) {
86         return -32767;
87     } else if( s2 == NULL ) {
88         return 32768;
89     }
90     return strcmp(s1, s2);
91 }
92 
93 static
PoolMember_Cmp(const BSTNode * item,const BSTNode * node)94 int64_t CC PoolMember_Cmp(const BSTNode* item, const BSTNode* node)
95 {
96     const PoolMember* i = (const PoolMember*)item;
97     const PoolMember* n = (const PoolMember*)node;
98 
99     return PoolMember_StrCmp(i->name, n->name);
100 }
101 
102 static
PoolMember_FindByName(const void * item,const BSTNode * node)103 int64_t CC PoolMember_FindByName(const void* item, const BSTNode* node)
104 {
105     const char* name = (const char*)item;
106     const PoolMember* n = (const PoolMember*)node;
107 
108     return PoolMember_StrCmp(name, n->name);
109 }
110 
111 #if _DEBUGGING
112 static
PoolMember_Dump(BSTNode * node,void * data)113 void CC PoolMember_Dump( BSTNode *node, void *data )
114 {
115     const PoolMember* n = (const PoolMember*)node;
116     const ExperimentXML* self = (const ExperimentXML*)data;
117     int r;
118 
119     DEBUG_MSG(9, ("Member '%s'\n", n->name));
120 
121     for(r = 0; r <= self->reads->nreads + 1; r++) {
122         DEBUG_MSG(9, ("Read %d [%hd:%hd]", r + 1, n->spec[r].start, n->spec[r].length));
123         if( self->reads->spec[r].coord_type == rdsp_ExpectedBaseCall_ct ) {
124             DEBUG_MSG(9, (" ebc:'%s'", n->spec[r].item[0]->basecall));
125         } else if( self->reads->spec[r].coord_type == rdsp_ExpectedBaseCallTable_ct ) {
126             int i = -1;
127             while( n->spec[r].item[++i] != NULL ) {
128                 DEBUG_MSG(9, (" bc:'%s', tag:'%s';", n->spec[r].item[i]->basecall, n->spec[r].item[i]->read_group_tag));
129             }
130         }
131         DEBUG_MSG(9, ("\n"));
132     }
133 }
134 #endif
135 
136 static
PoolMember_Add(ExperimentXML * self,const char * name)137 rc_t PoolMember_Add(ExperimentXML* self, const char* name)
138 {
139     rc_t rc = 0;
140     PoolMember* member = NULL;
141 
142     if( self == NULL) {
143         return RC(rcSRA, rcFormatter, rcAllocating, rcSelf, rcNull);
144     }
145     member = calloc(1, sizeof(*member));
146     if( member == NULL ) {
147         return RC(rcSRA, rcFormatter, rcAllocating, rcMemory, rcExhausted);
148     }
149     if( name == NULL ) {
150         if( self->member_null == NULL ) {
151             self->member_null = member;
152         } else {
153             rc = RC(rcSRA, rcFormatter, rcAllocating, rcId, rcDuplicate);
154         }
155     } else {
156         member->name = strdup(name);
157         if( member->name == NULL ) {
158             rc = RC(rcSRA, rcFormatter, rcAllocating, rcMemory, rcExhausted);
159         } else {
160             if( self->member_pool == NULL ) {
161                 self->member_pool = calloc(1, sizeof(BSTree));
162                 if( self->member_pool == NULL ) {
163                     rc = RC(rcSRA, rcFormatter, rcAllocating, rcMemory, rcExhausted);
164                 } else {
165                     BSTreeInit(self->member_pool);
166                 }
167             }
168             if( rc == 0 ) {
169                 rc = BSTreeInsertUnique(self->member_pool, &member->node, NULL, PoolMember_Cmp);
170             }
171         }
172     }
173     if( rc == 0 ) {
174         uint32_t i;
175         for(i = 0; i <= self->reads->nreads + 1; i++) {
176             switch(self->reads->spec[i].coord_type) {
177                 case rdsp_FIXED_BRACKET_ct:
178                     member->spec[i].start = self->reads->spec[i].coord.start_coord - 1;
179                     member->spec[i].length = (i == 0 ? 0 : -1);
180                     break;
181                 case rdsp_RelativeOrder_ct:
182                     member->spec[i].start = -1;
183                     member->spec[i].length = -1;
184                     break;
185 
186                 case rdsp_BaseCoord_ct:
187                 case rdsp_CycleCoord_ct:
188                     member->spec[i].start = self->reads->spec[i].coord.start_coord - 1;
189                     member->spec[i].length = -1;
190                     break;
191 
192                 case rdsp_ExpectedBaseCall_ct:
193                     if( name == NULL || name[0] == '\0' ) {
194                         member->spec[i].start = self->reads->spec[i].coord.expected_basecalls.base_coord;
195                         member->spec[i].length = self->reads->spec[i].coord.expected_basecalls.default_length;
196                     }
197                     if( member->spec[i].start == 0 ) {
198                         member->spec[i].start = -1;
199                     }
200                     member->spec[i].item[0] = self->reads->spec[i].coord.expected_basecalls.table;
201                     if( member->spec[i].length == 0 ) {
202                         member->spec[i].length = strlen(member->spec[i].item[0]->basecall);
203                     }
204                     break;
205 
206                 case rdsp_ExpectedBaseCallTable_ct:
207                     if( name == NULL || name[0] == '\0' ) {
208                         member->spec[i].start = self->reads->spec[i].coord.expected_basecalls.base_coord;
209                         member->spec[i].length = self->reads->spec[i].coord.expected_basecalls.default_length;
210                     }
211                     if( member->spec[i].start == 0 ) {
212                         member->spec[i].start = -1;
213                     }
214                     if( member->spec[i].length == 0 ) {
215                         member->spec[i].length = (!self->reads->spec[i].coord.expected_basecalls.pooled || name == NULL) ? -1 : 0;
216                     }
217                     break;
218             }
219         }
220     } else {
221         PoolMember_Whack(&member->node, NULL);
222     }
223     return rc;
224 }
225 
226 static
PoolMember_AddBasecalls(const ExperimentXML * self,const char * member,int readid,const ReadSpecXML_read_BASECALL * basecalls[])227 rc_t PoolMember_AddBasecalls(const ExperimentXML* self, const char* member, int readid, const ReadSpecXML_read_BASECALL* basecalls[])
228 {
229     rc_t rc = 0;
230 
231     if( self == NULL || basecalls == NULL ) {
232         rc = RC(rcSRA, rcFormatter, rcParsing, rcParam, rcNull);
233     } else if( self->reads->reads[readid].coord_type != rdsp_ExpectedBaseCallTable_ct ) {
234         rc = RC(rcSRA, rcFormatter, rcParsing, rcId, rcIncorrect);
235     } else {
236         PoolMember* pm = (PoolMember*)BSTreeFind(self->member_pool, member, PoolMember_FindByName);
237         if( pm == NULL ) {
238             rc = RC(rcSRA, rcFormatter, rcResolving, rcData, rcNotFound);
239             PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)'", PLOG_S(m), member));
240         } else {
241             int k = -1, i = -1;
242             while( pm->spec[readid + 1].item[++k] != NULL );
243             while( rc == 0 && basecalls[++i] != NULL ) {
244                 if( k >= MAX_POOL_MEMBER_BASECALLS ) {
245                     rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcTooLong);
246                     PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)' basecalls", PLOG_S(m), member));
247                 } else {
248                     pm->spec[readid + 1].item[k++] = basecalls[i];
249                 }
250             }
251         }
252     }
253     return rc;
254 }
255 
parse_POOL(const KXMLNode * EXPERIMENT,ExperimentXML * obj)256 rc_t parse_POOL(const KXMLNode* EXPERIMENT, ExperimentXML* obj)
257 {
258     rc_t rc = 0;
259 
260     /* create unspecified member -> NULL */
261     if( (rc = PoolMember_Add(obj, NULL)) == 0 ) {
262         const KXMLNodeset* MEMBERS;
263         if( (rc = KXMLNodeOpenNodesetRead(EXPERIMENT, &MEMBERS, "DESIGN/SAMPLE_DESCRIPTOR/POOL/MEMBER")) == 0 ) {
264             uint32_t i, num_members = 0;
265             if( (rc = KXMLNodesetCount(MEMBERS, &num_members)) == 0 && num_members > 0 ) {
266                 /* create default member -> '' */
267                 rc = PoolMember_Add(obj, "");
268                 for(i = 0; rc == 0 && i < num_members; i++) {
269                     const KXMLNode* MEMBER;
270                     if( (rc = KXMLNodesetGetNodeRead(MEMBERS, &MEMBER, i)) == 0 ) {
271                         char* member_name = NULL;
272                         if( (rc = KXMLNodeReadAttrCStr(MEMBER, "member_name", &member_name, NULL)) == 0 ) {
273                             const KXMLNodeset* LABELS;
274                             if( strcasecmp(member_name, "default") == 0 ) {
275                                 member_name[0] = '\0';
276                             }
277                             if( member_name[0] != '\0' ) {
278                                 if( (rc = PoolMember_Add(obj, member_name)) != 0 ) {
279                                     PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)'", PLOG_S(m), member_name));
280                                 }
281                             }
282                             if( rc == 0 && (rc = KXMLNodeOpenNodesetRead(MEMBER, &LABELS, "READ_LABEL")) == 0 ) {
283                                 uint32_t l, num_labels = 0;
284                                 KXMLNodesetCount(LABELS, &num_labels);
285                                 if( num_labels > 0 ) {
286                                     const ReadSpecXML_read_BASECALL* basecalls[MAX_POOL_MEMBER_BASECALLS];
287                                     for(l = 0; rc == 0 && l < num_labels; l++) {
288                                         const KXMLNode* LABEL;
289                                         if( (rc = KXMLNodesetGetNodeRead(LABELS, &LABEL, l)) == 0 ) {
290                                             char* label = NULL, *tag = NULL;
291                                             if( (rc = KXMLNodeReadCStr(LABEL, &label, NULL)) != 0 || label == NULL || label[0] == '\0' ) {
292                                                 rc = rc ? rc : RC(rcSRA, rcFormatter, rcConstructing, rcData, rcEmpty);
293                                                 PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)' READ_LABEL value", PLOG_S(m), member_name));
294                                             } else {
295                                                 if( (rc = KXMLNodeReadAttrCStr(LABEL, "read_group_tag", &tag, NULL)) != 0 ) {
296                                                     if( GetRCState(rc) != rcNotFound ) {
297                                                         PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)' READ_LABEL '$(l)' 'read_group_tag' attribute",
298                                                                                         PLOG_2(PLOG_S(m),PLOG_S(l)), member_name, label));
299                                                     } else {
300                                                         rc = 0;
301                                                     }
302                                                 } else if( tag != NULL && tag[0] == '\0' )  {
303                                                     rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcEmpty);
304                                                     PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)' READ_LABEL '$(l)' 'read_group_tag' attribute",
305                                                                                    PLOG_2(PLOG_S(m),PLOG_S(l)), member_name, label));
306                                                 }
307                                                 if( rc == 0 ) {
308                                                     /* find read_spec based on label and update basecall table */
309                                                     uint32_t r, read_found = 0;
310                                                     for(r = 0; rc == 0 && r < obj->reads->nreads; r++) {
311                                                         if( obj->reads->reads[r].read_label != NULL && strcmp(label, obj->reads->reads[r].read_label) == 0 ) {
312                                                             read_found++;
313                                                             if( tag != NULL && tag[0] != '\0' ) {
314                                                                 if( obj->reads->reads[r].coord_type != rdsp_ExpectedBaseCallTable_ct ) {
315                                                                     rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcInconsistent);
316                                                                     PLOGERR(klogErr, (klogErr, rc,
317                                                                             "MEMBER '$(m)' READ_LABEL '$(l)' READ_SPEC must have EXPECTED_BASECALL_TABLE type",
318                                                                             PLOG_2(PLOG_S(m),PLOG_S(l)), member_name, label));
319                                                                 } else {
320                                                                     uint32_t b, bcall_found = 0;
321                                                                     memset(basecalls, 0, sizeof(basecalls));
322                                                                     /* find tag in table */
323                                                                     for(b = 0; rc == 0 && b < obj->reads->reads[r].coord.expected_basecalls.count; b++) {
324                                                                         if( obj->reads->reads[r].coord.expected_basecalls.table[b].read_group_tag != NULL &&
325                                                                             strcmp(tag, obj->reads->reads[r].coord.expected_basecalls.table[b].read_group_tag) == 0 ) {
326                                                                             if( bcall_found >= MAX_POOL_MEMBER_BASECALLS ) {
327                                                                                 rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcExcessive);
328                                                                                 PLOGERR(klogErr, (klogErr, rc,
329                                                                                         "MEMBER '$(m)' READ_LABEL '$(l)' read_group_tag='$(t)' too many basecalls",
330                                                                                          PLOG_3(PLOG_S(m),PLOG_S(l),PLOG_S(t)), member_name, label, tag));
331                                                                             }
332                                                                             basecalls[bcall_found++] = &obj->reads->reads[r].coord.expected_basecalls.table[b];
333                                                                         }
334                                                                     }
335                                                                     if( rc == 0 && bcall_found == 0 ) {
336                                                                         rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcNotFound);
337                                                                         PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)' READ_LABEL '$(l)' read_group_tag='$(t)' in READ_SPEC",
338                                                                                  PLOG_3(PLOG_S(m),PLOG_S(l),PLOG_S(t)), member_name, label, tag));
339                                                                     } else {
340                                                                         rc = PoolMember_AddBasecalls(obj, member_name, r, basecalls);
341                                                                     }
342                                                                 }
343                                                             }
344                                                         }
345                                                     }
346                                                     if( read_found == 0 ) {
347                                                         rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcNotFound);
348                                                         PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)' READ_LABEL '$(l)' in READ_SPEC", PLOG_2(PLOG_S(l),PLOG_S(m)), label, member_name));
349                                                     } else if( read_found > 1 ) {
350                                                         rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcInconsistent);
351                                                         PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)' READ_LABEL '$(l)' in READ_SPEC", PLOG_2(PLOG_S(l),PLOG_S(m)), label, member_name));
352                                                     }
353                                                 }
354                                             }
355                                             free(tag);
356                                             free(label);
357                                             KXMLNodeRelease(LABEL);
358                                         } else {
359                                             PLOGERR(klogErr, (klogErr, rc, "MEMBER '$(m)' READ_LABEL #$(i)", PLOG_2(PLOG_S(m),PLOG_U32(i)), member_name, l));
360                                         }
361                                     }
362                                 } else {
363                                     rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcInvalid);
364                                     PLOGERR(klogErr, (klogErr, rc, "missing READ_LABEL element(s) in MEMBER '$(m)'", PLOG_S(m), member_name));
365                                 }
366                                 KXMLNodesetRelease(LABELS);
367                             }
368                             free(member_name);
369                         } else {
370                             PLOGERR(klogErr, (klogErr, rc, "'member_name' in MEMBER node #$(i)", PLOG_U32(i), i));
371                         }
372                         KXMLNodeRelease(MEMBER);
373                     } else {
374                         PLOGERR(klogErr, (klogErr, rc, "MEMBER node #$(i)", PLOG_U32(i), i));
375                     }
376                 }
377             } else if( rc != 0 ) {
378                 LOGERR(klogErr, rc, "POOL/MEMBER(s)");
379             }
380             KXMLNodesetRelease(MEMBERS);
381         } else {
382             LOGERR(klogErr, rc, "POOL/MEMBER(s)");
383         }
384     }
385 #if _DEBUGGING
386     PoolMember_Dump(&obj->member_null->node, obj);
387     BSTreeForEach(obj->member_pool, false, PoolMember_Dump, obj);
388 #endif
389     return rc;
390 }
391 
392 typedef struct ExpectedTableMatch_struct {
393     /* in */
394     const char* seq;
395     uint32_t len;
396     const char* tag;
397     enum {
398         etm_Forward = 1,
399         etm_Reverse,
400         etm_Both
401     } direction;
402     bool left_adjusted;
403     /* out */
404     AgrepMatch match;
405     ReadSpecXML_read_BASECALL* bc;
406 } ExpectedTableMatch;
407 
408 static
Experiment_ExpectedTableMatch(const ReadSpecXML_read_BASECALL_TABLE * table,ExpectedTableMatch * data)409 bool Experiment_ExpectedTableMatch(const ReadSpecXML_read_BASECALL_TABLE* table, ExpectedTableMatch* data)
410 {
411     uint32_t i;
412 
413     assert(table && data && data->seq );
414 
415     data->match.score = -1;
416     for(i = 0; i < table->count; i++) {
417         AgrepMatch match;
418         ReadSpecXML_read_BASECALL* bc = &table->table[i];
419 
420         if( data->tag != NULL && strcmp(data->tag, bc->read_group_tag) != 0 ) {
421             continue;
422         }
423         if( data->direction != etm_Both ) {
424             if( bc->match_edge == match_edge_End ) {
425                 if( data->direction == etm_Forward ) {
426                     continue;
427                 }
428             } else if( data->direction != etm_Forward ) {
429                 continue;
430             }
431         }
432         match.score = -1;
433         if( data->left_adjusted ) {
434             if( AgrepFindFirst(bc->agrep, bc->max_mismatch, data->seq, data->len, &match) ) {
435                 if( data->direction == etm_Reverse ) {
436                     int32_t right_tail = data->len - match.position - strlen(bc->basecall);
437                     if( right_tail > bc->max_mismatch - match.score ) {
438                         match.score = -1;
439                     } else if( match.position > 0 ) {
440                         match.score += right_tail;
441                         match.length += right_tail;
442                         match.position = 0;
443                     }
444                 } else {
445                     if( match.position > bc->max_mismatch - match.score ) {
446                         match.score = -1;
447                     } else if( match.position > 0 ) {
448                         match.score += match.position;
449                         match.length += match.position;
450                         match.position = 0;
451                     }
452                 }
453             }
454         } else {
455             AgrepFindBest(bc->agrep, bc->max_mismatch, data->seq, data->len, &match);
456         }
457         if( !(match.score < 0) ) {
458             DEBUG_MSG(3, ("agrep match bc %s, min: %u, max: %u, edge: %s%s, [%d:%d] score %d\n", bc->basecall, bc->min_match, bc->max_mismatch,
459                        bc->match_edge == match_edge_Full ? "Full" : (bc->match_edge == match_edge_Start ? "Start" : "End"),
460                        data->left_adjusted ? " left adjusted" : "",
461                        match.position, match.length, match.score));
462             if( (data->match.score < 0 || data->match.score > match.score) ) {
463                 memmove(&data->match, &match, sizeof(data->match));
464                 data->bc = bc;
465             }
466         }
467     }
468     return !(data->match.score < 0);
469 }
470 
471 static
Experiment_FillRead(const ReadSpecXML_read * spec,PoolReadSpec * pool,const char ** tag,const uint32_t nreads,int16_t start,const char * bases,uint32_t nbases)472 rc_t Experiment_FillRead(const ReadSpecXML_read* spec, PoolReadSpec* pool,
473                          const char** tag, const uint32_t nreads, int16_t start, const char* bases, uint32_t nbases)
474 {
475     rc_t rc = 0;
476 
477     switch(spec->coord_type) {
478         case rdsp_RelativeOrder_ct:
479             if( pool[-1].length >= 0 ) {
480                 if( pool[-1].start >= 0 ) {
481                     pool->start = pool[-1].start + pool[-1].length;
482                 } else {
483                     pool->start = start;
484                 }
485             } else {
486                 pool->start = -1;
487             }
488             pool->length = -1;
489             if( nreads > 1 ) {
490                 if( (rc = Experiment_FillRead(spec + 1, pool + 1, tag + 1, nreads - 1, start, bases, nbases)) != 0 ) {
491                     break;
492                 }
493             }
494             if( pool->start < 0 ) {
495                 pool->start = pool[1].start;
496             }
497             pool->length = pool[1].start - pool->start;
498             break;
499 
500         case rdsp_BaseCoord_ct:
501         case rdsp_CycleCoord_ct:
502             pool->start = spec->coord.start_coord - 1;
503             pool->length = -1;
504             if( nreads > 1 ) {
505                 if( (rc = Experiment_FillRead(spec + 1, pool + 1, tag + 1, nreads - 1, pool->start, bases, nbases)) != 0 ) {
506                     break;
507                 }
508             }
509             pool->length = pool[1].start - pool->start;
510             break;
511 
512         case rdsp_ExpectedBaseCall_ct:
513         case rdsp_ExpectedBaseCallTable_ct:
514             {{
515             int32_t gap;
516             if( spec->coord.expected_basecalls.base_coord > 0 ) {
517                 pool->start = spec->coord.expected_basecalls.base_coord - 1;
518                 start = pool->start;
519             } else {
520                 pool->start = -1;
521             }
522             pool->length = -1;
523             pool->item[0] = NULL;
524             pool->score = -1;
525             if( spec->coord.expected_basecalls.match_start > 0 ) {
526                 uint32_t i, stop = nbases;
527                 ExpectedTableMatch data;
528                 /* find terminator, if any */
529                 for(i = 1; i < nreads; i++) {
530                     if( spec[i].coord_type == rdsp_BaseCoord_ct ||
531                         spec[i].coord_type == rdsp_CycleCoord_ct ||
532                         spec[i].coord_type == rdsp_FIXED_BRACKET_ct ) {
533                         stop = spec[i].coord.start_coord - 1;
534                         break;
535                     }
536                 }
537                 data.seq = &bases[start];
538                 data.len = stop - start;
539                 data.tag = *tag;
540                 data.direction = etm_Forward;
541                 data.left_adjusted = !(pool->start < 0) || !(pool[-1].length < 0);
542                 DEBUG_MSG(3, ("agrep find read %u fwd in: '%.*s' = %u\n", nreads, data.len, data.seq, data.len));
543                 if( Experiment_ExpectedTableMatch(&spec->coord.expected_basecalls, &data) ) {
544                     DEBUG_MSG(3, ("agrep found from %hd match %d:%d - %d '%s' in '%.*s...' \n",
545                                    start, data.match.position, data.match.length,
546                                    data.match.score, data.bc->basecall,
547                                    start + data.match.position + data.match.length * 3 + data.match.score, &bases[start]));
548                     pool->start = start + data.match.position;
549                     pool->length = data.match.length;
550                     pool->item[0] = data.bc;
551                     pool->score = data.match.score;
552                     start = pool->start + pool->length;
553                 } else {
554                     DEBUG_MSG(3, ("agrep nothing found\n"));
555                 }
556             }
557             if( pool[-1].length >= 0 && pool->length < 0 && spec->coord.expected_basecalls.match_end == 0 ) {
558                 pool->length = spec->coord.expected_basecalls.default_length;
559                 start += pool->length;
560             }
561             if( nreads > 1 ) {
562                 if( (rc = Experiment_FillRead(spec + 1, pool + 1, tag + 1, nreads - 1, start, bases, nbases)) != 0 ) {
563                     break;
564                 }
565             }
566             if( spec->coord.expected_basecalls.match_end > 0 ) {
567                 /* match_end can have a better match even if match_start found smth before
568                    normally table has start or end match edge, not both */
569                 char r[4096];
570                 uint32_t i;
571                 ExpectedTableMatch data;
572 
573                 data.len = pool[1].start - start;
574                 DEBUG_MSG(3, ("agrep find read %u rev in: '%.*s' = %u\n", nreads, data.len, &bases[start], data.len));
575                 if( data.len > sizeof(r) ) {
576                     rc = RC(rcSRA, rcFormatter, rcResolving, rcBuffer, rcInsufficient);
577                     break;
578                 }
579                 for(i = 0; i < data.len; i++) {
580                     /* reverse a portion of seq */
581                     r[i] = bases[pool[1].start - i - 1];
582                 }
583                 data.seq = r;
584                 data.tag = *tag;
585                 data.direction = etm_Reverse;
586                 data.left_adjusted = spec[1].coord_type != rdsp_RelativeOrder_ct && pool[1].start > 0;
587                 DEBUG_MSG(3, ("agrep find read %u rev in: '%.*s' = %u\n", nreads, data.len, data.seq, data.len));
588                 if( Experiment_ExpectedTableMatch(&spec->coord.expected_basecalls, &data) ) {
589                     DEBUG_MSG(3, ("agrep found rev from %hd match %d:%d - %d '%s' in '%.*s' \n",
590                                     start, data.match.position, data.match.length, data.match.score,
591                                     data.bc->basecall, data.len, &bases[pool[1].start - data.len]));
592                     if( pool->length < 0 || data.match.score < pool->score ) {
593                         pool->start = pool[1].start - data.match.position - data.match.length;
594                         pool->length = data.match.length;
595                         pool->item[0] = data.bc;
596                         pool->score = data.match.score;
597                     }
598                 } else {
599                     DEBUG_MSG(3, ("agrep nothing found rev\n"));
600                 }
601             }
602             if( pool->length < 0 ) {
603                 pool->length = spec->coord.expected_basecalls.default_length;
604             }
605             if( pool->start < 0 ) {
606                 pool->start = pool[1].start - pool->length;
607             }
608             gap = pool[1].start - (pool->start + pool->length);
609             if( gap > 0 ) {
610                 /* right gap */
611                 if( spec[1].coord_type == rdsp_RelativeOrder_ct ) {
612                     /* adjust right read's start to the tail of current read */
613                     pool[1].start -= gap;
614                     pool[1].length += gap;
615                     gap = 0;
616                 } else if( spec[1].coord_type >= rdsp_ExpectedBaseCall_ct &&
617                     spec[1].coord.expected_basecalls.base_coord <= 0 && pool[1].item[0] != NULL ) {
618                     /* move left read's start to left as much as possible */
619                     int32_t left = pool[1].item[0]->max_mismatch - pool[1].score;
620                     if( left > 0 ) {
621                         if( left > gap ) {
622                             left = gap;
623                         }
624                         pool[1].length += left;
625                         pool[1].score += left;
626                         pool[1].start -= left;
627                         gap -= left;
628                     }
629                 }
630                 if( gap > 0 && pool->item[0] != NULL ) {
631                     /* add gap to end of self */
632                     int32_t left = pool->item[0]->max_mismatch - pool->score;
633                     if( left > 0 ) {
634                         if( left > gap ) {
635                             left = gap;
636                         }
637                         pool->length += left;
638                         pool->score += left;
639                     }
640                 }
641             }
642             break;
643             }}
644         default:
645             rc = RC(rcSRA, rcFormatter, rcResolving, rcItem, rcUnexpected);
646             break;
647     }
648     return rc;
649 }
650 
651 typedef struct PoolMember_FindReverse_Data_struct {
652     const char* tags[MAX_SPOT_DESCRIPTOR_READS];
653     uint8_t nreads;
654     const PoolMember* member;
655 } PoolMember_FindReverse_Data;
656 
657 static
PoolMember_FindReverse(BSTNode * n,void * d)658 bool CC PoolMember_FindReverse(BSTNode* n, void* d)
659 {
660     const PoolMember* node = (const PoolMember*)n;
661     PoolMember_FindReverse_Data* data = (PoolMember_FindReverse_Data*)d;
662     int r, j, found;
663 
664     for(r = 1; r <= data->nreads; r++) {
665         /* comparing pointers here because tags contains same pointers */
666         if( node->spec[r].item[0] == NULL && data->tags[r] == NULL ) {
667             continue;
668         }
669         found = 0;
670         for(j = 0; node->spec[r].item[j] != NULL; j++ ) {
671             if( node->spec[r].item[j]->read_group_tag == data->tags[r] ) {
672                 found = 1;
673                 break;
674             }
675         }
676         if( found == 0 ) {
677             return false;
678         }
679     }
680     data->member = node;
681     return true;
682 }
683 
684 static
Experiment_FillSegAgrep(const ExperimentXML * self,PoolReadSpec * pool,const PoolMember * src,uint32_t nbases,const char * bases,const char ** const new_member_name)685 rc_t Experiment_FillSegAgrep(const ExperimentXML* self, PoolReadSpec* pool, const PoolMember* src,
686                              uint32_t nbases, const char* bases,
687                              const char** const new_member_name)
688 {
689     rc_t rc = 0;
690     uint32_t i;
691     PoolMember_FindReverse_Data data;
692 
693     /* close left and right brackets tech reads as spot terminators */
694     pool[0].start = 0;
695     pool[0].length = 0;
696     pool[self->reads->nreads + 1].start = nbases;
697     pool[self->reads->nreads + 1].length = 0;
698 
699     /* setup known tags if it is predefined member search */
700     for(i = 0; i <= self->reads->nreads + 1; i++) {
701         data.tags[i] = src->spec[i].item[0] ? src->spec[i].item[0]->read_group_tag : NULL;
702     }
703     /* start from read, not bracket read */
704     rc = Experiment_FillRead(self->reads->reads, &pool[1], &data.tags[1], self->reads->nreads, 0, bases, nbases);
705 
706     if( rc != 0 ) {
707         DEBUG_MSG(3, ("ERROR: %R\n", rc));
708     }
709     DEBUG_MSG(3, ("%s: bases %u: '%.*s'\n", new_member_name ? *new_member_name : NULL, nbases, nbases, bases));
710     for(i = 1; i <= self->reads->nreads; i++) {
711         const char* b = NULL;
712         if( pool[i].start >= 0 ) {
713             b = &bases[pool[i].start];
714         }
715         DEBUG_MSG(3, ("Read %u. [%hd:%hd] - '%.*s'",
716                        i, pool[i].start, pool[i].length, pool[i].length < 0 ? 0 : pool[i].length, b));
717         if( self->reads->spec[i].coord_type == rdsp_FIXED_BRACKET_ct ) {
718             DEBUG_MSG(3, (" terminator???"));
719         }
720         if( self->reads->spec[i].coord_type >= rdsp_ExpectedBaseCall_ct ) {
721             if( pool[i].item[0] ) {
722                 if( pool[i].item[0]->match_edge == match_edge_End ) {
723                     DEBUG_MSG(3, (" in reverse"));
724                 }
725                 DEBUG_MSG(3, (" matches %s", pool[i].item[0]->basecall));
726                 if( pool[i].item[0]->read_group_tag ) {
727                     DEBUG_MSG(3, (" tagged %s", pool[i].item[0]->read_group_tag));
728                 }
729                 DEBUG_MSG(3, (" with %d mismatches", pool[i].score));
730             } else {
731                 DEBUG_MSG(3, (" no match"));
732             }
733         }
734         DEBUG_MSG(3, ("\n"));
735     }
736 
737     if( rc == 0 && new_member_name != NULL ) {
738         data.nreads = self->reads->nreads;
739         data.member = NULL;
740         for(i = 1; i <= self->reads->nreads; i++) {
741             if( self->reads->spec[i].coord_type == rdsp_ExpectedBaseCallTable_ct && pool[i].item[0] != NULL ) {
742                 data.tags[i] = pool[i].item[0]->read_group_tag;
743             } else {
744                 data.tags[i] = NULL;
745             }
746         }
747         if( BSTreeDoUntil(self->member_pool, false, PoolMember_FindReverse, &data) ) {
748             *new_member_name = data.member->name;
749             DEBUG_MSG(2, ("Assigned member_name '%s'\n", *new_member_name));
750         } else {
751 #if _DEBUGGING
752             DEBUG_MSG(2, ("member_name reverse lookup failed for tags:"));
753             for(i = 1; i <= self->reads->nreads; i++) {
754                 DEBUG_MSG(2, (" '%s'", data.tags[i]));
755             }
756             DEBUG_MSG(2, ("\n"));
757 #endif
758         }
759     }
760     return rc;
761 }
762 
Experiment_MemberSeg(const ExperimentXML * self,const char * const file_member_name,const char * const data_block_member_name,uint32_t nbases,const char * bases,SRASegment * seg,const char ** const new_member_name)763 rc_t Experiment_MemberSeg(const ExperimentXML* self,
764                           const char* const file_member_name, const char* const data_block_member_name,
765                           uint32_t nbases, const char* bases,
766                           SRASegment* seg, const char** const new_member_name)
767 {
768     rc_t rc = 0;
769     uint32_t i;
770     PoolReadSpec pool[MAX_SPOT_DESCRIPTOR_READS];
771 
772     if( self == NULL || bases == NULL || seg == NULL || new_member_name == NULL ) {
773         rc = RC(rcSRA, rcFormatter, rcResolving, rcParam, rcNull);
774     } else if( nbases < 1 ) {
775         rc = RC(rcSRA, rcFormatter, rcResolving, rcData, rcEmpty);
776     }
777 
778     if( rc == 0 ) {
779         const char* mm = NULL;
780         switch(self->processing.barcode_rule) {
781             case eBarcodeRule_not_set:
782                 mm = data_block_member_name ? data_block_member_name : file_member_name;
783                 break;
784 
785             case eBarcodeRule_use_file_spot_name:
786                 mm = file_member_name;
787                 break;
788 
789             case eBarcodeRule_use_table_in_experiment:
790             case eBarcodeRule_ignore_barcode:
791                 mm = NULL;
792                 break;
793         }
794         if( self->member_pool == NULL || self->processing.barcode_rule == eBarcodeRule_ignore_barcode ) {
795             if( (rc = Experiment_FillSegAgrep(self, pool, self->member_null, nbases, bases, NULL)) == 0 ) {
796                 *new_member_name = mm;
797             }
798         } else if( mm == NULL ) {
799             *new_member_name = NULL;
800 	    rc = Experiment_FillSegAgrep(self, pool, self->member_null, nbases, bases, new_member_name);
801         } else {
802             PoolMember* pm = (PoolMember*)BSTreeFind(self->member_pool, mm, PoolMember_FindByName);
803             if( pm == NULL ) {
804                 rc = RC(rcSRA, rcFormatter, rcResolving, rcData, rcNotFound);
805                 PLOGERR(klogErr, (klogErr, rc, "specified member_name '$(m)'", PLOG_S(m), mm));
806             } else {
807                 if( (rc = Experiment_FillSegAgrep(self, pool, pm, nbases, bases, NULL)) == 0 ) {
808                     *new_member_name = mm;
809                 }
810             }
811         }
812     }
813 
814     if( rc == 0 ) {
815         int32_t new_bases = 0;
816 
817         for(i = 1; i <= self->reads->nreads; i++) {
818             if( pool[i].start < 0 || pool[i].length < 0 ) {
819                 rc = RC(rcSRA, rcFormatter, rcResolving, rcData, rcIncomplete);
820                 PLOGERR(klogErr, (klogErr, rc, "Read #$(i) boundaries", PLOG_U32(i), i));
821             } else if( i == 1 ) {
822                 if( pool[i].start != 0 ) {
823                     rc = RC(rcSRA, rcFormatter, rcResolving, rcData, rcInvalid);
824                     PLOGERR(klogErr, (klogErr, rc, "Read #$(i) do not start at 0", PLOG_U32(i), i));
825                 }
826             } else {
827                 int16_t delta = pool[i].start - pool[i - 1].length - pool[i - 1].start;
828                 /* try to close gap by failng back to default length IF:
829                    prev read was EXPECTED_BASECALL OR
830                                  EXPECTED_BASECALL_TABLE and there an BASECALL chosen
831                    AND
832                     defailt_length != 0 (was set)
833                    */
834                 if( delta != 0 &&
835                     (self->reads->spec[i - 1].coord_type == rdsp_ExpectedBaseCall_ct ||
836                      (self->reads->spec[i - 1].coord_type == rdsp_ExpectedBaseCallTable_ct && pool[i - 1].item[0] != NULL)) &&
837                     self->reads->spec[i - 1].coord.expected_basecalls.default_length > 0 ) {
838                     new_bases -= pool[i - 1].length;
839                     pool[i - 1].length = self->reads->spec[i - 1].coord.expected_basecalls.default_length;
840                     DEBUG_MSG(2, ("trying to close delta %hi for read %u: length reset default %hi\n", delta, i - 1, pool[i - 1].length));
841                     if( pool[i - 1].item[0] != NULL  && pool[i - 1].item[0]->read_group_tag != NULL ) {
842                         DEBUG_MSG(2, ("member_name '%s' reset to ''\n", *new_member_name));
843                         *new_member_name = NULL;
844                     }
845                     delta = pool[i].start - pool[i - 1].length - pool[i - 1].start;
846                     new_bases += pool[i - 1].length;
847                     seg[i - 2].len = pool[i - 1].length;
848                 }
849                 if( delta > 0 ) {
850                     rc = RC(rcSRA, rcFormatter, rcResolving, rcData, rcInconsistent);
851                     PLOGERR(klogErr, (klogErr, rc, "Gap between reads #$(p) and #$(i)", PLOG_2(PLOG_U32(p),PLOG_U32(i)), i - 1, i));
852                 } else if( delta < 0 ) {
853                     rc = RC(rcSRA, rcFormatter, rcResolving, rcData, rcInconsistent);
854                     PLOGERR(klogErr, (klogErr, rc, "Reads #$(p) and #$(i) overlap", PLOG_2(PLOG_U32(p),PLOG_U32(i)), i - 1, i));
855                 }
856             }
857             seg[i - 1].start = pool[i].start;
858             seg[i - 1].len = pool[i].length;
859             new_bases += pool[i].length;
860         }
861         if( rc == 0 && (new_bases < 0 || nbases != new_bases) ) {
862             rc = RC(rcSRA, rcFormatter, rcResolving, rcData, rcInconsistent);
863             PLOGERR(klogErr, (klogErr, rc, "total read length $(c)", PLOG_U32(c), new_bases));
864         }
865         if( rc != 0 ) {
866             char err_buf[8192], *p = err_buf;
867             size_t max = sizeof(err_buf);
868 
869             for(i = 1; i <= self->reads->nreads; i++) {
870                 int x = snprintf(p, max, "[%hd:%d, len = %hd", pool[i].start, pool[i].start + pool[i].length, pool[i].length);
871                 if( x > 0 && x < max ) {
872                     max -= x;
873                     p += x;
874                     if( self->reads->spec[i].coord_type == rdsp_ExpectedBaseCallTable_ct ) {
875                         if( pool[i].item[0] != NULL ) {
876                             if( pool[i].item[0]->read_group_tag != NULL ) {
877                                 x = snprintf(p, max, ", matched tag '%s', mismatches %d]", pool[i].item[0]->read_group_tag, pool[i].score);
878                             } else {
879                                 x = snprintf(p, max, ", mismatches %d]", pool[i].score);
880                             }
881                         } else {
882                             x = snprintf(p, max, ", no match]");
883                         }
884                     } else {
885                         x = snprintf(p, max, "]");
886                     }
887                 }
888                 if( x <= 0 || x >= max ) {
889                     max = strlen(err_buf);
890                     err_buf[max - 4] = '\0';
891                     strcat(err_buf, "... ");
892                     break;
893                 }
894                 max -= x;
895                 p += x;
896             }
897             PLOGMSG(klogErr, (klogErr, "segments: $(seg) spot length actual: $(actual), calculated $(calc)",
898                 "seg=%s,actual=%u,calc=%d", err_buf, nbases, new_bases));
899         }
900     }
901     return rc;
902 }
903 
Experiment_MemberSegSimple(const ExperimentXML * self,const char * const file_member_name,const char * const data_block_member_name,const char ** const new_member_name)904 rc_t Experiment_MemberSegSimple(const ExperimentXML* self,
905                                 const char* const file_member_name, const char* const data_block_member_name,
906                                 const char** const new_member_name)
907 {
908     rc_t rc = 0;
909     if( self == NULL || new_member_name == NULL ) {
910         rc = RC(rcSRA, rcFormatter, rcResolving, rcParam, rcNull);
911     } else {
912         switch(self->processing.barcode_rule) {
913             case eBarcodeRule_not_set:
914                 *new_member_name = data_block_member_name ? data_block_member_name : file_member_name;
915                 break;
916 
917             case eBarcodeRule_use_file_spot_name:
918                 *new_member_name = file_member_name;
919                 break;
920 
921             case eBarcodeRule_use_table_in_experiment:
922             case eBarcodeRule_ignore_barcode:
923                 *new_member_name = NULL;
924                 break;
925         }
926     }
927     return rc;
928 }
929 
930 static
Experiment_Apply_RUN_ATTRIBUTES(const ExperimentXML * cself,RunAttributes * attr,uint32_t spot_length)931 rc_t Experiment_Apply_RUN_ATTRIBUTES(const ExperimentXML* cself, RunAttributes* attr, uint32_t spot_length)
932 {
933     rc_t rc = 0;
934 
935     if( cself == NULL ) {
936         rc = RC(rcSRA, rcFormatter, rcEvaluating, rcSelf, rcNull);
937     } else if( attr != NULL ) {
938         ExperimentXML* obj = (ExperimentXML*)cself;
939 
940         obj->processing.barcode_rule = attr->barcode_rule;
941         if( attr->quality_offset != '\0' ) {
942             obj->processing.quality_offset = attr->quality_offset;
943         }
944         if( attr->quality_type != eExperimentQualityType_Undefined ) {
945             obj->processing.quality_type = attr->quality_type;
946         }
947         obj->spot_length = attr->spot_length ? attr->spot_length : spot_length;
948         if( obj->spot_length == 0 &&
949             (obj->platform->id == SRA_PLATFORM_ILLUMINA || obj->platform->id == SRA_PLATFORM_ABSOLID) ) {
950             PLOGMSG(klogWarn, (klogWarn,
951                 "SPOT_LENGTH is not found or specified as $(l)", PLOG_U32(l), obj->spot_length));
952         }
953     }
954     return rc;
955 }
956 
Experiment_Make(const ExperimentXML ** self,const KXMLDoc * doc,RunAttributes * attr)957 rc_t Experiment_Make(const ExperimentXML** self, const KXMLDoc* doc, RunAttributes* attr)
958 {
959     rc_t rc = 0;
960     ExperimentXML* obj = NULL;
961     const KXMLNodeset* ns = NULL;
962     uint32_t p_spot_length = 0, r_spot_length = 0;
963 
964     if( self == NULL || doc == NULL ) {
965         return RC(rcSRA, rcFormatter, rcConstructing, rcParam, rcNull);
966     }
967     obj = calloc(1, sizeof(*obj));
968     if( obj == NULL ) {
969         return RC(rcSRA, rcFormatter, rcConstructing, rcMemory, rcExhausted);
970     }
971     if( (rc = KXMLDocOpenNodesetRead(doc, &ns, "/EXPERIMENT_SET/EXPERIMENT | /EXPERIMENT")) == 0 ) {
972         uint32_t count = 0;
973         if( (rc = KXMLNodesetCount(ns, &count)) == 0 && count != 1 ) {
974             rc = RC(rcSRA, rcFormatter, rcConstructing, rcTag, count ? rcExcessive : rcNotFound);
975             LOGERR(klogErr, rc, "EXPERIMENT");
976         } else {
977             const KXMLNode* EXPERIMENT = NULL;
978             if( (rc = KXMLNodesetGetNodeRead(ns, &EXPERIMENT, 0)) == 0 ) {
979                 if( (rc = PlatformXML_Make(&obj->platform, EXPERIMENT, &p_spot_length)) == 0 ) {
980                     if( attr->platform != NULL ) {
981                         if( obj->platform->id != attr->platform->id ) {
982                             rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcInconsistent);
983                             LOGERR(klogErr, rc, "EXPERIMENT and RUN platforms differ");
984                         } else {
985                             PlatformXML_Whack(obj->platform);
986                             obj->platform = attr->platform;
987                             /* take ownership */
988                             attr->platform = NULL;
989                         }
990                     }
991                     if( rc == 0 ) {
992                         if( attr->reads == NULL ) {
993                             if( (rc = ReadSpecXML_Make(&obj->reads, EXPERIMENT, "DESIGN/SPOT_DESCRIPTOR", &r_spot_length)) != 0 ) {
994                                 LOGERR(klogErr, rc, "EXPERIMENT/.../READ_SPEC");
995                             }
996                         } else {
997                             obj->reads = attr->reads;
998                             /* take ownership */
999                             attr->reads = NULL;
1000                         }
1001                         if( rc == 0 && (rc = parse_POOL(EXPERIMENT, obj)) == 0 ) {
1002                             rc = Experiment_Apply_RUN_ATTRIBUTES(obj, attr, r_spot_length ? r_spot_length : p_spot_length);
1003                         }
1004                         if( rc == 0 && p_spot_length != 0 && r_spot_length != 0 && p_spot_length != r_spot_length ) {
1005                             PLOGMSG(klogWarn, (klogWarn,
1006                                 "in EXPERIMENT sequence length (cycle count) in PLATFORM $(p) differ"
1007                                 " from SPOT_LENGTH $(l) in SPOT_DECODE_SPEC",
1008                                 PLOG_2(PLOG_U32(p),PLOG_U32(l)), p_spot_length, r_spot_length));
1009                         }
1010                     }
1011                 } else {
1012                     LOGERR(klogErr, rc, "EXPERIMENT/PLATFORM");
1013                 }
1014                 KXMLNodeRelease(EXPERIMENT);
1015             }
1016         }
1017         KXMLNodesetRelease(ns);
1018     }
1019     if( rc != 0 ) {
1020         *self = NULL;
1021         Experiment_Whack(obj);
1022     } else {
1023         *self = obj;
1024     }
1025     return rc;
1026 }
1027 
Experiment_GetPlatform(const ExperimentXML * cself,const PlatformXML ** platform)1028 rc_t Experiment_GetPlatform(const ExperimentXML* cself, const PlatformXML** platform)
1029 {
1030     rc_t rc = 0;
1031     if( cself == NULL || platform == NULL ) {
1032         rc = RC(rcSRA, rcFormatter, rcAccessing, rcParam, rcNull);
1033     } else {
1034         *platform = cself->platform;
1035     }
1036     return rc;
1037 }
1038 
Experiment_GetProcessing(const ExperimentXML * cself,const ProcessingXML ** processing)1039 rc_t Experiment_GetProcessing(const ExperimentXML* cself, const ProcessingXML** processing)
1040 {
1041     rc_t rc = 0;
1042     if( cself == NULL || processing == NULL ) {
1043         rc = RC(rcSRA, rcFormatter, rcAccessing, rcParam, rcNull);
1044     } else {
1045         *processing = &cself->processing;
1046     }
1047     return rc;
1048 }
1049 
Experiment_GetReadNumber(const ExperimentXML * cself,uint8_t * nreads)1050 rc_t Experiment_GetReadNumber(const ExperimentXML* cself, uint8_t* nreads)
1051 {
1052     rc_t rc = 0;
1053     if( cself == NULL || nreads == NULL ) {
1054         rc = RC(rcSRA, rcFormatter, rcAccessing, rcParam, rcNull);
1055     } else {
1056         *nreads = cself->reads->nreads;
1057     }
1058     return rc;
1059 }
1060 
Experiment_GetSpotLength(const ExperimentXML * cself,uint32_t * spot_length)1061 rc_t Experiment_GetSpotLength(const ExperimentXML* cself, uint32_t* spot_length)
1062 {
1063     rc_t rc = 0;
1064     if( cself == NULL || spot_length == NULL ) {
1065         rc = RC(rcSRA, rcFormatter, rcAccessing, rcParam, rcNull);
1066     } else {
1067         *spot_length= cself->spot_length;
1068     }
1069     return rc;
1070 }
1071 
Experiment_GetRead(const ExperimentXML * cself,uint8_t read_id,const ReadSpecXML_read ** read_spec)1072 rc_t Experiment_GetRead(const ExperimentXML* cself, uint8_t read_id, const ReadSpecXML_read** read_spec)
1073 {
1074     rc_t rc = 0;
1075     if( cself == NULL || read_spec == NULL ) {
1076         rc = RC(rcSRA, rcFormatter, rcAccessing, rcParam, rcNull);
1077     } else if( read_id >= cself->reads->nreads ) {
1078         rc = RC(rcSRA, rcFormatter, rcAccessing, rcId, rcOutofrange);
1079     } else {
1080         *read_spec = &(cself->reads->reads[read_id]);
1081     }
1082     return rc;
1083 }
1084 
1085 typedef struct PoolMember_FindByTag_Data_struct {
1086     /* in */
1087     const char* tag;
1088     uint8_t readid;
1089     /* out */
1090     const char* member_name;
1091 } PoolMember_FindByTag_Data;
1092 
1093 static
PoolMember_FindByTag(BSTNode * node,void * data)1094 bool CC PoolMember_FindByTag(BSTNode *node, void *data)
1095 {
1096     PoolMember_FindByTag_Data* d = (PoolMember_FindByTag_Data*)data;
1097     const PoolMember* n = (const PoolMember*)node;
1098     int i = 0;
1099 
1100     const ReadSpecXML_read_BASECALL* bc = n->spec[d->readid].item[i];
1101     while( bc != NULL ) {
1102         if( strcmp(bc->read_group_tag, d->tag) == 0 ) {
1103             d->member_name = n->name;
1104             return true;
1105         }
1106         bc = n->spec[d->readid].item[++i];
1107     }
1108     return false;
1109 }
1110 
Experiment_FindReadInTable(const ExperimentXML * cself,uint8_t read_id,const char * key,const char ** basecall,const char ** member_name)1111 rc_t Experiment_FindReadInTable(const ExperimentXML* cself, uint8_t read_id, const char* key, const char** basecall, const char** member_name)
1112 {
1113     rc_t rc = 0;
1114     if( cself == NULL || key == NULL || (basecall == NULL && member_name == NULL) ) {
1115         rc = RC(rcSRA, rcFormatter, rcAccessing, rcParam, rcNull);
1116     } else if( read_id >= cself->reads->nreads ) {
1117         rc = RC(rcSRA, rcFormatter, rcAccessing, rcId, rcOutofrange);
1118     } else {
1119         const ReadSpecXML_read* read_spec = &(cself->reads->reads[read_id++]);
1120         if( read_spec->coord_type != rdsp_ExpectedBaseCallTable_ct ) {
1121             rc = RC(rcSRA, rcFormatter, rcAccessing, rcName, rcNotFound);
1122         } else {
1123             ExpectedTableMatch match;
1124 
1125             rc = RC(rcSRA, rcFormatter, rcAccessing, rcName, rcNotFound);
1126             /* try to find key as basecall */
1127             match.seq = key;
1128             match.len = strlen(key);
1129             match.tag = NULL;
1130             match.direction = etm_Both;
1131             match.left_adjusted = true;
1132             if( Experiment_ExpectedTableMatch(&read_spec->coord.expected_basecalls, &match) ) {
1133                 rc = 0;
1134                 if( basecall != NULL ) {
1135                     *basecall = match.bc->basecall;
1136                 }
1137                 if( member_name != NULL ) {
1138                     if( cself->member_pool != NULL ) {
1139                         PoolMember_FindByTag_Data data;
1140                         data.readid = read_id;
1141                         data.tag = match.bc->read_group_tag;
1142                         if( BSTreeDoUntil(cself->member_pool, false, PoolMember_FindByTag, &data) ) {
1143                             *member_name = data.member_name;
1144                         } else {
1145                             rc = RC(rcSRA, rcFormatter, rcAccessing, rcSelf, rcCorrupt);
1146                         }
1147                     } else {
1148                         *member_name = NULL;
1149                     }
1150                 }
1151             } else if( cself->member_pool != NULL ) {
1152                 /* try to find key as member_name */
1153                 PoolMember* pm = (PoolMember*)BSTreeFind(cself->member_pool, key, PoolMember_FindByName);
1154                 if( pm != NULL ) {
1155                     rc = 0;
1156                     if( member_name != NULL ) {
1157                         *member_name = pm->name;
1158                     }
1159                     if( basecall != NULL ) {
1160                         if( pm->spec[read_id].item[0] != NULL ) {
1161                             *basecall = pm->spec[read_id].item[0]->basecall;
1162                             if( pm->spec[read_id].item[1] != NULL ) {
1163                                 rc = RC(rcSRA, rcFormatter, rcAccessing, rcData, rcAmbiguous);
1164                             }
1165                         } else {
1166                             rc = RC(rcSRA, rcFormatter, rcAccessing, rcData, rcViolated);
1167                         }
1168                     }
1169                 }
1170             }
1171         }
1172     }
1173     return rc;
1174 }
1175 
Experiment_HasPool(const ExperimentXML * cself,bool * has_pool)1176 rc_t Experiment_HasPool(const ExperimentXML* cself, bool* has_pool)
1177 {
1178     rc_t rc = 0;
1179     if( cself == NULL || has_pool == NULL ) {
1180         rc = RC(rcSRA, rcFormatter, rcAccessing, rcParam, rcNull);
1181     } else {
1182         *has_pool = cself->member_pool != NULL;
1183     }
1184     return rc;
1185 }
1186 
Experiment_ReadSegDefault(const ExperimentXML * self,SRASegment * seg)1187 rc_t Experiment_ReadSegDefault(const ExperimentXML* self, SRASegment* seg)
1188 {
1189     rc_t rc = 0;
1190 
1191     /* TBD to do memberseg call based on junk seq of known length */
1192     if( self == NULL || seg == NULL ) {
1193         rc = RC(rcSRA, rcFormatter, rcResolving, rcParam, rcNull);
1194     } else if( self->spot_length == 0 ) {
1195         rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcUnsupported);
1196     } else {
1197         uint32_t i;
1198         int32_t spot_len = self->spot_length;
1199 
1200         i = self->reads->nreads;
1201         do {
1202             int16_t len = 0;
1203             --i;
1204             switch(self->reads->reads[i].coord_type) {
1205                 case rdsp_RelativeOrder_ct:
1206                     if( i == 0 ) {
1207                         len = spot_len;
1208                     } else {
1209                         len = 0;
1210                     }
1211                     break;
1212                 case rdsp_BaseCoord_ct:
1213                 case rdsp_CycleCoord_ct:
1214                     len = spot_len - (self->reads->reads[i].coord.start_coord - 1);
1215                     break;
1216                 case rdsp_ExpectedBaseCall_ct:
1217                 case rdsp_ExpectedBaseCallTable_ct:
1218                     if( self->reads->reads[i].coord.expected_basecalls.base_coord > 0 ) {
1219                         len = spot_len - (self->reads->reads[i].coord.expected_basecalls.base_coord - 1);
1220                     }
1221                     len += self->reads->reads[i].coord.expected_basecalls.default_length;
1222                     break;
1223                 default:
1224                     rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcUnexpected);
1225                     LOGERR(klogErr, rc, "read type in default");
1226                     break;
1227             }
1228             spot_len -= len;
1229             if( spot_len < 0 || len < 0 ) {
1230                 rc = RC(rcSRA, rcFormatter, rcConstructing, rcData, rcInconsistent);
1231                 LOGERR(klogErr, rc, "cumulative read lengths and SEQUENCE_LENGTH");
1232                 return rc;
1233             } else {
1234                 seg[i].start = spot_len;
1235                 seg[i].len = len;
1236             }
1237         } while( i > 0 );
1238     }
1239     return rc;
1240 }
1241 
Experiment_Whack(const ExperimentXML * cself)1242 void Experiment_Whack(const ExperimentXML* cself)
1243 {
1244     if( cself != NULL ) {
1245         ExperimentXML* self = (ExperimentXML*)cself;
1246 
1247         free(self->processing.quality_scorer);
1248         ReadSpecXML_Whack(self->reads);
1249         BSTreeWhack(self->member_pool, PoolMember_Whack, NULL);
1250         free(self->member_pool);
1251         PoolMember_Whack(&self->member_null->node, NULL);
1252         PlatformXML_Whack(self->platform);
1253         free(self);
1254     }
1255 }
1256