/*===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 */
26 #ifndef _sra_load_common_xml_
27 #define _sra_load_common_xml_
28 
29 #include <kxml/xml.h>
30 #include <search/grep.h>
31 #include <sra/sradb.h>
32 
/* SRALOAD_MAX_READS
 *  Read-count limit used to size the ReadSpecXML "spec" array, which is
 *  declared with SRALOAD_MAX_READS + 2 entries.
 */
#define SRALOAD_MAX_READS 32
34 
35 rc_t XMLNode_get_strnode(const KXMLNode* node, const char* child, bool optional, char** value);
36 
37 typedef struct PlatformXML_struct {
38     SRAPlatforms id;
39     union {
40         struct {
41             char* key_sequence; /* optional */
42             char* flow_sequence; /* optional */
43             uint32_t flow_count; /* optional */
44         } ls454;
45 
46         struct {
47             char* key_sequence; /* optional */
48             char* flow_sequence; /* optional */
49             uint32_t flow_count; /* optional */
50         } ion_torrent;
51 
52         struct {
53             char* flow_sequence; /* optional */
54             uint32_t flow_count; /* optional */
55         } helicos;
56     } param;
57 } PlatformXML;
58 
59 rc_t PlatformXML_Make(const PlatformXML** cself, const KXMLNode* node, uint32_t* spot_length);
60 
61 void PlatformXML_Whack(const PlatformXML* cself);
62 
63 typedef struct ReadSpecXML_read_BASECALL_struct {
64     char* basecall;
65     char* read_group_tag;
66     uint32_t min_match;
67     uint32_t max_mismatch;
68     enum {
69         match_edge_Full = 1,
70         match_edge_Start,
71         match_edge_End
72     } match_edge;
73     Agrep* agrep;
74 } ReadSpecXML_read_BASECALL;
75 
76 typedef struct ReadSpecXML_read_BASECALL_TABLE_struct {
77     uint32_t default_length;
78     uint32_t base_coord;
79     ReadSpecXML_read_BASECALL* table;
80     uint32_t size; /* allocated structures qty */
81     uint32_t count; /* used structures qty */
82     bool pooled; /* true disables search if member is present from run */
83     uint16_t match_start; /* length of longest bc with match_edge="start" */
84     uint16_t match_end; /* length of longest bc with match_edge="end" */
85 } ReadSpecXML_read_BASECALL_TABLE;
86 
87 typedef struct ReadSpecXML_read_struct {
88     char* read_label; /* asciiz */
89     SRAReadTypes read_class;
90     enum {
91         rdsp_Forward_rt = 1,
92         rdsp_Reverse_rt,
93         rdsp_Adapter_rt,
94         rdsp_Primer_rt,
95         rdsp_Linker_rt,
96         rdsp_BarCode_rt,
97         rdsp_Other_rt
98     } read_type;
99 
100     enum {
101         /* order is important !!! */
102         rdsp_FIXED_BRACKET_ct = 1, /* special fixed size type */
103         rdsp_RelativeOrder_ct,
104         rdsp_BaseCoord_ct,
105         rdsp_CycleCoord_ct,
106         rdsp_ExpectedBaseCall_ct,
107         rdsp_ExpectedBaseCallTable_ct
108     } coord_type;
109 
110     union {
111         struct {
112             int16_t follows;
113             int16_t precedes;
114         } relative_order;
115         /* starting position for *_COORD types */
116         int16_t start_coord;
117         /* EXPECTED_BASECALL is a table of 1 element, unless IUPAC is used in values */
118         ReadSpecXML_read_BASECALL_TABLE expected_basecalls;
119     } coord;
120 } ReadSpecXML_read;
121 
122 typedef struct ReadSpecXML_struct {
123     uint32_t nreads;
124     ReadSpecXML_read spec[SRALOAD_MAX_READS + 2];
125     ReadSpecXML_read* reads;
126 } ReadSpecXML;
127 
128 rc_t ReadSpecXML_Make(const ReadSpecXML** cself, const KXMLNode* node, const char* path, uint32_t* spot_length);
129 
130 void ReadSpecXML_Whack(const ReadSpecXML* cself);
131 
/* ExperimentQualityType
 *  Kind of quality score; 0 means "not specified".
 */
typedef enum {
    eExperimentQualityType_Undefined = 0,
    eExperimentQualityType_Phred,
    eExperimentQualityType_LogOdds,
    eExperimentQualityType_Other
} ExperimentQualityType;
138 
/* ExperimentQualityEncoding
 *  Textual encoding of quality values; 0 means "not specified".
 */
typedef enum {
    eExperimentQualityEncoding_Undefined = 0,
    eExperimentQualityEncoding_Ascii,
    eExperimentQualityEncoding_Decimal,
    eExperimentQualityEncoding_Hexadecimal
} ExperimentQualityEncoding;
145 
/*

The value 'default' is the same as '' or NULL in terms of coding and represents the default group (member) in SRA.

We use the value of these data elements as the SPOT_GROUP (barcode, member, etc.) column value.
Rules (whichever is present, in order of importance):

1.	Run.xml: /RUN/DATA_BLOCK/ attribute member_name (must have POOL to set read lengths).
2.	Barcode in spot name in file data (if not explicitly chosen [below], must have POOL, otherwise fail; must have POOL to set read lengths).
3.	Experiment.xml: /EXPERIMENT/DESIGN/SAMPLE_DESCRIPTOR/POOL elements MEMBER. Match subsequences to determine SPOT_GROUP value.
4.	No SPOT_GROUP column.

Then a <RUN_ATTRIBUTE> whose <TAG> has the case-insensitive value read_name_barcode_proc_directive comes in with one of these case-insensitive <VALUE> values:

1.	RUN_ATTRIBUTE not present at all - use the rules above.
2.	'interpret_as_spotgroup' (use_file_spot_name) - use data from the files' spot names. Force rule #2. (new addition)
3.	use_table_in_experiment - use the POOL table from experiment.xml. Force rule #3.
4.	ignore - do not write anything to the SPOT_GROUP column. Force rule #4.

*/
166 
/* ExperimentBarcodeRule
 *  Explicit choice of the source for the SPOT_GROUP column value;
 *  0 means no explicit choice was made.
 */
typedef enum {
    eBarcodeRule_not_set = 0,
    eBarcodeRule_use_file_spot_name,      /* take the barcode from spot names in the data files */
    eBarcodeRule_use_table_in_experiment, /* use the POOL table from experiment.xml */
    eBarcodeRule_ignore_barcode           /* do not write anything to SPOT_GROUP */
} ExperimentBarcodeRule;
173 
174 typedef struct RunAttributes_struct {
175     uint32_t spot_length;
176     const PlatformXML* platform;
177     const ReadSpecXML* reads;
178     ExperimentBarcodeRule barcode_rule;
179     ExperimentQualityType quality_type;
180     uint8_t quality_offset;
181 } RunAttributes;
182 
183 #endif /* _sra_load_common_xml_ */
184