/*===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 */
26 #ifndef _sra_load_common_xml_
27 #define _sra_load_common_xml_
29 #include <kxml/xml.h>
30 #include <search/grep.h>
31 #include <sra/sradb.h>
/*
 * SRALOAD_MAX_READS
 *  Sizing constant for the read descriptor array: ReadSpecXML::spec is
 *  declared with SRALOAD_MAX_READS + 2 elements.
 */
#define SRALOAD_MAX_READS 32
35 rc_t XMLNode_get_strnode(const KXMLNode* node, const char* child, bool optional, char** value);
37 typedef struct PlatformXML_struct {
38     SRAPlatforms id;
39     union {
40         struct {
41             char* key_sequence; /* optional */
42             char* flow_sequence; /* optional */
43             uint32_t flow_count; /* optional */
44         } ls454;
46         struct {
47             char* key_sequence; /* optional */
48             char* flow_sequence; /* optional */
49             uint32_t flow_count; /* optional */
50         } ion_torrent;
52         struct {
53             char* flow_sequence; /* optional */
54             uint32_t flow_count; /* optional */
55         } helicos;
56     } param;
57 } PlatformXML;
59 rc_t PlatformXML_Make(const PlatformXML** cself, const KXMLNode* node, uint32_t* spot_length);
61 void PlatformXML_Whack(const PlatformXML* cself);
63 typedef struct ReadSpecXML_read_BASECALL_struct {
64     char* basecall;
65     char* read_group_tag;
66     uint32_t min_match;
67     uint32_t max_mismatch;
68     enum {
69         match_edge_Full = 1,
70         match_edge_Start,
71         match_edge_End
72     } match_edge;
73     Agrep* agrep;
74 } ReadSpecXML_read_BASECALL;
76 typedef struct ReadSpecXML_read_BASECALL_TABLE_struct {
77     uint32_t default_length;
78     uint32_t base_coord;
79     ReadSpecXML_read_BASECALL* table;
80     uint32_t size; /* allocated structures qty */
81     uint32_t count; /* used structures qty */
82     bool pooled; /* true disables search if member is present from run */
83     uint16_t match_start; /* length of longest bc with match_edge="start" */
84     uint16_t match_end; /* length of longest bc with match_edge="end" */
85 } ReadSpecXML_read_BASECALL_TABLE;
87 typedef struct ReadSpecXML_read_struct {
88     char* read_label; /* asciiz */
89     SRAReadTypes read_class;
90     enum {
91         rdsp_Forward_rt = 1,
92         rdsp_Reverse_rt,
93         rdsp_Adapter_rt,
94         rdsp_Primer_rt,
95         rdsp_Linker_rt,
96         rdsp_BarCode_rt,
97         rdsp_Other_rt
98     } read_type;
100     enum {
101         /* order is important !!! */
102         rdsp_FIXED_BRACKET_ct = 1, /* special fixed size type */
103         rdsp_RelativeOrder_ct,
104         rdsp_BaseCoord_ct,
105         rdsp_CycleCoord_ct,
106         rdsp_ExpectedBaseCall_ct,
107         rdsp_ExpectedBaseCallTable_ct
108     } coord_type;
110     union {
111         struct {
112             int16_t follows;
113             int16_t precedes;
114         } relative_order;
115         /* starting position for *_COORD types */
116         int16_t start_coord;
117         /* EXPECTED_BASECALL is a table of 1 element, unless IUPAC is used in values */
118         ReadSpecXML_read_BASECALL_TABLE expected_basecalls;
119     } coord;
120 } ReadSpecXML_read;
122 typedef struct ReadSpecXML_struct {
123     uint32_t nreads;
124     ReadSpecXML_read spec[SRALOAD_MAX_READS + 2];
125     ReadSpecXML_read* reads;
126 } ReadSpecXML;
128 rc_t ReadSpecXML_Make(const ReadSpecXML** cself, const KXMLNode* node, const char* path, uint32_t* spot_length);
130 void ReadSpecXML_Whack(const ReadSpecXML* cself);
/*
 * ExperimentQualityType
 *  Kind of quality score. A zero-initialized value reads as Undefined.
 */
typedef enum {
    eExperimentQualityType_Undefined = 0,
    eExperimentQualityType_Phred = 1,
    eExperimentQualityType_LogOdds = 2,
    eExperimentQualityType_Other = 3
} ExperimentQualityType;
/*
 * ExperimentQualityEncoding
 *  Textual encoding of quality values. A zero-initialized value reads as
 *  Undefined.
 */
typedef enum {
    eExperimentQualityEncoding_Undefined = 0,
    eExperimentQualityEncoding_Ascii = 1,
    eExperimentQualityEncoding_Decimal = 2,
    eExperimentQualityEncoding_Hexadecimal = 3
} ExperimentQualityEncoding;
/*

The value 'default' is the same as '' or NULL in terms of coding and represents the default group (member) in SRA.

We use the value of these data elements as the SPOT_GROUP (barcode, member, etc.) column value.
Rules (whichever is present, in order of importance):

1.	Run.xml: /RUN/DATA_BLOCK/ attribute member_name. (must have POOL to set read lengths).
2.	Barcode in spot name in file data. (if not explicitly chosen [below] must have POOL, otherwise fail)(must have POOL to set read lengths).
3.	Experiment.xml: /EXPERIMENT/DESIGN/SAMPLE_DESCRIPTOR/POOL elements MEMBER. Match subsequences to determine SPOT_GROUP value.
4.	No SPOT_GROUP column.

Then <RUN_ATTRIBUTE> with <TAG> case-insensitive value of read_name_barcode_proc_directive comes in with case-insensitive <VALUE> values:

1.	RUN_ATTRIBUTE not present at all - use rules above.
2.	'interpret_as_spotgroup' (use_file_spot_name) - use data from the files' spot names. Force rule #2. (new addition)
3.	use_table_in_experiment - use POOL table from experiment.xml. Force rule #3.
4.	ignore - do not write anything to SPOT_GROUP column. Force rule #4.

*/
/*
 * ExperimentBarcodeRule
 *  Choice of how the SPOT_GROUP value is obtained, matching the
 *  read_name_barcode_proc_directive values described in the comment that
 *  precedes this enum:
 *    not_set                 - presumably: directive absent, default rules apply
 *    use_file_spot_name      - take the barcode from spot names in the files (rule #2)
 *    use_table_in_experiment - use the POOL table from experiment.xml (rule #3)
 *    ignore_barcode          - write nothing to SPOT_GROUP (rule #4)
 */
typedef enum {
    eBarcodeRule_not_set = 0,
    eBarcodeRule_use_file_spot_name = 1,
    eBarcodeRule_use_table_in_experiment = 2,
    eBarcodeRule_ignore_barcode = 3
} ExperimentBarcodeRule;
174 typedef struct RunAttributes_struct {
175     uint32_t spot_length;
176     const PlatformXML* platform;
177     const ReadSpecXML* reads;
178     ExperimentBarcodeRule barcode_rule;
179     ExperimentQualityType quality_type;
180     uint8_t quality_offset;
181 } RunAttributes;
183 #endif /* _sra_load_common_xml_ */