1 /*=========================================================================== 2 * 3 * PUBLIC DOMAIN NOTICE 4 * National Center for Biotechnology Information 5 * 6 * This software/database is a "United States Government Work" under the 7 * terms of the United States Copyright Act. It was written as part of 8 * the author's official duties as a United States Government employee and 9 * thus cannot be copyrighted. This software/database is freely available 10 * to the public for use. The National Library of Medicine and the U.S. 11 * Government have not placed any restriction on its use or reproduction. 12 * 13 * Although all reasonable efforts have been taken to ensure the accuracy 14 * and reliability of the software and data, the NLM and the U.S. 15 * Government do not and cannot warrant the performance or results that 16 * may be obtained by using this software or data. The NLM and the U.S. 17 * Government disclaim all warranties, express or implied, including 18 * warranties of performance, merchantability or fitness for any particular 19 * purpose. 20 * 21 * Please cite the author in any work or product based on this material. 22 * 23 * =========================================================================== 24 * 25 */ 26 #ifndef _sra_load_common_xml_ 27 #define _sra_load_common_xml_ 28 29 #include <kxml/xml.h> 30 #include <search/grep.h> 31 #include <sra/sradb.h> 32 33 #define SRALOAD_MAX_READS 32 34 35 rc_t XMLNode_get_strnode(const KXMLNode* node, const char* child, bool optional, char** value); 36 37 typedef struct PlatformXML_struct { 38 SRAPlatforms id; 39 union { 40 struct { 41 char* key_sequence; /* optional */ 42 char* flow_sequence; /* optional */ 43 uint32_t flow_count; /* optional */ 44 } ls454; 45 46 struct { 47 char* key_sequence; /* optional */ 48 char* flow_sequence; /* optional */ 49 uint32_t flow_count; /* optional */ 50 } ion_torrent; 51 52 struct { 53 char* flow_sequence; /* optional */ 54 uint32_t flow_count; /* optional */ 55 } helicos; 56 } param; 57 } PlatformXML; 58 59 rc_t PlatformXML_Make(const PlatformXML** cself, const KXMLNode* node, uint32_t* spot_length); 60 61 void PlatformXML_Whack(const PlatformXML* cself); 62 63 typedef struct ReadSpecXML_read_BASECALL_struct { 64 char* basecall; 65 char* read_group_tag; 66 uint32_t min_match; 67 uint32_t max_mismatch; 68 enum { 69 match_edge_Full = 1, 70 match_edge_Start, 71 match_edge_End 72 } match_edge; 73 Agrep* agrep; 74 } ReadSpecXML_read_BASECALL; 75 76 typedef struct ReadSpecXML_read_BASECALL_TABLE_struct { 77 uint32_t default_length; 78 uint32_t base_coord; 79 ReadSpecXML_read_BASECALL* table; 80 uint32_t size; /* allocated structures qty */ 81 uint32_t count; /* used structures qty */ 82 bool pooled; /* true disables search if member is present from run */ 83 uint16_t match_start; /* length of longest bc with match_edge="start" */ 84 uint16_t match_end; /* length of longest bc with match_edge="end" */ 85 } ReadSpecXML_read_BASECALL_TABLE; 86 87 typedef struct ReadSpecXML_read_struct { 88 char* read_label; /* asciiz */ 89 SRAReadTypes read_class; 90 enum { 91 rdsp_Forward_rt = 1, 92 rdsp_Reverse_rt, 93 rdsp_Adapter_rt, 94 rdsp_Primer_rt, 95 rdsp_Linker_rt, 96 rdsp_BarCode_rt, 97 rdsp_Other_rt 98 } read_type; 99 100 enum { 101 /* order is important !!! */ 102 rdsp_FIXED_BRACKET_ct = 1, /* special fixed size type */ 103 rdsp_RelativeOrder_ct, 104 rdsp_BaseCoord_ct, 105 rdsp_CycleCoord_ct, 106 rdsp_ExpectedBaseCall_ct, 107 rdsp_ExpectedBaseCallTable_ct 108 } coord_type; 109 110 union { 111 struct { 112 int16_t follows; 113 int16_t precedes; 114 } relative_order; 115 /* starting position for *_COORD types */ 116 int16_t start_coord; 117 /* EXPECTED_BASECALL is a table of 1 element, unless IUPAC is used in values */ 118 ReadSpecXML_read_BASECALL_TABLE expected_basecalls; 119 } coord; 120 } ReadSpecXML_read; 121 122 typedef struct ReadSpecXML_struct { 123 uint32_t nreads; 124 ReadSpecXML_read spec[SRALOAD_MAX_READS + 2]; 125 ReadSpecXML_read* reads; 126 } ReadSpecXML; 127 128 rc_t ReadSpecXML_Make(const ReadSpecXML** cself, const KXMLNode* node, const char* path, uint32_t* spot_length); 129 130 void ReadSpecXML_Whack(const ReadSpecXML* cself); 131 132 typedef enum { 133 eExperimentQualityType_Undefined = 0, 134 eExperimentQualityType_Phred, 135 eExperimentQualityType_LogOdds, 136 eExperimentQualityType_Other 137 } ExperimentQualityType; 138 139 typedef enum { 140 eExperimentQualityEncoding_Undefined = 0, 141 eExperimentQualityEncoding_Ascii, 142 eExperimentQualityEncoding_Decimal, 143 eExperimentQualityEncoding_Hexadecimal 144 } ExperimentQualityEncoding; 145 146 /* 147 148 The value 'default' is same as '' or NULL in terms coding and represents default group (member) in SRA. 149 150 We use value of these data elements as SPOT_GROUP (barcode, member, etc) column value. 151 Rules (whichever is present in order of importance): 152 153 1. Run.xml: /RUN/DATA_BLOCK/ attribute member_name. (must have POOL to set read lengths). 154 2. Barcode in spot name in file data. (if not explicitly chosen [below] must have pool, otherwise fail)(must have POOL to set read lengths). 155 3. Experiment.xml: /EXPERIMENT/DESIGN/SAMPLE_DESCRIPTOR/POOL elements MEMBER. Match subsequences to determine SPOT_GROUP value. 156 4. No SPOT_GROUP column 157 158 Than <RUN_ATTRIBUTE> with <TAG> case-insensitive value of read_name_barcode_proc_directive comes in with case-insensitive <VALUE> values: 159 160 1. RUN_ATTRIBUTE not present at all use rules above. 161 2. 'interpret_as_spotgroup' (use_file_spot_name) use data from files spot names. Force rule #2. (new addition) 162 3. use_table_in_experiment use POOL table from experiment.xml. Force rule #3. 163 4. ignore do not write anything to SPOT_GROUP column. Force rule #4 164 165 */ 166 167 typedef enum { 168 eBarcodeRule_not_set = 0, 169 eBarcodeRule_use_file_spot_name, 170 eBarcodeRule_use_table_in_experiment, 171 eBarcodeRule_ignore_barcode 172 } ExperimentBarcodeRule; 173 174 typedef struct RunAttributes_struct { 175 uint32_t spot_length; 176 const PlatformXML* platform; 177 const ReadSpecXML* reads; 178 ExperimentBarcodeRule barcode_rule; 179 ExperimentQualityType quality_type; 180 uint8_t quality_offset; 181 } RunAttributes; 182 183 #endif /* _sra_load_common_xml_ */ 184