1 /* ftablock.h 2 * 3 * =========================================================================== 4 * 5 * PUBLIC DOMAIN NOTICE 6 * National Center for Biotechnology Information 7 * 8 * This software/database is a "United States Government Work" under the 9 * terms of the United States Copyright Act. It was written as part of 10 * the author's official duties as a United States Government employee and 11 * thus cannot be copyrighted. This software/database is freely available 12 * to the public for use. The National Library of Medicine and the U.S. 13 * Government have not placed any restriction on its use or reproduction. 14 * 15 * Although all reasonable efforts have been taken to ensure the accuracy 16 * and reliability of the software and data, the NLM and the U.S. 17 * Government do not and cannot warrant the performance or results that 18 * may be obtained by using this software or data. The NLM and the U.S. 19 * Government disclaim all warranties, express or implied, including 20 * warranties of performance, merchantability or fitness for any particular 21 * purpose. 22 * 23 * Please cite the author in any work or product based on this material. 24 * 25 * =========================================================================== 26 * 27 * File Name: ftablock.h 28 * 29 * Author: Karl Sirotkin, Hsiu-Chuan Chen 30 * 31 * File Description: 32 * ----------------- 33 */ 34 35 #ifndef _BLOCK_ 36 #define _BLOCK_ 37 38 #include <objects/seqloc/Patent_seq_id.hpp> 39 #include <objects/seqloc/Seq_id.hpp> 40 #include <objects/seqloc/Seq_loc.hpp> 41 #include <objects/seq/Linkage_evidence.hpp> 42 #include <objects/general/Date_std.hpp> 43 #include <objects/seqset/Seq_entry.hpp> 44 #include <objects/seqfeat/Seq_feat.hpp> 45 #include <objects/seqfeat/OrgMod.hpp> 46 #include <objects/seqfeat/Genetic_code.hpp> 47 #include <objects/pub/Pub.hpp> 48 #include <objects/seq/Delta_seq.hpp> 49 50 #include <objtools/flatfile/flatfile_parse_info.hpp> 51 #include "valnode.h" 52 53 BEGIN_NCBI_SCOPE 54 55 typedef std::list<CRef<objects::CSeq_feat> > TSeqFeatList; 56 typedef std::list<std::string> TAccessionList; 57 typedef std::list<CRef<objects::CSeq_id> > TSeqIdList; 58 typedef std::list<CRef<objects::COrgMod> > TOrgModList; 59 typedef std::vector<CRef<objects::CGb_qual> > TGbQualVector; 60 typedef std::list<CRef<objects::CSeqdesc> > TSeqdescList; 61 typedef std::vector<CRef<objects::CUser_object> > TUserObjVector; 62 typedef std::list<CRef<objects::CPub> > TPubList; 63 typedef std::list<CRef<objects::CSeq_loc> > TSeqLocList; 64 typedef std::list<CRef<objects::CDelta_seq> > TDeltaList; 65 66 67 #define ParFlat_ENTRYNODE 500 68 69 /* 70 #define FTA_RELEASE_MODE 0 71 #define FTA_HTGS_MODE 1 72 #define FTA_HTGSCON_MODE 2 73 */ 74 75 typedef struct info_bioseq { 76 TSeqIdList ids; /* for this Bioseq */ 77 char* locus; 78 char* acnum; 79 info_bioseqinfo_bioseq80 info_bioseq() : 81 locus(NULL), 82 acnum(NULL) 83 {} 84 85 } InfoBioseq, *InfoBioseqPtr; 86 87 typedef struct protein_block { 88 objects::CSeq_entry* biosep; /* for the toppest level of the BioseqSet */ 89 90 bool segset; /* TRUE if a BioseqSet SeqEntry */ 91 92 TEntryList entries; /* a ProtRef SeqEntry list, link to above 93 biosep */ 94 95 TSeqFeatList feats; /* a CodeRegionPtr list to link the BioseqSet 96 with class = nuc-prot */ 97 objects::CGenetic_code::C_E gcode; /* for this Bioseq */ 98 InfoBioseqPtr ibp; 99 Uint1 genome; 100 Int4 orig_gcode; 101 protein_blockprotein_block102 protein_block() : 103 biosep(nullptr), 104 segset(false), 105 ibp(NULL), 106 genome(0), 107 orig_gcode(0) 108 {} 109 110 } ProtBlk, *ProtBlkPtr; 111 112 typedef struct _locus_cont { 113 Int4 bases; 114 Int4 bp; 115 Int4 strand; 116 Int4 molecule; 117 Int4 topology; 118 Int4 div; 119 Int4 date; 120 } LocusCont, *LocusContPtr; 121 122 123 typedef struct _gap_feats { 124 Int4 from; 125 Int4 to; 126 Int4 estimated_length; 127 bool leftNs; 128 bool rightNs; 129 bool assembly_gap; 130 char* gap_type; 131 Int4 asn_gap_type; 132 133 objects::CLinkage_evidence::TLinkage_evidence asn_linkage_evidence; 134 135 struct _gap_feats *next; 136 137 _gap_feats(); 138 139 } GapFeats, *GapFeatsPtr; 140 141 typedef struct token_block { 142 char* str; /* the token string */ 143 struct token_block *next; /* points to next token */ 144 } TokenBlk, *TokenBlkPtr; 145 146 typedef struct token_statistics_block { 147 TokenBlkPtr list; /* a pointer points to the first 148 token */ 149 Int2 num; /* total number of token in the 150 chain list */ 151 } TokenStatBlk, *TokenStatBlkPtr; 152 153 typedef struct _XmlIndex { 154 Int4 tag; 155 Int4 order; 156 size_t start; /* Offset from the beginning of the 157 record, not file! */ 158 size_t end; /* Offset from the beginning of the 159 record, not file! */ 160 Int4 start_line; 161 Int4 end_line; 162 Int2 type; /* Used for references */ 163 struct _XmlIndex *subtags; 164 struct _XmlIndex *next; 165 } XmlIndex, *XmlIndexPtr; 166 167 typedef std::list<std::string> TKeywordList; 168 169 typedef struct indexblk_struct { 170 Char acnum[200]; /* accession num */ 171 Int2 vernum; /* version num */ 172 size_t offset; /* byte-offset of in the flatfile at 173 which the entry starts */ 174 Char locusname[200]; /* locus name */ 175 Char division[4]; /* division code */ 176 size_t bases; /* basepair length of the entry */ 177 Uint2 segnum; /* the number of the entry w/i a 178 segment set */ 179 Uint2 segtotal; /* total number of members in 180 segmented set to which this 181 entry belongs */ 182 Char blocusname[200]; /* base locus name s.t. w/o tailing 183 number */ 184 size_t linenum; /* line number at which the entry 185 starts */ 186 Uint1 drop; /* 1 if the accession should be 187 dropped, otherwise 0 */ 188 size_t len; /* total length (or sizes in bytes) 189 of the entry */ 190 191 CRef<objects::CDate_std> date; /* the record's entry-date or last 192 update's date */ 193 194 CRef<objects::CPatent_seq_id> psip; /* patent reference */ 195 196 bool EST; /* special EST entries */ 197 bool STS; /* special STS entries */ 198 bool GSS; /* special Genome servey entries */ 199 bool HTC; /* high throughput cDNA */ 200 Int2 htg; /* special HTG [0,1,2,3] entries */ 201 bool is_contig; /* TRUE if entry has CONTIG line, 202 otherwise FALSE */ 203 bool is_mga; /* TRUE if entry has MGA line, 204 otherwise FALSE */ 205 bool origin; /* TRUE if sequence is present */ 206 bool is_pat; /* TRUE if accession prefix is 207 patented and matches source. 208 FALSE - otherwise. */ 209 bool is_wgs; 210 bool is_tpa; 211 bool is_tsa; 212 bool is_tls; 213 bool is_tpa_wgs_con; /* TRUE if "is_contig", "is_wgs" and 214 "is_tpa" are TRUE */ 215 bool tsa_allowed; 216 LocusCont lc; 217 char* moltype; /* the value of /mol_type qual */ 218 GapFeatsPtr gaps; 219 220 // list<string> secondary_accessions; 221 TokenBlkPtr secaccs; 222 XmlIndexPtr xip; 223 bool embl_new_ID; 224 bool env_sample_qual; /* TRUE if at least one source 225 feature has /environmental_sample 226 qualifier */ 227 bool is_prot; 228 char* organism; /* The value of /organism qualifier */ 229 Int4 taxid; /* The value gotten from source feature 230 /db_xref qualifier if any */ 231 bool no_gc_warning; /* If TRUE then suppress 232 ERR_SERVER_GcFromSuppliedLineage 233 WARNING message */ 234 size_t qsoffset; 235 size_t qslength; 236 Int4 wgs_and_gi; /* 01 - has GI, 02 - WGS contig, 237 03 - both above */ 238 bool got_plastid; /* Set to TRUE if there is at least 239 one /organelle qual beginning 240 with "plastid" */ 241 Char wgssec[100]; /* Reserved buffer for WGS master or 242 project accession as secondary */ 243 Int4 gc_genomic; /* Genomic Genetic code from OrgRef */ 244 Int4 gc_mito; /* Mitochondrial Genetic code */ 245 TKeywordList keywords; /* All keywords from a flat record */ 246 bool assembly; /* TRUE for TPA:assembly in 247 KEYWORDS line */ 248 bool specialist_db; /* TRUE for TPA:specialist_db in 249 KEYWORDS line */ 250 bool inferential; /* TRUE for TPA:inferential in 251 KEYWORDS line */ 252 bool experimental; /* TRUE for TPA:experimental in 253 KEYWORDS line */ 254 char* submitter_seqid; 255 Parser *ppp; 256 257 indexblk_struct(); 258 259 } Indexblk, *IndexblkPtr; 260 261 typedef struct _fta_operon { 262 const Char* featname; /* Do not free! Just a pointer. */ 263 const Char* operon; /* Do not free! Just a pointer. */ 264 265 CConstRef<objects::CSeq_loc> location; /* Do not free! Just a pointer. */ 266 267 char* strloc; /* String value of location. */ 268 bool operon_feat; 269 bool ret; 270 struct _fta_operon *next; 271 _fta_operon_fta_operon272 _fta_operon() : 273 featname(nullptr), 274 operon(nullptr), 275 strloc(nullptr), 276 operon_feat(false), 277 ret(false), 278 next(nullptr) 279 {} 280 281 } FTAOperon, *FTAOperonPtr; 282 283 typedef struct data_block { 284 Int2 type; /* which keyword block or node type */ 285 void* data; /* any pointer type points to 286 information block */ 287 char* offset; /* points to beginning of the entry 288 in the memory */ 289 size_t len; /* lenght of data in bytes */ 290 char* qscore; /* points to quality score buffer */ 291 Uint1 drop; /* 1 if drop this data block */ 292 struct data_block *next; 293 } DataBlk, *DataBlkPtr; 294 295 typedef struct entry_block { 296 DataBlkPtr chain; /* a header points to key-word 297 block information */ 298 CRef<objects::CSeq_entry> seq_entry; /* points to sequence entry */ 299 300 struct entry_block *next; 301 entry_blockentry_block302 entry_block() : 303 chain(NULL), 304 next(NULL) 305 {} 306 307 } EntryBlk, *EntryBlkPtr; 308 309 typedef struct keyword_block { 310 const char *str; 311 Int2 len; 312 } KwordBlk, *KwordBlkPtr; 313 314 /**************************************************************************/ 315 316 void FreeDatablk(DataBlkPtr dbp); 317 void FreeEntry(DataBlkPtr entry); 318 void FreeIndexblk(IndexblkPtr ibp); 319 void GapFeatsFree(GapFeatsPtr gfp); 320 void XMLIndexFree(XmlIndexPtr xip); 321 322 void FreeEntryBlk(EntryBlkPtr entry); 323 EntryBlkPtr CreateEntryBlk(); 324 325 END_NCBI_SCOPE 326 327 #endif 328