1 /* fta_parser.h 2 * 3 * =========================================================================== 4 * 5 * PUBLIC DOMAIN NOTICE 6 * National Center for Biotechnology Information 7 * 8 * This software/database is a "United States Government Work" under the 9 * terms of the United States Copyright Act. It was written as part of 10 * the author's official duties as a United States Government employee and 11 * thus cannot be copyrighted. This software/database is freely available 12 * to the public for use. The National Library of Medicine and the U.S. 13 * Government have not placed any restriction on its use or reproduction. 14 * 15 * Although all reasonable efforts have been taken to ensure the accuracy 16 * and reliability of the software and data, the NLM and the U.S. 17 * Government do not and cannot warrant the performance or results that 18 * may be obtained by using this software or data. The NLM and the U.S. 19 * Government disclaim all warranties, express or implied, including 20 * warranties of performance, merchantability or fitness for any particular 21 * purpose. 22 * 23 * Please cite the author in any work or product based on this material. 24 * 25 * =========================================================================== 26 * 27 * File Name: fta_parser.h 28 * 29 * Author: Karl Sirotkin, Hsiu-Chuan Chen, Alexey Dobronadezhdin 30 * 31 * File Description: 32 * ----------------- 33 */ 34 35 #ifndef __FLATFILE_PARSE_INFO__ 36 #define __FLATFILE_PARSE_INFO__ 37 38 #include <list> 39 #include <objects/seqset/Seq_entry.hpp> 40 41 BEGIN_NCBI_SCOPE 42 43 // some forward declarations 44 struct FileBuf { 45 const char* start=nullptr; 46 const char* current=nullptr; 47 }; 48 struct indexblk_struct; 49 struct protein_block; 50 struct _fta_operon; 51 52 53 typedef struct indexblk_struct* IndexblkPtr; 54 typedef struct protein_block* ProtBlkPtr; 55 typedef struct _fta_operon* FTAOperonPtr; 56 57 using TEntryList = list<CRef<objects::CSeq_entry>>; 58 59 struct Parser { 60 61 enum class EOutput { 62 BioseqSet, 63 Seqsubmit 64 }; 65 66 enum class EMode { 67 Release, 68 HTGS, 69 HTGSCON, 70 Relaxed 71 }; 72 73 enum class ESource { 74 unknown, 75 NCBI, 76 EMBL, 77 GenBank, 78 DDBJ, 79 LANL, 80 PIR, 81 SPROT, 82 PRF, 83 Refseq, 84 Flybase, 85 USPTO, 86 All 87 }; 88 89 enum class EFormat { 90 unknown, 91 EMBL, 92 GenBank, 93 PIR, 94 SPROT, 95 DDBJ, 96 PRF, 97 XML, 98 ALL 99 }; 100 101 102 Int4 indx=0; /* total number of records in the 103 flat file, exclude BadLocusName entries */ 104 IndexblkPtr* entrylist=nullptr; /* a pointer points to the index block */ 105 Int4 curindx=0; /* current index of the entrylist */ 106 107 /* all the files will be produced in the directory where the program was 108 * executed except the input file which located in the argument path 109 */ 110 FileBuf ffbuf; 111 112 string release_str; 113 string authors_str; 114 115 TEntryList entries; 116 117 /* next 4 + 3 variables record data from command arguments 118 */ 119 Int4 limit=0; /* limit to sequence length. 120 As of June, 2004 sequence length 121 limitation removed. This variable 122 will be always 0 */ 123 EFormat format=EFormat::unknown; /* flat file format */ 124 ESource source=ESource::unknown; /* source of flat file */ 125 bool all=false; /* any source of flat file */ 126 Uint1 seqtype=0; /* sequence type based on source 127 of flat file */ 128 Int4 num_drop=0; /* number of entries with foregn 129 acc# (dropped) */ 130 const char *acprefix=nullptr; /* decide the drop value, s.t. 131 checking the prefix character of 132 the accession number, an option 133 user provided from the command 134 line argument */ 135 Uint1 entrez_fetch=0; /* PUBSEQBioseqFetchEnable() 136 0 - do not need this connection; 137 1 - need it and got it; 138 2 - need it and failed, will 139 reconnect */ 140 Uint1 taxserver=0; /* if != 0, call TaxArchInit() */ 141 ProtBlkPtr pbp=nullptr; /* for processing nucleic acid 142 protein sequence */ 143 Uint1 medserver=0; /* == 1, if MedArchInit() call 144 succeeded */ 145 146 struct SFindPubOptions { 147 bool always_look=true; /* if TRUE, look up even if muid in 148 Pub-equiv */ 149 bool replace_cit=true; /* if TRUE, replace Cit-art w/ replace 150 from MEDLINE */ 151 int lookups_attempted; /* citartmatch tries */ 152 int lookups_succeeded; /* citartmatch worked */ 153 int fetches_attempted; /* FetchPubs tried */ 154 int fetches_succeeded; /* FetchPubs that worked */ 155 bool merge_ids = true; /* If TRUE then merges Cit-art.ids from 156 input Cit-sub and one gotten from 157 med server. */ 158 }; 159 160 161 SFindPubOptions fpo; /* for medline uid lookup */ 162 bool date=false; /* if TRUE, replace update date 163 from LOCUS */ 164 bool no_date=false; /* if TRUE, if no update and curr 165 date come out */ 166 bool citat=false; /* if TRUE, removes serial-numbers */ 167 bool transl=false; /* if TRUE program replaces translation */ 168 bool sort=false; /* if TRUE, program doesn't sort entries */ 169 bool debug=false; /* output everthing */ 170 bool segment=false; /* treat the input file as segment in embl format */ 171 bool no_code=false; /* no genetic code from server try to guess */ 172 bool seg_acc=false; /* use accession for segmented set Id */ 173 bool convert=false; /* convert to new asn.1 spec (ver. 4.0) */ 174 char** accpref=nullptr; /* a list of allowable 2-letter 175 prefixes in new format of accession 176 numbers 2 letters + 6 digits */ 177 bool accver=false; /* ACCESSION.VERSION */ 178 bool histacc=false; /* Populate Seq-inst.hist.replaces with secondaries */ 179 bool ign_toks=false; /* Ignore multiple tokens in DDBJ's VERSION line. Default = FALSE */ 180 bool ign_prot_src=false; /* If set to TRUE, then does not reject record if protein accession 181 prefix does not fit sequence owner */ 182 bool ign_bad_qs=false; /* If TRUE, then does not reject the record with bad quality score */ 183 EMode mode=EMode::Release; /* Known so far: RELEASE and HTGS. For now only difference between 184 severity of error messages. */ 185 bool diff_lt=false; /* If TRUE, then will allow to have same genes with different 186 locus_tags. Default is FALSE. */ 187 Int4 errstat=0; /* Just a temporary storage */ 188 bool allow_uwsec=false; /* Allows unusual secondary WGS accessions with prefixes not 189 matching the primary one */ 190 FTAOperonPtr operon=nullptr; 191 bool xml_comp=false; /* INSDSeq/GenBank/EMBL compatible */ 192 bool sp_dt_seq_ver=true; /* For SwissProt "Reviewed" records 193 only: puts the sequence version 194 number from "sequence version" DT 195 line into Seq-id.version slot */ 196 bool simple_genes=false; /* If set to TRUE, then will always 197 merge join locations to the single 198 ones while generating genes */ 199 Int4 cleanup=0; /* pick the required cleanup function: 200 0 - legacy parser version of SSEC; 201 1 - SSEC; 202 2 - none. 203 Default is 0. */ 204 bool allow_crossdb_featloc=false; 205 bool genenull=false; 206 const char* qsfile=nullptr; /* Do not free, just a pointer */ 207 208 209 FILE* qsfd=nullptr; 210 bool qamode=false; 211 char* buf=nullptr; /* Temporary storage for locations checks */ 212 EOutput output_format=EOutput::BioseqSet; /* Bioseq-set or Seq-submit */ 213 214 // buffer based parsing 215 bool ffdb=false; /* Use FlatFile database */ 216 bool farseq=false; 217 void* user_data=nullptr; 218 char*(*ff_get_entry)(const char* accession)=nullptr; 219 char*(*ff_get_entry_v)(const char* accession, Int2 vernum)=nullptr; 220 char*(*ff_get_qscore)(const char* accession, Int2 v)=nullptr; 221 char*(*ff_get_qscore_pp)(const char* accession, Int2 v, Parser *pp)=nullptr; 222 char*(*ff_get_entry_pp)(const char* accession, Parser *pp)=nullptr; 223 char*(*ff_get_entry_v_pp)(const char* accession, Int2 vernum, Parser *pp)=nullptr; 224 225 virtual ~Parser(); 226 }; 227 228 using ParserPtr = Parser*; 229 230 /**************************************************************************/ 231 void fta_init_pp(Parser& pp); 232 233 END_NCBI_SCOPE 234 235 #endif 236