1 /* fta_parser.h
2  *
3  * ===========================================================================
4  *
5  *                            PUBLIC DOMAIN NOTICE
6  *               National Center for Biotechnology Information
7  *
8  *  This software/database is a "United States Government Work" under the
9  *  terms of the United States Copyright Act.  It was written as part of
10  *  the author's official duties as a United States Government employee and
11  *  thus cannot be copyrighted.  This software/database is freely available
12  *  to the public for use. The National Library of Medicine and the U.S.
13  *  Government have not placed any restriction on its use or reproduction.
14  *
15  *  Although all reasonable efforts have been taken to ensure the accuracy
16  *  and reliability of the software and data, the NLM and the U.S.
17  *  Government do not and cannot warrant the performance or results that
18  *  may be obtained by using this software or data. The NLM and the U.S.
19  *  Government disclaim all warranties, express or implied, including
20  *  warranties of performance, merchantability or fitness for any particular
21  *  purpose.
22  *
23  *  Please cite the author in any work or product based on this material.
24  *
25  * ===========================================================================
26  *
27  * File Name:  fta_parser.h
28  *
29  * Author: Karl Sirotkin, Hsiu-Chuan Chen, Alexey Dobronadezhdin
30  *
31  * File Description:
32  * -----------------
33  */
34 
35 #ifndef  __FLATFILE_PARSE_INFO__
36 #define  __FLATFILE_PARSE_INFO__
37 
38 #include <list>
39 #include <objects/seqset/Seq_entry.hpp>
40 
41 BEGIN_NCBI_SCOPE
42 
// FileBuf definition, followed by some forward declarations
/* Pair of read-only character pointers; both are null until assigned. */
struct FileBuf {
    const char* start{nullptr};
    const char* current{nullptr};
};
48 struct indexblk_struct;
49 struct protein_block;
50 struct _fta_operon;
51 
52 
53 typedef struct indexblk_struct* IndexblkPtr;
54 typedef struct protein_block* ProtBlkPtr;
55 typedef struct _fta_operon* FTAOperonPtr;
56 
57 using TEntryList = list<CRef<objects::CSeq_entry>>;
58 
59 struct Parser {
60 
61     enum class EOutput {
62         BioseqSet,
63         Seqsubmit
64     };
65 
66     enum class EMode {
67         Release,
68         HTGS,
69         HTGSCON,
70         Relaxed
71     };
72 
73     enum class ESource {
74         unknown,
75         NCBI,
76         EMBL,
77         GenBank,
78         DDBJ,
79         LANL,
80         PIR,
81         SPROT,
82         PRF,
83         Refseq,
84         Flybase,
85         USPTO,
86         All
87     };
88 
89     enum class EFormat {
90         unknown,
91         EMBL,
92         GenBank,
93         PIR,
94         SPROT,
95         DDBJ,
96         PRF,
97         XML,
98         ALL
99     };
100 
101 
102     Int4 indx=0;                          /* total number of records in the
103                                            flat file, exclude BadLocusName entries */
104     IndexblkPtr* entrylist=nullptr;     /* a pointer points to the index block */
105     Int4 curindx=0;                     /* current index of the entrylist */
106 
107     /* all the files will be produced in the directory where the program was
108      * executed except the input file which located in the argument path
109      */
110     FileBuf       ffbuf;
111 
112     string      release_str;
113     string      authors_str;
114 
115     TEntryList       entries;
116 
117     /* next 4 + 3 variables record data from command arguments
118      */
119     Int4 limit=0;                     /* limit to sequence length.
120                                          As of June, 2004 sequence length
121                                          limitation removed. This variable
122                                          will be always 0 */
123     EFormat format=EFormat::unknown;  /* flat file format */
124     ESource source=ESource::unknown;  /* source of flat file */
125     bool    all=false;                /* any source of flat file */
126     Uint1   seqtype=0;                /* sequence type based on source
127                                          of flat file */
128     Int4 num_drop=0;                  /* number of entries with foregn
129                                          acc# (dropped) */
130     const char *acprefix=nullptr;     /* decide the drop value, s.t.
131                                          checking the prefix character of
132                                          the accession number, an option
133                                          user provided from the command
134                                          line argument */
135     Uint1 entrez_fetch=0;             /* PUBSEQBioseqFetchEnable()
136                                          0 - do not need this connection;
137                                          1 - need it and got it;
138                                          2 - need it and failed, will
139                                          reconnect */
140     Uint1 taxserver=0;                /* if != 0, call TaxArchInit() */
141     ProtBlkPtr pbp=nullptr;           /* for processing nucleic acid
142                                          protein sequence */
143     Uint1 medserver=0;                /* == 1, if MedArchInit() call
144                                          succeeded */
145 
146     struct SFindPubOptions {
147         bool    always_look=true;       /* if TRUE, look up even if muid in
148                                            Pub-equiv */
149         bool    replace_cit=true;       /* if TRUE, replace Cit-art w/ replace
150                                            from MEDLINE */
151         int lookups_attempted;           /* citartmatch tries */
152         int lookups_succeeded;           /* citartmatch worked */
153         int  fetches_attempted;          /* FetchPubs tried */
154         int  fetches_succeeded;          /* FetchPubs that worked */
155         bool merge_ids = true;           /* If TRUE then merges Cit-art.ids from
156                                             input Cit-sub and one gotten from
157                                             med server. */
158     };
159 
160 
161     SFindPubOptions  fpo;         /* for medline uid lookup */
162     bool date=false;              /* if TRUE, replace update date
163                                      from LOCUS */
164     bool no_date=false;           /* if TRUE, if no update and curr
165                                      date come out */
166     bool citat=false;             /* if TRUE, removes serial-numbers */
167     bool transl=false;            /* if TRUE program replaces translation */
168     bool sort=false;              /* if TRUE, program doesn't sort entries */
169     bool debug=false;             /* output everthing */
170     bool segment=false;           /* treat the input file as segment in embl format */
171     bool no_code=false;           /* no genetic code from server try to guess */
172     bool seg_acc=false;           /* use accession for segmented set Id */
173     bool convert=false;           /* convert to new asn.1 spec (ver. 4.0) */
174     char** accpref=nullptr;       /* a list of allowable 2-letter
175                                      prefixes in new format of accession
176                                     numbers 2 letters + 6 digits */
177     bool accver=false;            /* ACCESSION.VERSION */
178     bool histacc=false;           /* Populate Seq-inst.hist.replaces with secondaries */
179     bool ign_toks=false;          /* Ignore multiple tokens in DDBJ's VERSION line. Default = FALSE */
180     bool ign_prot_src=false;      /* If set to TRUE, then does not reject record if protein accession
181                                      prefix does not fit sequence owner */
182     bool ign_bad_qs=false;        /* If TRUE, then does not reject the record with bad quality score */
183     EMode mode=EMode::Release;    /* Known so far: RELEASE and HTGS. For now only difference between
184                                      severity of error messages. */
185     bool diff_lt=false;           /* If TRUE, then will allow to have same genes with different
186                                      locus_tags. Default is FALSE. */
187     Int4 errstat=0;               /* Just a temporary storage */
188     bool allow_uwsec=false;       /* Allows unusual secondary WGS accessions with prefixes not
189                                      matching the primary one */
190     FTAOperonPtr operon=nullptr;
191     bool xml_comp=false;          /* INSDSeq/GenBank/EMBL compatible */
192     bool sp_dt_seq_ver=true;      /* For SwissProt "Reviewed" records
193                                      only: puts the sequence version
194                                      number from "sequence version" DT
195                                      line into Seq-id.version slot */
196     bool simple_genes=false;      /* If set to TRUE, then will always
197                                      merge join locations to the single
198                                      ones while generating genes */
199     Int4 cleanup=0;               /* pick the required cleanup function:
200                                      0 - legacy parser version of SSEC;
201                                      1 - SSEC;
202                                      2 - none.
203                                      Default is 0. */
204     bool allow_crossdb_featloc=false;
205     bool genenull=false;
206     const char* qsfile=nullptr;   /* Do not free, just a pointer */
207 
208 
209     FILE* qsfd=nullptr;
210     bool  qamode=false;
211     char* buf=nullptr;         /* Temporary storage for locations checks */
212     EOutput output_format=EOutput::BioseqSet; /* Bioseq-set or Seq-submit */
213 
214     // buffer based parsing
215     bool ffdb=false;              /* Use FlatFile database */
216     bool farseq=false;
217     void* user_data=nullptr;
218     char*(*ff_get_entry)(const char* accession)=nullptr;
219     char*(*ff_get_entry_v)(const char* accession, Int2 vernum)=nullptr;
220     char*(*ff_get_qscore)(const char* accession, Int2 v)=nullptr;
221     char*(*ff_get_qscore_pp)(const char* accession, Int2 v, Parser *pp)=nullptr;
222     char*(*ff_get_entry_pp)(const char* accession, Parser *pp)=nullptr;
223     char*(*ff_get_entry_v_pp)(const char* accession, Int2 vernum, Parser *pp)=nullptr;
224 
225     virtual ~Parser();
226 };
227 
/* Alias for a raw pointer to Parser. */
using ParserPtr = Parser*;

/**************************************************************************/
/* Declared here only; the definition is not part of this header.
 * Takes the Parser by non-const reference and returns nothing.
 * NOTE(review): presumably (re)initializes fields of pp -- confirm against
 * the definition. */
void fta_init_pp(Parser& pp);
232 
233 END_NCBI_SCOPE
234 
235 #endif
236