1 /* ftablock.h
2  *
3  * ===========================================================================
4  *
5  *                            PUBLIC DOMAIN NOTICE
6  *               National Center for Biotechnology Information
7  *
8  *  This software/database is a "United States Government Work" under the
9  *  terms of the United States Copyright Act.  It was written as part of
10  *  the author's official duties as a United States Government employee and
11  *  thus cannot be copyrighted.  This software/database is freely available
12  *  to the public for use. The National Library of Medicine and the U.S.
13  *  Government have not placed any restriction on its use or reproduction.
14  *
15  *  Although all reasonable efforts have been taken to ensure the accuracy
16  *  and reliability of the software and data, the NLM and the U.S.
17  *  Government do not and cannot warrant the performance or results that
18  *  may be obtained by using this software or data. The NLM and the U.S.
19  *  Government disclaim all warranties, express or implied, including
20  *  warranties of performance, merchantability or fitness for any particular
21  *  purpose.
22  *
23  *  Please cite the author in any work or product based on this material.
24  *
25  * ===========================================================================
26  *
27  * File Name:  ftablock.h
28  *
29  * Author: Karl Sirotkin, Hsiu-Chuan Chen
30  *
31  * File Description:
32  * -----------------
33  */
34 
35 #ifndef  _BLOCK_
36 #define  _BLOCK_
37 
38 #include <objects/seqloc/Patent_seq_id.hpp>
39 #include <objects/seqloc/Seq_id.hpp>
40 #include <objects/seqloc/Seq_loc.hpp>
41 #include <objects/seq/Linkage_evidence.hpp>
42 #include <objects/general/Date_std.hpp>
43 #include <objects/seqset/Seq_entry.hpp>
44 #include <objects/seqfeat/Seq_feat.hpp>
45 #include <objects/seqfeat/OrgMod.hpp>
46 #include <objects/seqfeat/Genetic_code.hpp>
47 #include <objects/pub/Pub.hpp>
48 #include <objects/seq/Delta_seq.hpp>
49 
50 #include <objtools/flatfile/flatfile_parse_info.hpp>
51 #include "valnode.h"
52 
53 BEGIN_NCBI_SCOPE
54 
55 typedef std::list<CRef<objects::CSeq_feat> > TSeqFeatList;
56 typedef std::list<std::string> TAccessionList;
57 typedef std::list<CRef<objects::CSeq_id> > TSeqIdList;
58 typedef std::list<CRef<objects::COrgMod> > TOrgModList;
59 typedef std::vector<CRef<objects::CGb_qual> > TGbQualVector;
60 typedef std::list<CRef<objects::CSeqdesc> > TSeqdescList;
61 typedef std::vector<CRef<objects::CUser_object> > TUserObjVector;
62 typedef std::list<CRef<objects::CPub> > TPubList;
63 typedef std::list<CRef<objects::CSeq_loc> > TSeqLocList;
64 typedef std::list<CRef<objects::CDelta_seq> > TDeltaList;
65 
66 
67 #define ParFlat_ENTRYNODE    500
68 
69 /*
70 #define FTA_RELEASE_MODE     0
71 #define FTA_HTGS_MODE        1
72 #define FTA_HTGSCON_MODE     2
73 */
74 
75 typedef struct info_bioseq {
76     TSeqIdList ids;                       /* for this Bioseq */
77     char*  locus;
78     char*  acnum;
79 
info_bioseqinfo_bioseq80     info_bioseq() :
81         locus(NULL),
82         acnum(NULL)
83     {}
84 
85 } InfoBioseq, *InfoBioseqPtr;
86 
87 typedef struct protein_block {
88     objects::CSeq_entry* biosep; /* for the toppest level of the BioseqSet */
89 
90     bool           segset;      /* TRUE if a BioseqSet SeqEntry */
91 
92     TEntryList     entries;     /* a ProtRef SeqEntry list, link to above
93                                    biosep */
94 
95     TSeqFeatList   feats;       /* a CodeRegionPtr list to link the BioseqSet
96                                    with class = nuc-prot */
97     objects::CGenetic_code::C_E gcode;         /* for this Bioseq */
98     InfoBioseqPtr  ibp;
99     Uint1          genome;
100     Int4           orig_gcode;
101 
protein_blockprotein_block102     protein_block() :
103         biosep(nullptr),
104         segset(false),
105         ibp(NULL),
106         genome(0),
107         orig_gcode(0)
108     {}
109 
110 } ProtBlk, *ProtBlkPtr;
111 
112 typedef struct _locus_cont {
113     Int4 bases;
114     Int4 bp;
115     Int4 strand;
116     Int4 molecule;
117     Int4 topology;
118     Int4 div;
119     Int4 date;
120 } LocusCont, *LocusContPtr;
121 
122 
123 typedef struct _gap_feats {
124     Int4    from;
125     Int4    to;
126     Int4    estimated_length;
127     bool    leftNs;
128     bool    rightNs;
129     bool    assembly_gap;
130     char* gap_type;
131     Int4    asn_gap_type;
132 
133     objects::CLinkage_evidence::TLinkage_evidence asn_linkage_evidence;
134 
135     struct _gap_feats *next;
136 
137     _gap_feats();
138 
139 } GapFeats, *GapFeatsPtr;
140 
/* One node of a singly linked list of tokens. The structure has no
   constructor, so both members are uninitialized unless the creator
   sets them. */
typedef struct token_block {
    char*                 str;        /* the token string */
    struct token_block *next;       /* points to next token */
} TokenBlk, *TokenBlkPtr;
145 
146 typedef struct token_statistics_block {
147     TokenBlkPtr list;                   /* a pointer points to the first
148                                            token */
149     Int2        num;                    /* total number of token in the
150                                            chain list */
151 } TokenStatBlk, *TokenStatBlkPtr;
152 
153 typedef struct _XmlIndex {
154     Int4                  tag;
155     Int4                  order;
156     size_t                start;        /* Offset from the beginning of the
157                                            record, not file! */
158     size_t                end;          /* Offset from the beginning of the
159                                            record, not file! */
160     Int4                  start_line;
161     Int4                  end_line;
162     Int2                  type;         /* Used for references */
163     struct _XmlIndex *subtags;
164     struct _XmlIndex *next;
165 } XmlIndex, *XmlIndexPtr;
166 
167 typedef std::list<std::string> TKeywordList;
168 
169 typedef struct indexblk_struct {
170     Char               acnum[200];      /* accession num */
171     Int2               vernum;          /* version num */
172     size_t             offset;          /* byte-offset of in the flatfile at
173                                            which the entry starts */
174     Char               locusname[200];  /* locus name */
175     Char               division[4];     /* division code */
176     size_t             bases;           /* basepair length of the entry */
177     Uint2              segnum;          /* the number of the entry w/i a
178                                            segment set */
179     Uint2              segtotal;        /* total number of members in
180                                            segmented set to which this
181                                            entry belongs */
182     Char               blocusname[200]; /* base locus name s.t. w/o tailing
183                                            number */
184     size_t             linenum;         /* line number at which the entry
185                                            starts */
186     Uint1              drop;            /* 1 if the accession should be
187                                            dropped, otherwise 0 */
188     size_t             len;             /* total length (or sizes in bytes)
189                                            of the entry */
190 
191     CRef<objects::CDate_std> date; /* the record's entry-date or last
192                                                   update's date */
193 
194     CRef<objects::CPatent_seq_id> psip; /* patent reference */
195 
196     bool               EST;             /* special EST entries */
197     bool               STS;             /* special STS entries */
198     bool               GSS;             /* special Genome servey entries */
199     bool               HTC;             /* high throughput cDNA */
200     Int2               htg;             /* special HTG [0,1,2,3] entries */
201     bool               is_contig;       /* TRUE if entry has CONTIG line,
202                                            otherwise FALSE */
203     bool               is_mga;          /* TRUE if entry has MGA line,
204                                            otherwise FALSE */
205     bool               origin;          /* TRUE if sequence is present */
206     bool               is_pat;          /* TRUE if accession prefix is
207                                            patented and matches source.
208                                            FALSE - otherwise. */
209     bool               is_wgs;
210     bool               is_tpa;
211     bool               is_tsa;
212     bool               is_tls;
213     bool               is_tpa_wgs_con;  /* TRUE if "is_contig", "is_wgs" and
214                                            "is_tpa" are TRUE */
215     bool               tsa_allowed;
216     LocusCont          lc;
217     char*            moltype;         /* the value of /mol_type qual */
218     GapFeatsPtr        gaps;
219 
220 //    list<string>       secondary_accessions;
221     TokenBlkPtr        secaccs;
222     XmlIndexPtr        xip;
223     bool               embl_new_ID;
224     bool               env_sample_qual; /* TRUE if at least one source
225                                            feature has /environmental_sample
226                                            qualifier */
227     bool               is_prot;
228     char*            organism;        /* The value of /organism qualifier */
229     Int4               taxid;           /* The value gotten from source feature
230                                            /db_xref qualifier if any */
231     bool               no_gc_warning;   /* If TRUE then suppress
232                                            ERR_SERVER_GcFromSuppliedLineage
233                                            WARNING message */
234     size_t             qsoffset;
235     size_t             qslength;
236     Int4               wgs_and_gi;      /* 01 - has GI, 02 - WGS contig,
237                                            03 - both above */
238     bool               got_plastid;     /* Set to TRUE if there is at least
239                                            one /organelle qual beginning
240                                            with "plastid" */
241     Char               wgssec[100];     /* Reserved buffer for WGS master or
242                                            project accession as secondary */
243     Int4               gc_genomic;      /* Genomic Genetic code from OrgRef */
244     Int4               gc_mito;         /* Mitochondrial Genetic code */
245     TKeywordList       keywords;        /* All keywords from a flat record */
246     bool               assembly;        /* TRUE for TPA:assembly in
247                                            KEYWORDS line */
248     bool               specialist_db;   /* TRUE for TPA:specialist_db in
249                                            KEYWORDS line */
250     bool               inferential;     /* TRUE for TPA:inferential in
251                                            KEYWORDS line */
252     bool               experimental;    /* TRUE for TPA:experimental in
253                                            KEYWORDS line */
254     char*            submitter_seqid;
255     Parser *ppp;
256 
257     indexblk_struct();
258 
259 } Indexblk, *IndexblkPtr;
260 
261 typedef struct _fta_operon {
262     const Char*             featname;   /* Do not free! Just a pointer. */
263     const Char*             operon;     /* Do not free! Just a pointer. */
264 
265     CConstRef<objects::CSeq_loc> location;   /* Do not free! Just a pointer. */
266 
267     char*                 strloc;     /* String value of location. */
268     bool                    operon_feat;
269     bool                    ret;
270     struct _fta_operon *next;
271 
_fta_operon_fta_operon272     _fta_operon() :
273         featname(nullptr),
274         operon(nullptr),
275         strloc(nullptr),
276         operon_feat(false),
277         ret(false),
278         next(nullptr)
279     {}
280 
281 } FTAOperon, *FTAOperonPtr;
282 
283 typedef struct data_block {
284     Int2                   type;        /* which keyword block or node type */
285     void*                data;        /* any pointer type points to
286                                            information block */
287     char*                offset;      /* points to beginning of the entry
288                                            in the memory */
289     size_t                 len;         /* lenght of data in bytes */
290     char*                qscore;      /* points to quality score buffer */
291     Uint1                  drop;        /* 1 if drop this data block */
292     struct data_block *next;
293 } DataBlk, *DataBlkPtr;
294 
295 typedef struct entry_block {
296     DataBlkPtr              chain;      /* a header points to key-word
297                                            block information */
298     CRef<objects::CSeq_entry> seq_entry; /* points to sequence entry */
299 
300     struct entry_block *next;
301 
entry_blockentry_block302     entry_block() :
303         chain(NULL),
304         next(NULL)
305     {}
306 
307 } EntryBlk, *EntryBlkPtr;
308 
309 typedef struct keyword_block {
310     const char *str;
311     Int2       len;
312 } KwordBlk, *KwordBlkPtr;
313 
314 /**************************************************************************/
315 
316 void FreeDatablk(DataBlkPtr dbp);
317 void FreeEntry(DataBlkPtr entry);
318 void FreeIndexblk(IndexblkPtr ibp);
319 void GapFeatsFree(GapFeatsPtr gfp);
320 void XMLIndexFree(XmlIndexPtr xip);
321 
322 void FreeEntryBlk(EntryBlkPtr entry);
323 EntryBlkPtr CreateEntryBlk();
324 
325 END_NCBI_SCOPE
326 
327 #endif
328