1 /* loadfeat.cpp
2  *
3  * ===========================================================================
4  *
5  *                            PUBLIC DOMAIN NOTICE
6  *               National Center for Biotechnology Information
7  *
8  *  This software/database is a "United States Government Work" under the
9  *  terms of the United States Copyright Act.  It was written as part of
10  *  the author's official duties as a United States Government employee and
11  *  thus cannot be copyrighted.  This software/database is freely available
12  *  to the public for use. The National Library of Medicine and the U.S.
13  *  Government have not placed any restriction on its use or reproduction.
14  *
15  *  Although all reasonable efforts have been taken to ensure the accuracy
16  *  and reliability of the software and data, the NLM and the U.S.
17  *  Government do not and cannot warrant the performance or results that
18  *  may be obtained by using this software or data. The NLM and the U.S.
19  *  Government disclaim all warranties, express or implied, including
20  *  warranties of performance, merchantability or fitness for any particular
21  *  purpose.
22  *
23  *  Please cite the author in any work or product based on this material.
24  *
25  * ===========================================================================
26  *
 * File Name:  loadfeat.cpp
28  *
29  * Author: Karl Sirotkin, Hsiu-Chuan Chen
30  *
31  * File Description:
32  * -----------------
33  *      Parse features block to subblock.
34  *      Process each subblock.
35  *      Output each subblock.
36  *      Free out subblock.
37  */
38 #include <ncbi_pch.hpp>
39 
40 #include "ftacpp.hpp"
41 
42 #include <objects/seqfeat/Seq_feat.hpp>
43 #include <objects/seqfeat/Imp_feat.hpp>
44 #include <objmgr/bioseq_handle.hpp>
45 #include <objmgr/scope.hpp>
46 #include <objects/seqloc/Seq_bond.hpp>
47 #include <objects/seqfeat/Org_ref.hpp>
48 #include <objects/general/Dbtag.hpp>
49 #include <objects/general/Object_id.hpp>
50 #include <objects/seqfeat/OrgName.hpp>
51 #include <objects/seqfeat/SubSource.hpp>
52 #include <objects/seq/Seq_descr.hpp>
53 #include <objects/seqfeat/RNA_ref.hpp>
54 #include <objects/seqfeat/RNA_gen.hpp>
55 #include <objects/seqfeat/RNA_qual_set.hpp>
56 #include <objects/seqfeat/RNA_qual.hpp>
57 #include <objects/seqfeat/Trna_ext.hpp>
58 #include <objects/pub/Pub_set.hpp>
59 #include <objects/pub/Pub.hpp>
60 #include <serial/objostr.hpp>
61 #include <objmgr/util/seq_loc_util.hpp>
62 #include <objects/seq/seq_loc_from_string.hpp>
63 #include <objects/seq/Pubdesc.hpp>
64 #include <objects/seqfeat/BioSource.hpp>
65 #include <objects/seqfeat/SeqFeatData.hpp>
66 #include <objects/seq/MolInfo.hpp>
67 #include <objects/seq/Seq_inst.hpp>
68 #include <objects/seq/Seq_ext.hpp>
69 #include <objects/seq/Delta_ext.hpp>
70 #include <objects/seq/Delta_seq.hpp>
71 
72 #include "index.h"
73 #include "embl.h"
74 #include "genbank.h"
75 
76 #include <objtools/flatfile/flatfile_parser.hpp>
77 #include <objtools/flatfile/flatdefn.h>
78 
79 #include "ftaerr.hpp"
80 #include "indx_blk.h"
81 #include "asci_blk.h"
82 #include "utilfeat.h"
83 #include "loadfeat.h"
84 #include "add.h"
85 #include "fta_src.h"
86 #include "buf_data_loader.h"
87 #include "utilfun.h"
88 #include "ref.h"
89 #include "xgbfeat.h"
90 #include "xgbparint.h"
91 #include "fta_xml.h"
92 
93 #ifdef THIS_FILE
94 #    undef THIS_FILE
95 #endif
96 #define THIS_FILE "loadfeat.cpp"
97 
98 BEGIN_NCBI_SCOPE
99 USING_SCOPE(objects);
100 
/* Numeric codes for molecule types, named after GIBB-mol
 * (presumably mirroring the ASN.1 GIBB-mol enumeration - TODO confirm).
 * Note: uRNA and snRNA share the value 6, and the values 8 and 10
 * have no symbolic name in this list.
 */
#define Seq_descr_GIBB_mol_unknown       0
#define Seq_descr_GIBB_mol_genomic       1
#define Seq_descr_GIBB_mol_preRNA        2
#define Seq_descr_GIBB_mol_mRNA          3
#define Seq_descr_GIBB_mol_rRNA          4
#define Seq_descr_GIBB_mol_tRNA          5
#define Seq_descr_GIBB_mol_uRNA          6
#define Seq_descr_GIBB_mol_snRNA         6
#define Seq_descr_GIBB_mol_scRNA         7
#define Seq_descr_GIBB_mol_other_genetic 9
#define Seq_descr_GIBB_mol_cRNA          11
#define Seq_descr_GIBB_mol_snoRNA        12
#define Seq_descr_GIBB_mol_trRNA         13
#define Seq_descr_GIBB_mol_other         255
115 
/* One entry of an amino acid name table: a spelled-out amino acid
 * name paired with its one-letter code (see the "taa" table below).
 */
typedef struct _trna_aa {
    const char *name;                   /* full amino acid name */
    Uint1      aa;                      /* one-letter code, stored as a character */
} TrnaAa, *TrnaAaPtr;
120 
/* Generic string-to-number pair; used by the NULL/-1 terminated
 * lookup tables GapTypeValues and LinkageEvidenceValues in this file.
 */
typedef struct _str_num {
    const char *str;                    /* textual value */
    Int4       num;                     /* numeric code assigned to it */
} StrNum, *StrNumPtr;
125 
/* Full amino acid names and their one-letter codes. Two spellings
 * are listed for 'D' (aspartic acid / aspartate) and for 'E'
 * (glutamic acid / glutamate). The table ends with a {NULL, '\0'}
 * sentinel entry.
 */
TrnaAa taa[] = {
    {"alanine",        'A'},
    {"arginine",       'R'},
    {"asparagine",     'N'},
    {"aspartic acid",  'D'},
    {"aspartate",      'D'},
    {"cysteine",       'C'},
    {"glutamine",      'Q'},
    {"glutamic acid",  'E'},
    {"glutamate",      'E'},
    {"glycine",        'G'},
    {"histidine",      'H'},
    {"isoleucine",     'I'},
    {"leucine",        'L'},
    {"lysine",         'K'},
    {"methionine",     'M'},
    {"phenylalanine",  'F'},
    {"proline",        'P'},
    {"selenocysteine", 'U'},
    {"serine",         'S'},
    {"threonine",      'T'},
    {"tryptophan",     'W'},
    {"tyrosine",       'Y'},
    {"valine",         'V'},
    {NULL,             '\0'}
};
152 
/* One row of the amino acid / codon table "aacodons": the codons
 * that encode a given amino acid under a given genetic code.
 */
typedef struct _aa_codons {
    const char *straa;                  /* three-letter name ("Ala", "TERM", ...) */
    Uint1      intaa;                   /* one-letter code */
    Uint1      gencode;                 /* genetic code number; 0 looks like the
                                           "any other code" default - TODO confirm
                                           against the lookup code */
    Int4       vals[8];                 /* codon indices, padded with -1 */
} AaCodons, *AaCodonsPtr;
159 
160 AaCodons aacodons[] = {
161    {"Ala",    'A',  0, {52, 53, 54, 55, -1, -1, -1, -1}},  /* GCT, GCC, GCA, GCG */
162    {"Arg",    'R',  2, {28, 29, 30, 31, -1, -1, -1, -1}},  /* CGT, CGC, CGA, CGG */
163    {"Arg",    'R',  5, {28, 29, 30, 31, -1, -1, -1, -1}},  /* CGT, CGC, CGA, CGG */
164    {"Arg",    'R',  9, {28, 29, 30, 31, -1, -1, -1, -1}},  /* CGT, CGC, CGA, CGG */
165    {"Arg",    'R', 13, {28, 29, 30, 31, -1, -1, -1, -1}},  /* CGT, CGC, CGA, CGG */
166    {"Arg",    'R', 14, {28, 29, 30, 31, -1, -1, -1, -1}},  /* CGT, CGC, CGA, CGG */
167    {"Arg",    'R',  0, {28, 29, 30, 31, 46, 47, -1, -1}},  /* CGT, CGC, CGA, CGG, AGA, AGG */
168    {"Asn",    'N',  9, {40, 41, 42, -1, -1, -1, -1, -1}},  /* AAT, AAC, AAA */
169    {"Asn",    'N', 14, {40, 41, 42, -1, -1, -1, -1, -1}},  /* AAT, AAC, AAA */
170    {"Asn",    'N',  0, {40, 41, -1, -1, -1, -1, -1, -1}},  /* AAT, AAC */
171    {"Asp",    'D',  0, {56, 57, -1, -1, -1, -1, -1, -1}},  /* GAT, GAC */
172    {"Asx",    'B',  9, {40, 41, 42, 56, 57, -1, -1, -1}},  /* Asn + Asp */
173    {"Asx",    'B', 14, {40, 41, 42, 56, 57, -1, -1, -1}},  /* Asn + Asp */
174    {"Asx",    'B',  0, {40, 41, 56, 57, -1, -1, -1, -1}},  /* Asn + Asp */
175    {"Cys",    'C', 10, {12, 13, 14, -1, -1, -1, -1, -1}},  /* TGT, TGC, TGA */
176    {"Cys",    'C',  0, {12, 13, -1, -1, -1, -1, -1, -1}},  /* TGT, TGC */
177    {"Gln",    'Q',  6, {10, 11, 26, 27, -1, -1, -1, -1}},  /* TAA, TAG, CAA, CAG */
178    {"Gln",    'Q', 15, {11, 26, 27, -1, -1, -1, -1, -1}},  /* TAG, CAA, CAG */
179    {"Gln",    'Q',  0, {26, 27, -1, -1, -1, -1, -1, -1}},  /* CAA, CAG */
180    {"Glu",    'E',  0, {58, 59, -1, -1, -1, -1, -1, -1}},  /* GAA, GAG */
181    {"Glx",    'Z',  6, {10, 11, 26, 27, 58, 59, -1, -1}},  /* Gln + Glu */
182    {"Glx",    'Z',  0, {11, 26, 27, 58, 59, -1, -1, -1}},  /* Gln + Glu */
183    {"Glx",    'Z',  0, {26, 27, 58, 59, -1, -1, -1, -1}},  /* Gln + Glu */
184    {"Gly",    'G', 13, {46, 47, 60, 61, 62, 63, -1, -1}},  /* AGA, AGG, GGT, GGC, GGA, GGG */
185    {"Gly",    'G',  0, {60, 61, 62, 63, -1, -1, -1, -1}},  /* GGT, GGC, GGA, GGG */
186    {"His",    'H',  0, {24, 25, -1, -1, -1, -1, -1, -1}},  /* CAT, CAC */
187    {"Ile",    'I',  2, {32, 33, -1, -1, -1, -1, -1, -1}},  /* ATT, ATC */
188    {"Ile",    'I',  3, {32, 33, -1, -1, -1, -1, -1, -1}},  /* ATT, ATC */
189    {"Ile",    'I',  5, {32, 33, -1, -1, -1, -1, -1, -1}},  /* ATT, ATC */
190    {"Ile",    'I', 13, {32, 33, -1, -1, -1, -1, -1, -1}},  /* ATT, ATC */
191    {"Ile",    'I',  0, {32, 33, 34, -1, -1, -1, -1, -1}},  /* ATT, ATC, ATA */
192    {"Leu",    'L',  3, { 2,  3, -1, -1, -1, -1, -1, -1}},  /* TTA, TTG */
193    {"Leu",    'L', 12, { 2,  3, 16, 17, 18, -1, -1, -1}},  /* TTA, TTG, CTT, CTC, CTA */
194    {"Leu",    'L',  0, { 2,  3, 16, 17, 18, 19, -1, -1}},  /* TTA, TTG, CTT, CTC, CTA, CTG */
195    {"Lys",    'K',  9, {43, -1, -1, -1, -1, -1, -1, -1}},  /* AAG */
196    {"Lys",    'K', 14, {43, -1, -1, -1, -1, -1, -1, -1}},  /* AAG */
197    {"Lys",    'K',  0, {42, 43, -1, -1, -1, -1, -1, -1}},  /* AAA, AAG */
198    {"Met",    'M',  2, {34, 35, -1, -1, -1, -1, -1, -1}},  /* ATA, ATG */
199    {"Met",    'M',  3, {34, 35, -1, -1, -1, -1, -1, -1}},  /* ATA, ATG */
200    {"Met",    'M',  5, {34, 35, -1, -1, -1, -1, -1, -1}},  /* ATA, ATG */
201    {"Met",    'M', 13, {34, 35, -1, -1, -1, -1, -1, -1}},  /* ATA, ATG */
202    {"Met",    'M',  0, {35, -1, -1, -1, -1, -1, -1, -1}},  /* ATG */
203    {"fMet",   'M',  2, {34, 35, -1, -1, -1, -1, -1, -1}},  /* ATA, ATG */
204    {"fMet",   'M',  3, {34, 35, -1, -1, -1, -1, -1, -1}},  /* ATA, ATG */
205    {"fMet",   'M',  5, {34, 35, -1, -1, -1, -1, -1, -1}},  /* ATA, ATG */
206    {"fMet",   'M', 13, {34, 35, -1, -1, -1, -1, -1, -1}},  /* ATA, ATG */
207    {"fMet",   'M',  0, {35, -1, -1, -1, -1, -1, -1, -1}},  /* ATG */
208    {"Phe",    'F',  0, { 0,  1, -1, -1, -1, -1, -1, -1}},  /* TTT, TTC */
209    {"Pro",    'P',  0, {20, 21, 22, 23, -1, -1, -1, -1}},  /* CCT, CCC, CCA, CCG */
210    {"Sec",    'U',  0, {-1, -1, -1, -1, -1, -1, -1, -1}},
211    {"Ser",    'S',  5, { 4,  5,  6,  7, 44, 45, 46, 47}},  /* TCT, TCC, TCA, TCG, AGT, AGC, AGA, AGG */
212    {"Ser",    'S',  9, { 4,  5,  6,  7, 44, 45, 46, 47}},  /* TCT, TCC, TCA, TCG, AGT, AGC, AGA, AGG */
213    {"Ser",    'S', 12, { 4,  5,  6,  7, 19, 44, 45, -1}},  /* TCT, TCC, TCA, TCG, CTG, AGT, AGC */
214    {"Ser",    'S', 14, { 4,  5,  6,  7, 44, 45, 46, 47}},  /* TCT, TCC, TCA, TCG, AGT, AGC, AGA, AGG */
215    {"Ser",    'S',  0, { 4,  5,  6,  7, 44, 45, -1, -1}},  /* TCT, TCC, TCA, TCG, AGT, AGC */
216    {"Thr",    'T',  3, {16, 17, 18, 19, 36, 37, 38, 39}},  /* CTT, CTC, CTA, CTG, ACT, ACC, ACA, ACG */
217    {"Thr",    'T',  0, {36, 37, 38, 39, -1, -1, -1, -1}},  /* ACT, ACC, ACA, ACG */
218    {"Trp",    'W',  1, {15, -1, -1, -1, -1, -1, -1, -1}},  /* TGG */
219    {"Trp",    'W',  6, {15, -1, -1, -1, -1, -1, -1, -1}},  /* TGG */
220    {"Trp",    'W', 10, {15, -1, -1, -1, -1, -1, -1, -1}},  /* TGG */
221    {"Trp",    'W', 11, {15, -1, -1, -1, -1, -1, -1, -1}},  /* TGG */
222    {"Trp",    'W', 12, {15, -1, -1, -1, -1, -1, -1, -1}},  /* TGG */
223    {"Trp",    'W', 15, {15, -1, -1, -1, -1, -1, -1, -1}},  /* TGG */
224    {"Trp",    'W',  0, {14, 15, -1, -1, -1, -1, -1, -1}},  /* TGA, TGG */
225    {"Tyr",    'Y', 14, { 8,  9, 10, -1, -1, -1, -1, -1}},  /* TAT, TAC, TAA */
226    {"Tyr",    'Y',  0, { 8,  9, -1, -1, -1, -1, -1, -1}},  /* TAT, TAC */
227    {"Val",    'V',  0, {48, 49, 50, 51, -1, -1, -1, -1}},  /* GTT, GTC, GTA, GTG */
228    {"TERM",   '*',  1, {10, 11, 14, -1, -1, -1, -1, -1}},  /* TAA, TAG, TGA */
229    {"TERM",   '*',  2, {10, 11, 46, 47, -1, -1, -1, -1}},  /* TAA, TAG, AGA, AGG */
230    {"TERM",   '*',  6, {14, -1, -1, -1, -1, -1, -1, -1}},  /* TGA */
231    {"TERM",   '*', 11, {10, 11, 14, -1, -1, -1, -1, -1}},  /* TAA, TAG, TGA */
232    {"TERM",   '*', 12, {10, 11, 14, -1, -1, -1, -1, -1}},  /* TAA, TAG, TGA */
233    {"TERM",   '*', 14, {11, -1, -1, -1, -1, -1, -1, -1}},  /* TAG */
234    {"TERM",   '*', 15, {10, 14, -1, -1, -1, -1, -1, -1}},  /* TAA, TGA */
235    {"TERM",   '*',  0, {10, 11, -1, -1, -1, -1, -1, -1}},  /* TAA, TAG */
236    {"OTHER",  'X',  0, {-1, -1, -1, -1, -1, -1, -1, -1}},
237    {NULL,    '\0',  0, {-1, -1, -1, -1, -1, -1, -1, -1}}
238 };
239 
/* Upper-case spellings of "transfer RNA" / "tRNA", including several
 * misspelled variants. NULL-terminated. The code that scans text for
 * these tags is outside this chunk.
 */
static const char *trna_tags[] = {
    "TRANSFERN RNA",
    "TRANSFER RRNA",
    "TRANSFER TRNA",
    "TRANSFER RNA",
    "TRASNFER RNA",
    "TRANSDER RNA",
    "TRANSFERRNA",
    "TRANFER RNA",
    "T RNA",
    "TRNA",
    NULL
};
253 
/* NULL-terminated list of phrases; by its name presumably the texts
 * that mark a record as an EST - usage is outside this chunk, confirm
 * there. The array has external linkage.
 */
const char *ParFlat_ESTmod[] = {
    "EST",
    "expressed sequence tag",
    "partial cDNA sequence",
    "transcribed sequence fragment",
    "TSR",
    "putatively transcribed partial sequence",
    "UK putts",
    "Plastid",
    NULL
};
265 
/* NULL-terminated list of RNA feature keys. The lookup that uses the
 * position of a key in this list is outside this chunk, so the order
 * of the entries should be treated as significant.
 */
static const char *ParFlat_RNA_array[] = {
    "precursor_RNA",
    "mRNA",
    "tRNA",
    "rRNA",
    "snRNA",
    "scRNA",
    "snoRNA",
    "ncRNA",
    "tmRNA",
    "misc_RNA",
    NULL
};
279 
/* /db_xref database names whose identifier may be either numeric or
 * textual. DbxrefQualToDbtag() stores the identifier as an integer
 * when it consists of digits only and does not start with '0', and
 * as a string otherwise. NULL-terminated.
 */
static const char *DbxrefTagAny[] = {
    "ASAP",
    "CDD",
    "DBEST",
    "DBSTS",
    "GDB",
    "HMP",
    "MAIZEGDB",
    NULL
};
290 
/* Obsolete /db_xref database names. DbxrefQualToDbtag() reports them
 * with ERR_FEATURE_ObsoleteDbXref and replaces them:
 *   BHB, BIOHEALTHBASE -> IRD
 *   GENEW              -> HGNC
 *   IFO                -> NBRC
 *   SWISS-PROT         -> UniProt/Swiss-Prot
 *   SPTREMBL, TREMBL   -> UniProt/TrEMBL
 * NULL-terminated.
 */
static const char *DbxrefObsolete[] = {
    "BHB",
    "BIOHEALTHBASE",
    "GENEW",
    "IFO",
    "SWISS-PROT",
    "SPTREMBL",
    "TREMBL",
    NULL
};
301 
/* Additional string-valued /db_xref database names that
 * DbxrefQualToDbtag() accepts only when the source is
 * Parser::ESource::EMBL. NULL-terminated.
 * Note: "UNITE" is also present in the general DbxrefTagStr list.
 */
static const char *EMBLDbxrefTagStr[] = {
    "BIOMUTA",
    "DEPOD",
    "ENSEMBLGENOMES-GN",
    "ENSEMBLGENOMES-TR",
    "ESTHER",
    "GENEVISIBLE",
    "MOONPROT",
    "PROTEOMES",
    "UNITE",
    "WBPARASITE",
    NULL
};
315 
/* /db_xref database names whose identifier is expected to be a
 * string. When the identifier consists of digits only,
 * DbxrefQualToDbtag() issues ERR_QUALIFIER_DbxrefWrongType and, unless
 * it starts with '0', stores it as an integer. NULL-terminated.
 * Note: SPTREMBL, SWISS-PROT and TREMBL are also listed in
 * DbxrefObsolete, which DbxrefQualToDbtag() consults first.
 */
static const char *DbxrefTagStr[] = {
    "ACEVIEW/WORMGENES",
    "APHIDBASE",
    "APIDB",
    "ARAPORT",
    "BEEBASE",
    "BEETLEBASE",
    "BGD",
    "BOLD",
    "CGD",
    "COLLECTF",
    "DBSNP",
    "DICTYBASE",
    "ECOCYC",
    "ECOGENE",
    "ENSEMBL",
    "ENSEMBLGENOMES",
    "ERIC",
    "FANTOM_DB",
    "FLYBASE",
    "GABI",
    "GENEDB",
    "GOA",
    "H-INVDB",
    "HGNC",
    "HOMD",
    "HSSP",
    "I5KNAL",
    "IMGT/GENE-DB",
    "IMGT/HLA",
    "IMGT/LIGM",
    "INTERPRO",
    "IRD",
    "ISD",
    "ISFINDER",
    "ISHAM-ITS",
    "JGIDB",
    "MARPOLBASE",
    "MEDGEN",
    "MGI",
    "MIRBASE",
    "NEXTDB",
    "NIAEST",
    "NMPDR",
    "NRESTDB",
    "OSA1",
    "PATHEMA",
    "PDB",
    "PFAM",
    "PGN",
    "PHYTOZOME",
    "PIR",
    "POMBASE",
    "PSEUDO",
    "PSEUDOCAP",
    "RAP-DB",
    "REMTREMBL",
    "RFAM",
    "RICEGENES",
    "RZPD",
    "SEED",
    "SGD",
    "SGN",
    "SPTREMBL",
    "SRPDB",
    "SUBTILIST",
    "SWISS-PROT",
    "TAIR",
    "TIGRFAM",
    "TREMBL",
    "TUBERCULIST",
    "UNIPROT/SWISS-PROT",
    "UNIPROT/TREMBL",
    "UNIPROTKB/SWISS-PROT",
    "UNIPROTKB/TREMBL",
    "UNITE",
    "VBASE2",
    "VECTORBASE",
    "VGNC",
    "VIPR",
    "VISTA",
    "WORFDB",
    "WORMBASE",
    "XENBASE",
    "ZFIN",
    NULL
};
403 
/* /db_xref database names whose identifier must be a number greater
 * than zero. DbxrefQualToDbtag() drops the qualifier when the value
 * is empty, consists of zeros only or contains a non-digit character.
 * NULL-terminated.
 */
static const char *DbxrefTagInt[] = {
    "ATCC",
    "ATCC(DNA)",
    "ATCC(IN HOST)",
    "BDGP_EST",
    "BDGP_INS",
    "ESTLIB",
    "GENEID",
    "GI",
    "GO",
    "GREENGENES",
    "INTREPIDBIO",
    "JCM",
    "LOCUSID",
    "MIM",
    "MYCOBANK",
    "NBRC",
    "PBMICE",
    "RATMAP",
    "RGD",
    "UNILIB",
    "UNISTS",
    NULL
};
428 
/* NULL-terminated list of qualifier names; by its name and the inline
 * remarks presumably the qualifiers that are expected to carry no
 * value - the checking code is outside this chunk, confirm there.
 * The two entries marked "Fake" are, per their remarks, listed only
 * so that an empty occurrence gets caught.
 */
static const char *EmptyQuals[] = {
    "artificial_location",              /* Fake. Put here to catch
                                           it's empty */
    "chloroplast",
    "chromoplast",
    "cyanelle",
    "environmental_sample",
    "focus",
    "germline",
    "kinetoplast",
    "macronuclear",
    "metagenomic",
    "mitochondrion",
    "mobile_element_type",              /* Fake. Put here to catch
                                           it's empty */
    "partial",
    "proviral",
    "pseudo",
    "rearranged",
    "ribosomal_slippage",
    "trans_splicing",
    "transgenic",
    "virion",
    NULL
};
454 
/* NULL-terminated list of feature keys; by its name presumably the
 * features on which /trans_splicing is acceptable - usage is outside
 * this chunk, confirm there. The array has external linkage.
 */
const char *TransSplicingFeats[] = {
    "3'UTR",
    "5'UTR",
    "CDS",
    "gene",
    "mRNA",
    "misc_RNA",
    "precursor_RNA",
    "tRNA",
    NULL
};
466 
/* NULL-terminated list of values; by its name presumably the accepted
 * values of the /ncRNA_class qualifier - the validating code is
 * outside this chunk. The array has external linkage.
 */
const char *ncRNA_class_values[] = {
    "antisense_RNA",
    "autocatalytically_spliced_intron",
    "hammerhead_ribozyme",
    "lncRNA",
    "RNase_P_RNA",
    "RNase_MRP_RNA",
    "telomerase_RNA",
    "guide_RNA",
    "rasiRNA",
    "ribozyme",
    "scRNA",
    "siRNA",
    "miRNA",
    "piRNA",
    "pre_miRNA",
    "snoRNA",
    "snRNA",
    "SRP_RNA",
    "vault_RNA",
    "Y_RNA",
    "other",
    NULL
};
491 
/* NULL-terminated list of satellite kinds; presumably the accepted
 * leading words of the /satellite qualifier - usage is outside this
 * chunk. The array has external linkage.
 */
const char *SatelliteValues[] = {
    "satellite",
    "minisatellite",
    "microsatellite",
    NULL
};
498 
/* NULL-terminated list of values; by its name presumably the accepted
 * values of the /pseudogene qualifier - usage is outside this chunk.
 * The array has external linkage.
 */
const char *PseudoGeneValues[] = {
    "allelic",
    "processed",
    "unitary",
    "unknown",
    "unprocessed",
    NULL
};
507 
/* NULL-terminated list of values; by its name presumably the accepted
 * values of the /regulatory_class qualifier - usage is outside this
 * chunk. The array has external linkage.
 */
const char *RegulatoryClassValues[] = {
    "attenuator",
    "CAAT_signal",
    "DNase_I_hypersensitive_site",
    "enhancer",
    "enhancer_blocking_element",
    "GC_signal",
    "imprinting_control_region",
    "insulator",
    "locus_control_region",
    "matrix_attachment_region",
    "minus_35_signal",
    "minus_10_signal",
    "response_element",
    "polyA_signal_sequence",
    "promoter",
    "recoding_stimulatory_region",
    "replication_regulatory_region",
    "ribosome_binding_site",
    "riboswitch",
    "silencer",
    "TATA_box",
    "terminator",
    "transcriptional_cis_regulatory_region",
    "other",
    NULL
};
535 
/* Gap type texts and the numeric gap type each one is translated to;
 * the trailing remark on every row names the target type (presumably
 * the Seq-gap type enumeration - TODO confirm). Both "repeat ..."
 * texts map to the same value 7. Terminated by a {NULL, -1} entry.
 */
StrNum GapTypeValues[] = {
    {"between scaffolds",         8},   /* contig          */
    {"within scaffold",           9},   /* scaffold        */
    {"telomere",                  6},   /* telomere        */
    {"centromere",                5},   /* centromere      */
    {"short arm",                 3},   /* short-arm       */
    {"heterochromatin",           4},   /* heterochromatin */
    {"repeat within scaffold",    7},   /* repeat          */
    {"repeat between scaffolds",  7},   /* repeat          */
    {"unknown",                   0},   /* unknown         */
    {NULL,                       -1}
};
548 
/* Linkage evidence texts and the numeric code each one is translated
 * to; the trailing remark on every row names the target value
 * (presumably the Linkage-evidence type enumeration - TODO confirm).
 * Terminated by a {NULL, -1} entry.
 */
StrNum LinkageEvidenceValues[] = {
    {"paired-ends",         0},         /* paired-end         */
    {"align genus",         1},         /* align-genus        */
    {"align xgenus",        2},         /* align-xgenus       */
    {"align trnscpt",       3},         /* align-trnscpt      */
    {"within clone",        4},         /* within-clone       */
    {"clone contig",        5},         /* clone-contig       */
    {"map",                 6},         /* map                */
    {"strobe",              7},         /* strobe             */
    {"unspecified",         8},         /* unspecified        */
    {"pcr",                 9},         /* pcr                */
    {"proximity ligation", 10},         /* proximity-ligation */
    {NULL,                 -1}
};
563 
564 /**********************************************************/
FreeFeatBlkQual(FeatBlkPtr fbp)565 static void FreeFeatBlkQual(FeatBlkPtr fbp)
566 {
567     MemFree(fbp->key);
568     MemFree(fbp->location);
569     delete fbp;
570 }
571 
572 /**********************************************************/
FreeFeatBlk(DataBlkPtr dbp,Parser::EFormat format)573 static void FreeFeatBlk(DataBlkPtr dbp, Parser::EFormat format)
574 {
575     DataBlkPtr dbpnext;
576     FeatBlkPtr fbp;
577 
578     for(; dbp != NULL; dbp = dbpnext)
579     {
580         dbpnext = dbp->next;
581         fbp = (FeatBlkPtr) dbp->data;
582         if(fbp != NULL)
583         {
584             FreeFeatBlkQual(fbp);
585             dbp->data = NULL;
586         }
587         if(format == Parser::EFormat::XML)
588             MemFree(dbp);
589     }
590 }
591 
/**********************************************************
 *
 *   static void DelCharBtwData(value):
 *
 *      Removes every blank (' ') from the zero-terminated
 *   string "value". The string is compacted in place;
 *   other whitespace characters are kept.
 *
 **********************************************************/
static void DelCharBtwData(char* value)
{
    char*       dst = value;            /* next position to write */
    const char* src = value;            /* next character to examine */

    while(*src != '\0')
    {
        if(*src != ' ')
        {
            *dst = *src;
            dst++;
        }
        src++;
    }
    *dst = '\0';
}
608 
609 /**********************************************************
610  *
611  *   static Int4 flat2asn_range_func(pp, sip):
612  *
613  *      For error handle in gbparint.c routines.
614  *      This function has to return the length corresponding
615  *   to the SeqId it is passed.
616  *
617  *                                              ks 1/13/94
618  *
619  **********************************************************/
flat2asn_range_func(void * pp_ptr,const objects::CSeq_id & id)620 static Int4 flat2asn_range_func(void* pp_ptr, const objects::CSeq_id& id)
621 {
622     ParserPtr pp = reinterpret_cast<ParserPtr>(pp_ptr);
623 
624     int          use_indx = pp->curindx;
625     char*      acnum;
626 
627     Int2         vernum;
628 
629 #ifdef BIOSEQ_FIND_METHOD
630 
631     bsp = BioseqFind(sip);
632     if(bsp != NULL)
633         return(bsp->length);
634 
635     /* could try ID0 server
636     */
637     return(-1);
638 
639 #else
640 
641     const objects::CTextseq_id* text_id = nullptr;
642     if (id.IsGenbank() || id.IsEmbl() || id.IsDdbj() || id.IsTpg() ||
643         id.IsTpe() || id.IsTpd())
644         text_id = id.GetTextseq_Id();
645 
646     if (text_id != nullptr)
647     {
648         Int2 text_id_ver = text_id->IsSetVersion() ? text_id->GetVersion() : INT2_MIN;
649         const std::string& text_id_acc = text_id->GetAccession();
650         for (use_indx = 0; use_indx < pp->indx; use_indx++)
651         {
652             acnum = pp->entrylist[use_indx]->acnum;
653             vernum = pp->entrylist[use_indx]->vernum;
654             if (text_id_acc == acnum &&
655                 (pp->accver == false || vernum == text_id_ver))
656                 break;
657         }
658 
659         if (use_indx >= pp->indx)
660         {
661             /* entry is not present in this file use remote fetch function
662             * use_indx = pp->curindx;
663             */
664             size_t len = (!pp->ffdb) ? -1 : CheckOutsideEntry(pp, text_id_acc.c_str(), text_id_ver);
665             if (len != static_cast<size_t>(-1))
666                 return static_cast<Int4>(len);
667 
668             if (pp->buf == NULL)
669             {
670                 if (pp->farseq)
671                     return -1;
672 
673                 if (pp->accver == false || text_id_ver < 0)
674                 {
675                     Nlm_ErrSetContext("validatr", __FILE__, __LINE__);
676                     Nlm_ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck,
677                               "Location points to outside entry %s",
678                               text_id_acc.c_str());
679                 }
680                 else
681                 {
682                     Nlm_ErrSetContext("validatr", __FILE__, __LINE__);
683                     Nlm_ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck,
684                               "Location points to outside entry %s.%d",
685                               text_id_acc.c_str(), text_id_ver);
686                 }
687                 return(-1);
688             }
689 
690             if (*pp->buf == '\0')
691                 return(-1);
692 
693             if (pp->source == Parser::ESource::NCBI || pp->source == Parser::ESource::Refseq)
694                 ErrPostEx(SEV_WARNING, ERR_LOCATION_NCBIRefersToExternalRecord,
695                 "Feature location references an interval on another record : %s",
696                 pp->buf);
697             else
698                 ErrPostEx(SEV_WARNING, ERR_LOCATION_RefersToExternalRecord,
699                 "Feature location references an interval on another record : %s",
700                 pp->buf);
701             MemFree(pp->buf);
702             pp->buf = (char*)MemNew(1);
703             *pp->buf = '\0';
704             return(-1);
705         }
706     }
707     return static_cast<Int4>(pp->entrylist[use_indx]->bases);
708 
709 #endif
710 
711 }
712 
713 /**********************************************************/
CheckForeignLoc(const objects::CSeq_loc & loc,const objects::CSeq_id & sid)714 static bool CheckForeignLoc(const objects::CSeq_loc& loc, const objects::CSeq_id& sid)
715 {
716     const objects::CSeq_id& pid = *loc.GetId();
717 
718     if (loc.IsMix() || loc.IsEquiv() ||
719         sid.Compare(pid) == objects::CSeq_id::e_YES)
720         return false;
721 
722     return true;
723 }
724 
725 /**********************************************************/
DbxrefQualToDbtag(const objects::CGb_qual & qual,Parser::ESource source)726 static CRef<objects::CDbtag> DbxrefQualToDbtag(const objects::CGb_qual& qual, Parser::ESource source)
727 {
728     CRef<objects::CDbtag> tag;
729 
730     if (!qual.IsSetQual() ||
731         qual.GetQual() != "db_xref")
732         return tag;
733 
734     if (!qual.IsSetVal() || qual.GetVal().empty())
735     {
736         ErrPostEx(SEV_WARNING, ERR_QUALIFIER_EmptyQual,
737                   "Found empty /db_xref qualifier. Qualifier dropped.");
738         return tag;
739     }
740 
741     const std::string& val = qual.GetVal();
742     if (StringICmp(val.c_str(), "taxon") == 0)
743         return tag;
744 
745     std::string line = val;
746 
747     if (StringNICmp(line.c_str(), "MGD:MGI:", 8) == 0)
748         line = line.substr(4);
749 
750     size_t colon = line.find(':');
751     if (colon == std::string::npos)
752     {
753         ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefIncorrect,
754                   "Badly formatted /db_xref qualifier: \"%s\". Qualifier dropped.",
755                   val.c_str());
756         return tag;
757     }
758 
759     std::string tail = line.substr(colon + 1);
760     line = line.substr(0, colon);
761 
762     if (MatchArrayIString(DbxrefObsolete, line.c_str()) > -1)
763     {
764         ErrPostEx(SEV_WARNING, ERR_FEATURE_ObsoleteDbXref,
765                   "/db_xref type \"%s\" is obsolete.", line.c_str());
766 
767         std::string buf;
768         if(StringICmp(line.c_str(), "BHB") == 0)
769             buf = "IRD";
770         else if (StringICmp(line.c_str(), "BioHealthBase") == 0)
771             buf = "IRD";
772         else if (StringICmp(line.c_str(), "GENEW") == 0)
773             buf = "HGNC";
774         else if (StringICmp(line.c_str(), "IFO") == 0)
775             buf = "NBRC";
776         else if (StringICmp(line.c_str(), "SWISS-PROT") == 0)
777             buf = "UniProt/Swiss-Prot";
778         else
779             buf = "UniProt/TrEMBL";
780 
781         line = buf;
782     }
783 
784     if(StringICmp(line.c_str(), "UNIPROT/SWISS-PROT") == 0 ||
785        StringICmp(line.c_str(), "UNIPROT/TREMBL") == 0)
786     {
787         std::string buf("UniProtKB");
788         buf += line.substr(7);
789 
790         line = buf;
791     }
792 
793     const Char* strid = NULL;
794     Int4 intid = 0;
795 
796     const Char* p = tail.c_str();
797     if (MatchArrayIString(DbxrefTagAny, line.c_str()) > -1)
798     {
799         for(strid = p; *p >= '0' && *p <= '9';)
800             p++;
801         if(*p == '\0' && *strid != '0')
802         {
803             intid = atoi(strid);
804             strid = NULL;
805         }
806     }
807     else if(MatchArrayIString(DbxrefTagStr, line.c_str()) > -1 ||
808             (source == Parser::ESource::EMBL &&
809              MatchArrayIString(EMBLDbxrefTagStr, line.c_str()) > -1))
810     {
811         for(strid = p; *p >= '0' && *p <= '9';)
812             p++;
813         if(*p == '\0')
814         {
815             ErrPostEx(SEV_WARNING, ERR_QUALIFIER_DbxrefWrongType,
816                       "/db_xref qualifier \"%s\" is supposed to be a string, but its value consists of digits only.",
817                       val.c_str());
818             if(*strid != '0')
819             {
820                 intid = atoi(strid);
821                 strid = NULL;
822             }
823         }
824     }
825     else if(MatchArrayIString(DbxrefTagInt, line.c_str()) > -1)
826     {
827         const Char* q = p;
828         for(; *q == '0';)
829             q++;
830         if(*q == '\0')
831         {
832             ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefShouldBeNumeric,
833                       "/db_xref qual should have numeric value greater than 0: \"%s\". Qualifier dropped.",
834                       val.c_str());
835             return tag;
836         }
837 
838         const Char* r = q;
839         for(; *r >= '0' && *r <= '9';)
840             r++;
841         if(*r != '\0')
842         {
843             ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefWrongType,
844                       "/db_xref qualifier \"%s\" is supposed to be a numeric identifier, but its value includes alphabetic characters. Qualifier dropped.",
845                       val.c_str());
846             return tag;
847         }
848         if(*r != '\0' || q != p)
849             strid = p;
850         else if(StringICmp(line.c_str(), "IntrepidBio") == 0 && fta_number_is_huge(q))
851             strid = q;
852         else
853             intid = atoi(q);
854     }
855     else if(StringICmp(line.c_str(), "PID") == 0)
856     {
857         if(*p != 'e' && *p != 'g' && *p != 'd')
858         {
859             ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefIncorrect,
860                       "Badly formatted /db_xref qual \"PID\": \"%s\". Qualifier dropped.",
861                       val.c_str());
862             return tag;
863         }
864 
865         const Char* q = p + 1;
866         for(; *q == '0';)
867             q++;
868 
869         const Char* r = q;
870         for (r = q; *r >= '0' && *r <= '9';)
871             r++;
872         if(*q == '\0' || *r != '\0')
873         {
874             ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefShouldBeNumeric,
875                       "/db_xref qual \"PID\" should contain numeric value greater than 0: \"%s\". Qualifier dropped.",
876                       val.c_str());
877             return tag;
878         }
879         strid = p;
880     }
881     else
882     {
883         ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefUnknownDBName,
884                   "Unknown data base name /db_xref = \"%s\". Qualifier dropped.",
885                   val.c_str());
886         return tag;
887     }
888 
889 
890     tag.Reset(new objects::CDbtag);
891 
892     tag->SetDb(line);
893 
894     if(strid != NULL)
895         tag->SetTag().SetStr(strid);
896     else
897         tag->SetTag().SetId(intid);
898 
899     return tag;
900 }
901 
902 /**********************************************************
903  *
904  *   Function:
905  *      static void FilterDb_xref(pSeqFeat, source)
906  *
907  *   Purpose:
908  *      Looks through SeqFeat's qualifiers which contain
909  *      "db_xref" in qual field, convert such qualifiers
910  *      into Dbtags removing the qualifiers from SeqFeat's
911  *      list, got Dbtags links in the chain of ValNodes
912  *      and puts the chain into the SeqFeat.
913  *
914  *   Parameters:
915  *      pSeqFeat - pointer to a SeqFeat for processing
916  *
917  *   Return:
918  *      None.
919  *
920  **********************************************************/
FilterDb_xref(objects::CSeq_feat & feat,Parser::ESource source)921 static void FilterDb_xref(objects::CSeq_feat& feat, Parser::ESource source)
922 {
923     if (!feat.IsSetQual())
924         return;
925 
926     objects::CSeq_feat::TDbxref& db_refs = feat.SetDbxref();
927 
928     for (objects::CSeq_feat::TQual::iterator qual = feat.SetQual().begin(); qual != feat.SetQual().end(); )
929     {
930         if (!(*qual)->IsSetQual() || (*qual)->GetQual() != "db_xref")
931         {
932             /* Just skip this qualifier, it isn't db_xref
933              */
934             ++qual;
935             continue;
936         }
937 
938         /* Current qualifier is db_xref, process it
939          */
940         CRef<objects::CDbtag> dbtag = DbxrefQualToDbtag(*(*qual), source);
941         if (dbtag.NotEmpty())
942         {
943             db_refs.push_back(dbtag);
944         }
945 
946         /* Remove converted qualifier from chain of qualifiers
947          */
948         qual = feat.SetQual().erase(qual);
949     }
950 
951     if (feat.GetQual().empty())
952         feat.ResetQual();
953 
954     if (db_refs.empty())
955         feat.ResetDbxref();
956 }
957 
958 /**********************************************************
959  *
960  *   bool GetSeqLocation(sfp, location, ids,
961  *                          hard_err, pp, name):
962  *
 *      Returns locmap, which stays TRUE if the location
 *   mapping rules do not work. If the parser reports
 *   errors, the feature location is set to "whole" on
 *   ids[0] and *hard_err is set to TRUE.
 *      sfp->location is a SeqLocPtr which is defined
 *   as a ValNodePtr.
967  *
968  *                                              7-26-93
969  *
970  **********************************************************/
GetSeqLocation(objects::CSeq_feat & feat,char * location,TSeqIdList & ids,bool * hard_err,ParserPtr pp,char * name)971 bool GetSeqLocation(objects::CSeq_feat& feat, char* location, TSeqIdList& ids,
972                     bool* hard_err, ParserPtr pp, char* name)
973 {
974     bool    sitesmap;
975     bool    locmap = true;
976     int        num_errs;
977 
978     *hard_err = false;
979     num_errs = 0;
980 
981     CRef<objects::CSeq_loc> loc = xgbparseint_ver(location, locmap, sitesmap,
982                                                               num_errs, ids, pp->accver);
983 
984     if (loc.NotEmpty())
985     {
986         TSeqLocList locs;
987         locs.push_back(loc);
988         fta_fix_seq_loc_id(locs, pp, location, name, false);
989 
990         feat.SetLocation(*loc);
991     }
992 
993     if (num_errs > 0)
994     {
995         feat.ResetLocation();
996         objects::CSeq_loc& cur_loc = feat.SetLocation();
997         cur_loc.SetWhole(*(*ids.begin()));
998         *hard_err = true;
999     }
1000     else if(!feat.GetLocation().IsEmpty())
1001     {
1002         if (feat.GetLocation().IsMix())
1003         {
1004             if (feat.GetLocation().GetMix().Get().size() == 1)
1005             {
1006                 CRef<objects::CSeq_loc> cur_loc(new objects::CSeq_loc);
1007 
1008                 cur_loc->Assign(*feat.GetLocation().GetMix().GetFirstLoc());
1009                 if (cur_loc->IsInt())
1010                     feat.SetLocation(*cur_loc);
1011             }
1012         }
1013     }
1014 
1015     return locmap;
1016 }
1017 
1018 /**********************************************************
1019  *
1020  *   static char* CheckLocStr(str):
1021  *
1022  *      Nlm_gbparseint routine does not parse certain types
1023  *   of interval correctly, so this routine will save input
1024  *   form in fbp before passing it:
1025  *      (bases 100 to 300) ==> 100 to 300;
1026  *      (bases 1 to 100; 200 to 300) no change.
1027  *
1028  *                                              5-20-93
1029  *
1030  **********************************************************/
CheckLocStr(const Char * str)1031 static char* CheckLocStr(const Char* str)
1032 {
1033     const Char* ptr;
1034     const Char* eptr;
1035     char* location;
1036 
1037     ptr = StringChr(str, ';');
1038     if(ptr != NULL)
1039         return StringSave(str);
1040 
1041     for(ptr = str; *ptr != ' ' && *ptr != '\0';)
1042         ptr++;
1043     while(*ptr == ' ')
1044         ptr++;
1045 
1046     eptr = StringChr(str, ')');
1047     if(eptr == NULL)
1048         return(NULL);
1049 
1050     while(*eptr == ' ' || *eptr == ')')
1051         --eptr;
1052 
1053     location = StringSave(std::string(ptr, eptr + 1).c_str());
1054     return(location);
1055 }
1056 
1057 /*****************************************************************************
1058 *
1059 *   bool SeqIntCheckCpp(loc) is instead of C-toolkit 'bool SeqIntCheck(sip)'
1060 *       checks that a seq interval is valid
1061 *
1062 *****************************************************************************/
SeqIntCheckCpp(const objects::CSeq_loc & loc)1063 static bool SeqIntCheckCpp(const objects::CSeq_loc& loc)
1064 {
1065     Uint4 len = UINT4_MAX;
1066 
1067     objects::CBioseq_Handle bio_h = GetScope().GetBioseqHandle(*loc.GetId());
1068     if (bio_h.CanGetInst() && bio_h.CanGetInst_Length())
1069         len = bio_h.GetBioseqLength();
1070 
1071     return loc.GetInt().GetFrom() <= loc.GetInt().GetTo() && loc.GetInt().GetTo() < len;
1072 }
1073 
1074 /*****************************************************************************
1075 *
1076 *   bool SeqPntCheckCpp(loc) is instead of C-toolkit 'Boolean SeqPntCheck(SeqPntPtr spp)'
1077 *       checks that a seq point is valid
1078 *
1079 *****************************************************************************/
SeqPntCheckCpp(const objects::CSeq_loc & loc)1080 static bool SeqPntCheckCpp(const objects::CSeq_loc& loc)
1081 {
1082     Uint4 len = UINT4_MAX;
1083 
1084     objects::CBioseq_Handle bio_h = GetScope().GetBioseqHandle(*loc.GetId());
1085     if (bio_h.CanGetInst() && bio_h.CanGetInst_Length())
1086         len = bio_h.GetBioseqLength();
1087 
1088     return loc.GetPnt().GetPoint() < len;
1089 }
1090 
1091 /*****************************************************************************
1092 *
*   bool PackSeqPntCheckCpp(loc) is instead of C-toolkit 'Boolean PackSeqPntCheck (pspp)'
1094 *
1095 *****************************************************************************/
PackSeqPntCheckCpp(const objects::CSeq_loc & loc)1096 static bool PackSeqPntCheckCpp(const objects::CSeq_loc& loc)
1097 {
1098     Uint4 len = UINT4_MAX;
1099 
1100     objects::CBioseq_Handle bio_h = GetScope().GetBioseqHandle(*loc.GetId());
1101     if (bio_h.CanGetInst() && bio_h.CanGetInst_Length())
1102         len = bio_h.GetBioseqLength();
1103 
1104     ITERATE(objects::CSeq_loc::TPoints, point, loc.GetPacked_pnt().GetPoints())
1105     {
1106         if (*point >= len)
1107             return false;
1108     }
1109 
1110     return true;
1111 }
1112 
1113 /**********************************************************/
1114 /* returns : 2 = Ok, 1 = mixed strands, 0 = error in location
1115  */
FTASeqLocCheck(const objects::CSeq_loc & locs,char * accession)1116 static Uint1 FTASeqLocCheck(const objects::CSeq_loc& locs, char* accession)
1117 {
1118     Uint1        strand = 99;
1119     Uint1        retval = 2;
1120 
1121     objects::CSeq_loc_CI ci(locs);
1122 
1123     bool good = true;
1124     for (; ci; ++ci)
1125     {
1126         CConstRef<objects::CSeq_loc> cur_loc = ci.GetRangeAsSeq_loc();
1127 
1128         const objects::CSeq_id* cur_id = nullptr;
1129 
1130         switch (cur_loc->Which())
1131         {
1132         case objects::CSeq_loc::e_Int:
1133             good = SeqIntCheckCpp(*cur_loc);
1134             if (good)
1135                 cur_id = cur_loc->GetId();
1136             break;
1137 
1138         case objects::CSeq_loc::e_Pnt:
1139             good = SeqPntCheckCpp(*cur_loc);
1140             if (good)
1141                 cur_id = cur_loc->GetId();
1142             break;
1143 
1144         case objects::CSeq_loc::e_Packed_pnt:
1145             good = PackSeqPntCheckCpp(*cur_loc);
1146             if (good)
1147                 cur_id = cur_loc->GetId();
1148             break;
1149 
1150         case objects::CSeq_loc::e_Bond:
1151             if (!cur_loc->GetBond().CanGetA())
1152                 good = false;
1153 
1154             if (good)
1155                 cur_id = cur_loc->GetId();
1156             break;
1157 
1158         case objects::CSeq_loc::e_Empty:
1159         case objects::CSeq_loc::e_Whole:
1160             cur_id = cur_loc->GetId();
1161             break;
1162 
1163         default:
1164             continue;
1165         }
1166 
1167         if (!good)
1168             break;
1169 
1170         if (accession == nullptr || cur_id == nullptr)
1171             continue;
1172 
1173         if (!cur_id->IsGenbank() && !cur_id->IsEmbl() && !cur_id->IsPir() &&
1174             !cur_id->IsSwissprot() && !cur_id->IsOther() && !cur_id->IsDdbj() &&
1175             !cur_id->IsPrf() && !cur_id->IsTpg() && !cur_id->IsTpe() &&
1176             !cur_id->IsTpd() && !cur_id->IsGpipe())
1177             continue;
1178 
1179         const objects::CTextseq_id* text_id = cur_id->GetTextseq_Id();
1180 
1181         if (text_id == nullptr || !text_id->CanGetAccession())
1182             continue;
1183 
1184         if (text_id->GetAccession() == accession)
1185         {
1186             if (strand == 99)
1187                 strand = cur_loc->GetStrand();
1188             else if (strand != cur_loc->GetStrand())
1189                 retval = 1;
1190         }
1191     }
1192 
1193     if (!good)
1194         return 0;
1195 
1196     return retval;
1197 }
1198 
1199 /**********************************************************/
fta_strip_aa(char * str)1200 static void fta_strip_aa(char* str)
1201 {
1202     if(str == NULL || *str == '\0')
1203         return;
1204 
1205     while(str != NULL)
1206     {
1207         str = StringStr(str, "aa");
1208         if(str != NULL)
1209             fta_StringCpy(str, str + 2);
1210     }
1211 }
1212 
1213 /**********************************************************
1214  *
1215  *   static SeqFeatPtr SeqFeatPub(pp, entry, hsfp, seq_id,
1216  *                                col_data, ibp):
1217  *
1218  *                                              5-26-93
1219  *
1220  **********************************************************/
SeqFeatPub(ParserPtr pp,DataBlkPtr entry,TSeqFeatList & feats,TSeqIdList & seqids,Int4 col_data,IndexblkPtr ibp)1221 static void SeqFeatPub(ParserPtr pp, DataBlkPtr entry, TSeqFeatList& feats,
1222                        TSeqIdList& seqids, Int4 col_data, IndexblkPtr ibp)
1223 {
1224     DataBlkPtr dbp;
1225     DataBlkPtr subdbp;
1226     char*    p;
1227     char*    q;
1228     char*    location = NULL;
1229 
1230     bool    err = false;
1231     Uint1      i;
1232 
1233     /* REFERENCE, to Seq-feat
1234      */
1235     if(pp->format == Parser::EFormat::XML)
1236         dbp = XMLBuildRefDataBlk(entry->offset, ibp->xip, ParFlat_REF_BTW);
1237     else
1238         dbp = TrackNodeType(entry, ParFlat_REF_BTW);
1239     if(dbp == NULL)
1240         return;
1241 
1242 
1243     for(; dbp != NULL; dbp = dbp->next)
1244     {
1245         if(dbp->type != ParFlat_REF_BTW)
1246             continue;
1247 
1248         CRef<objects::CPubdesc> pubdesc = DescrRefs(pp, dbp, col_data);
1249         if (pubdesc.Empty())
1250             continue;
1251 
1252         CRef<objects::CSeq_feat> feat(new objects::CSeq_feat);
1253         feat->SetData().SetPub(*pubdesc);
1254 
1255         location = NULL;
1256         if(pp->format == Parser::EFormat::XML)
1257         {
1258             location = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
1259                                        INSDREFERENCE_POSITION);
1260             if(location == NULL)
1261             {
1262                 q = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
1263                                     INSDREFERENCE_REFERENCE);
1264                 if(q != NULL)
1265                 {
1266                     for(p = q; *p != '\0' && *p != '(';)
1267                         p++;
1268                     if(*p != '\0')
1269                         location = CheckLocStr(p + 1);
1270                     MemFree(q);
1271                 }
1272             }
1273             else
1274             {
1275                 p = StringChr(location, ';');
1276                 if(p != NULL)
1277                 {
1278                     p = (char*) MemNew(StringLen(location) + 7);
1279                     StringCpy(p, "join(");
1280                     StringCat(p, location);
1281                     StringCat(p, ")");
1282                     MemFree(location);
1283                     location = p;
1284                 }
1285             }
1286         }
1287         else if(pp->format == Parser::EFormat::GenBank)
1288         {
1289             for(p = dbp->offset + col_data; *p != '\0' && *p != '(';)
1290                 p++;
1291             location = CheckLocStr(std::string(p, dbp->offset + dbp->len - p).c_str());
1292         }
1293         else if(pp->format == Parser::EFormat::EMBL)
1294         {
1295             subdbp = (DataBlkPtr) dbp->data;
1296             for(; subdbp != NULL; subdbp = subdbp->next)
1297             {
1298                 if(subdbp->type != ParFlat_RP)
1299                     continue;
1300 
1301                 for(p = subdbp->offset; *p != '\0' && IS_DIGIT(*p) == 0;)
1302                     p++;
1303                 if(StringChr(p, ',') != NULL)
1304                 {
1305                     location = (char*) MemNew(StringLen(p) + 7);
1306                     sprintf(location, "join(%s)", p);
1307                 }
1308                 else
1309                     location = StringSave(p);
1310                 break;
1311             }
1312         }
1313         if(location == NULL || *location == '\0')
1314         {
1315             ErrPostEx(SEV_REJECT, ERR_REFERENCE_UnparsableLocation,
1316                       "NULL or empty reference location. Entry dropped.");
1317             err = true;
1318             if(location != NULL)
1319                 MemFree(location);
1320             break;
1321         }
1322 
1323         if(ibp->is_prot)
1324             fta_strip_aa(location);
1325 
1326         if(pp->buf != NULL)
1327             MemFree(pp->buf);
1328         pp->buf = NULL;
1329 
1330         GetSeqLocation(*feat, location, seqids, &err, pp, (char*) "pub");
1331 
1332         if(err)
1333         {
1334             ErrPostEx(SEV_REJECT, ERR_REFERENCE_UnparsableLocation,
1335                       "Unparsable reference location. Entry dropped.");
1336             MemFree(location);
1337             break;
1338         }
1339 
1340         i = FTASeqLocCheck(feat->GetLocation(), ibp->acnum);
1341 
1342         if(i == 0)
1343         {
1344             ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck, location);
1345             if(pp->debug)
1346             {
1347                 feats.push_back(feat);
1348             }
1349         }
1350         else
1351         {
1352             if(i == 1)
1353             {
1354                 ErrPostEx(SEV_WARNING, ERR_LOCATION_MixedStrand,
1355                           "Mixed strands in SeqLoc: %s", location);
1356             }
1357             feats.push_back(feat);
1358         }
1359         if(location != NULL)
1360             MemFree(location);
1361     }
1362 
1363     if(!err)
1364         return;
1365 
1366     ibp->drop = 1;
1367     feats.clear();
1368 }
1369 
1370 /**********************************************************
1371  *
1372  *   static SeqFeatPtr ImpFeatPub(pp, entry, hsfp, seq_id,
1373  *                                col_data, ibp):
1374  *
1375  *                                              5-26-93
1376  *
1377  **********************************************************/
ImpFeatPub(ParserPtr pp,DataBlkPtr entry,TSeqFeatList & feats,objects::CSeq_id & seq_id,Int4 col_data,IndexblkPtr ibp)1378 static void ImpFeatPub(ParserPtr pp, DataBlkPtr entry, TSeqFeatList& feats,
1379                        objects::CSeq_id& seq_id, Int4 col_data, IndexblkPtr ibp)
1380 {
1381     DataBlkPtr dbp;
1382 
1383     bool    first;
1384 
1385     /* REFERENCE, Imp-feat
1386      */
1387     if(pp->format == Parser::EFormat::XML)
1388         dbp = XMLBuildRefDataBlk(entry->offset, ibp->xip, ParFlat_REF_SITES);
1389     else
1390         dbp = TrackNodeType(entry, ParFlat_REF_SITES);
1391     if(dbp == NULL)
1392         return;
1393 
1394     CRef<objects::CSeq_feat> feat;
1395     for (first = true; dbp != NULL; dbp = dbp->next)
1396     {
1397         if(dbp->type != ParFlat_REF_SITES)
1398             continue;
1399 
1400         CRef<objects::CPubdesc> pubdesc = DescrRefs(pp, dbp, col_data);
1401         if (pubdesc.Empty() || !pubdesc->IsSetPub())
1402             continue;
1403 
1404         if(first)
1405         {
1406             feat.Reset(new objects::CSeq_feat);
1407 
1408             objects::CImp_feat& imp_feat = feat->SetData().SetImp();
1409             imp_feat.SetKey("Site-ref");
1410             imp_feat.SetLoc("sites");
1411 
1412             feat->SetLocation(*fta_get_seqloc_int_whole(seq_id, ibp->bases));
1413             first = false;
1414         }
1415 
1416         CRef<objects::CPub> pub(new objects::CPub);
1417         pub->SetEquiv(pubdesc->SetPub());
1418 
1419         feat->SetCit().SetPub().push_back(pub);
1420 
1421         if (pubdesc->IsSetComment())
1422             feat->SetComment(pubdesc->GetComment());
1423         else
1424             feat->ResetComment();
1425     }
1426 
1427     if (!first && feat.NotEmpty())
1428         feats.push_back(feat);
1429 }
1430 
1431 /**********************************************************/
/* Error handler that ignores both of its arguments and does nothing.
 * GetTrnaAnticodon() installs it via xinstall_gbparse_error_handler()
 * around its call to the location parser - presumably to keep parser
 * messages out of the log there; confirm against the parser sources.
 */
static void fta_fake_gbparse_err_handler(const Char*, const Char*)
{
}
1435 
1436 /**********************************************************/
location_to_string_or_unknown(const objects::CSeq_loc & loc)1437 static Char* location_to_string_or_unknown(const objects::CSeq_loc& loc)
1438 {
1439     Char* ret = location_to_string(loc);
1440     if (ret == NULL)
1441         ret = StringSave("unknown location");
1442 
1443     return ret;
1444 }
1445 
1446 /**********************************************************/
GetTrnaAnticodon(const objects::CSeq_feat & feat,char * qval,const TSeqIdList & seqids,bool accver)1447 static CRef<objects::CSeq_loc> GetTrnaAnticodon(const objects::CSeq_feat& feat, char* qval, const TSeqIdList& seqids,
1448                                                             bool accver)
1449 {
1450     char*    loc_str;
1451     char*    p;
1452     char*    q;
1453     bool    fake1;
1454     bool    fake2;
1455     Int4       range;
1456     Int4       pars;
1457     Char       ch;
1458     int        fake3;
1459 
1460     CRef<objects::CSeq_loc> ret;
1461 
1462     if (qval == NULL)
1463         return ret;
1464 
1465     p = StringStr(qval, "pos:");
1466     if (p == NULL)
1467         return ret;
1468 
1469     for(q = p + 4; *q == ' ';)
1470         q++;
1471 
1472     for(pars = 0, p = q; *p != '\0'; p++)
1473     {
1474         if(*p == ',' && pars == 0)
1475             break;
1476         if(*p == '(')
1477             pars++;
1478         else if(*p == ')')
1479         {
1480             pars--;
1481             if(pars == 0)
1482             {
1483                 p++;
1484                 break;
1485             }
1486         }
1487     }
1488 
1489     ch = *p;
1490     *p = '\0';
1491     loc_str = StringSave(q);
1492     *p = ch;
1493 
1494     xinstall_gbparse_error_handler(fta_fake_gbparse_err_handler);
1495     ret = xgbparseint_ver(loc_str, fake1, fake2, fake3, seqids, accver);
1496     xinstall_gbparse_error_handler(NULL);
1497 
1498     if (ret.Empty())
1499     {
1500         p = location_to_string_or_unknown(feat.GetLocation());
1501 
1502         ErrPostEx(SEV_ERROR, ERR_FEATURE_InvalidAnticodonPos,
1503                   "Invalid position element for an /anticodon qualifier : \"%s\" : qualifier dropped : feature location \"%s\".",
1504                   loc_str, (p == NULL) ? "unknown" : p);
1505 
1506         if (p != NULL)
1507             MemFree(p);
1508         MemFree(loc_str);
1509 
1510         return ret;
1511     }
1512 
1513     range = objects::sequence::GetLength(*ret, &GetScope());
1514     if (range != 3)
1515     {
1516         p = location_to_string_or_unknown(feat.GetLocation());
1517 
1518         if (range == 4)
1519             ErrPostEx(SEV_WARNING, ERR_FEATURE_FourBaseAntiCodon,
1520                       "tRNA feature at \"%s\" has anticodon with location spanning four bases: \"%s\". Cannot generate corresponding codon value from the DNA sequence.",
1521                       (p == NULL) ? "unknown" : p, loc_str);
1522         else
1523             ErrPostEx(SEV_ERROR, ERR_FEATURE_StrangeAntiCodonSize,
1524                       "tRNA feature at \"%s\" has anticodon of an unusual size: \"%s\". Cannot generate corresponding codon value from the DNA sequence.",
1525                       (p == NULL) ? "unknown" : p, loc_str);
1526 
1527         if (p != NULL)
1528             MemFree(p);
1529     }
1530 
1531     // Comparing two locations ignoring their IDs
1532     // Anticodon should be inside the original location (may be the same)
1533     CRange<TSeqPos> anticodon_range = ret->GetTotalRange();
1534     CRange<TSeqPos> xrange = feat.GetLocation().GetTotalRange().IntersectionWith(anticodon_range);
1535 
1536     if (xrange != anticodon_range)
1537     {
1538         p = location_to_string_or_unknown(feat.GetLocation());
1539 
1540         ErrPostEx(SEV_ERROR, ERR_FEATURE_BadAnticodonLoc,
1541                   "Anticodon location \"%s\" does not fall within tRNA feature at \"%s\".",
1542                   loc_str, (p == NULL) ? "unknown" : p);
1543 
1544         if(p != NULL)
1545             MemFree(p);
1546         MemFree(loc_str);
1547 
1548         ret.Reset();
1549         return ret;
1550     }
1551 
1552     MemFree(loc_str);
1553     return ret;
1554 }
1555 
1556 /**********************************************************/
static void fta_parse_rrna_feat(objects::CSeq_feat& feat, objects::CRNA_ref& rna_ref)
{
    /* Derives the RNA name from the feature's "product" qualifier (or,
     * for rRNA, from a short comment), normalizes the various spellings
     * of "ribosomal RNA" in it and stores the result as the name of
     * rna_ref. Does nothing to rna_ref when no product text is found.
     */
    char* qval;
    char* p;
    char* q;
    Char    ch;

    /* NOTE(review): the emptiness check right after the call suggests
     * that GetTheQualValue() may remove the qualifier it returns -
     * confirm against its definition.
     */
    qval = GetTheQualValue(feat.SetQual(), "product");
    if (feat.GetQual().empty())
        feat.ResetQual();

    /* Keep the value in a std::string and release the returned buffer
     */
    std::string qval_str;
    if (qval)
    {
        qval_str = qval;
        MemFree(qval);
        qval = NULL;
    }

    /* No product: for rRNA a comment may serve as the product if it is
     * 16-19 characters long and ends with "S ribosomal RNA", or is 7-15
     * characters long and ends with "S rRNA" (case-insensitive). The
     * comment is then removed from the feature.
     */
    size_t len = 0;
    if (qval_str.empty() && feat.IsSetComment() && rna_ref.GetType() == objects::CRNA_ref::eType_rRNA)
    {
        std::string comment = feat.GetComment();
        len = comment.size();

        if(len > 15 && len < 20)
        {
            if(StringNICmp(comment.c_str() + len - 15, "S ribosomal RNA", 15) == 0)
            {
                qval_str = comment;
                feat.ResetComment();
            }
        }
        else if(len > 6 && len < 20)
        {
            if (StringNICmp(comment.c_str() + len - 6, "S rRNA", 6) == 0)
            {
                qval_str = comment;
                feat.ResetComment();
            }
        }
    }

    if (qval_str.empty())
        return;

    /* "ribosomal rrna" -> "ribosomal rna": the character at p + 10 (the
     * first 'r' of "rrna") is deleted in place; the search then resumes
     * 13 characters further
     */
    qval = StringSave(qval_str.c_str());
    for(p = qval; p != NULL; p += 13)
    {
        p = StringIStr(p, "ribosomal rrna");
        if(p == NULL)
            break;
        fta_StringCpy(p + 10, p + 11);
    }

    /* "ribosomalrna" -> "ribosomal RNA": the result is one character
     * longer, so it is rebuilt in a new buffer (old length + 1 + zero).
     * "len" keeps the offset just past the replacement for the next pass.
     */
    for(p = qval; p != NULL; p = qval + len)
    {
        p = StringIStr(p, "ribosomalrna");
        if(p == NULL)
            break;
        q = (char*) MemNew(StringLen(qval) + 2);
        p[9] = '\0';
        StringCpy(q, qval);
        StringCat(q, " RNA");
        StringCat(q, p + 12);
        len = p - qval + 13;
        MemFree(qval);
        qval = q;
    }

    /* First " rrna" -> " ribosomal RNA" (9 characters longer, plus zero)
     */
    if(qval != NULL)
    {
        p = StringIStr(qval, " rrna");
        if(p != NULL)
        {
            q = (char*) MemNew(StringLen(qval) + 10);
            *p = '\0';
            StringCpy(q, qval);
            StringCat(q, " ribosomal RNA");
            StringCat(q, p + 5);
            MemFree(qval);
            qval = q;
        }
    }

    /* Force the last three characters of "ribosomal DNA" / "ribosomal RNA"
     * (matched case-insensitively) to upper case "RNA". On every pass
     * "ribosomal DNA" is looked for first; "ribosomal RNA" is tried only
     * when no "ribosomal DNA" is left from the current position on.
     */
    for(p = qval, q = p; q != NULL; q = p + 13)
    {
        p = StringIStr(q, "ribosomal DNA");
        if(p == NULL)
        {
            p = StringIStr(q, "ribosomal RNA");
            if(p == NULL)
                break;
        }
        p[10] = 'R';
        p[11] = 'N';
        p[12] = 'A';
    }

    /* If the first "s ribosomal RNA" ends the string and follows a digit
     * (like in "16s ribosomal RNA"), make the 's' upper case
     */
    p = StringIStr(qval, "s ribosomal RNA");
    if(p != NULL && p > qval && p[15] == '\0')
    {
        p--;
        if(*p >= '0' && *p <= '9')
            *++p = 'S';
    }

    /* Append " RNA" to every "ribosomal" that is not at the very start,
     * is followed by a blank or the end of the string, and is not already
     * followed by " RNA". The string grows by 4, so it is rebuilt; "len"
     * is the offset at which the search continues in the new buffer.
     */
    for(p = qval;;)
    {
        p = StringIStr(p, "ribosomal");
        if(p == NULL)
            break;
        if(p == qval || (p[9] != ' ' && p[9] != '\0'))
        {
            p += 9;
            continue;
        }
        if(StringNCmp(p + 9, " RNA", 4) == 0)
        {
            p += 13;
            continue;
        }
        len = p - qval + 14;
        q = (char*) MemNew(StringLen(qval) + 5);
        p += 9;
        ch = *p;
        *p = '\0';
        StringCpy(q, qval);
        StringCat(q, " RNA");
        *p = ch;
        StringCat(q, p);
        MemFree(qval);
        qval = q;
        p = qval + len;
    }

    /* Collapse " ribosomal RNA ribosomal RNA" into a single occurrence
     */
    for(p = qval;;)
    {
        p = StringIStr(p, " ribosomal RNA");
        if(p == NULL)
            break;
        p += 14;
        if(StringNICmp(p, " ribosomal RNA", 14) == 0)
            fta_StringCpy(p, p + 14);
    }

    DeleteQual(feat.SetQual(), "product");
    if (feat.GetQual().empty())
        feat.ResetQual();

    /* Names longer than 511 characters are cut to 511, the last kept
     * character being replaced with '>'
     */
    if(StringLen(qval) > 511)
    {
        qval[510] = '>';
        qval[511] = '\0';
        p = StringSave(qval);
        MemFree(qval);
        qval = p;
    }

    rna_ref.SetExt().SetName(qval);
    MemFree(qval);
}
1719 
1720 /**********************************************************/
fta_get_aa_from_symbol(Char ch)1721 static Uint1 fta_get_aa_from_symbol(Char ch)
1722 {
1723     AaCodonsPtr acp;
1724 
1725     for(acp = aacodons; acp->straa != NULL; acp++)
1726         if(acp->intaa == ch)
1727             break;
1728     if(acp->straa != NULL)
1729         return(acp->intaa);
1730 
1731     return(0);
1732 }
1733 
1734 /**********************************************************/
fta_get_aa_from_string(char * str)1735 static Uint1 fta_get_aa_from_string(char* str)
1736 {
1737     AaCodonsPtr acp;
1738     TrnaAaPtr   tap;
1739 
1740     for(tap = taa; tap->name != NULL; tap++)
1741         if(StringICmp(str, tap->name) == 0)
1742             break;
1743     if(tap->name != NULL)
1744         return(tap->aa);
1745 
1746     for(acp = aacodons; acp->straa != NULL; acp++)
1747         if(StringICmp(acp->straa, str) == 0)
1748             break;
1749     if(acp->straa != NULL)
1750         return(acp->intaa);
1751 
1752     return(0);
1753 }
1754 
1755 /**********************************************************/
get_aa_from_trna(const objects::CTrna_ext & trna)1756 static int get_aa_from_trna(const objects::CTrna_ext& trna)
1757 {
1758     int ret = 0;
1759     if (trna.IsSetAa() && trna.GetAa().IsNcbieaa())
1760         ret = trna.GetAa().GetNcbieaa();
1761 
1762     return ret;
1763 }
1764 
1765 /**********************************************************/
fta_get_trna_from_product(objects::CSeq_feat & feat,const Char * product,unsigned char * remove)1766 static CRef<objects::CTrna_ext> fta_get_trna_from_product(objects::CSeq_feat& feat, const Char* product,
1767                                                                       unsigned char* remove)
1768 {
1769     const char **b;
1770 
1771     char*    p;
1772     char*    q;
1773     char*    start;
1774     char*    end;
1775     char*    first;
1776     char*    second;
1777     char*    third;
1778     char*    fourth;
1779     bool       fmet;
1780     char*    prod;
1781 
1782     if (remove != NULL)
1783         *remove = 0;
1784 
1785     CRef<objects::CTrna_ext> ret(new objects::CTrna_ext);
1786 
1787     if(product == NULL || StringLen(product) < 7)
1788         return ret;
1789 
1790     bool digits = false;
1791     prod = StringSave(product);
1792     for(p = prod, q = prod; *p != '\0'; p++)
1793     {
1794         if(*p >= 'a' && *p <= 'z')
1795             *p &= ~040;
1796         else if((*p < 'A' || *p > 'Z') && *p != '(' && *p != ')')
1797         {
1798             if(*p >= '0' && *p <= '9')
1799                 digits = true;
1800             *p = ' ';
1801         }
1802     }
1803     ShrinkSpaces(prod);
1804 
1805     for(b = trna_tags; *b != NULL; b++)
1806     {
1807         start = StringStr(prod, *b);
1808         if(start != NULL)
1809             break;
1810     }
1811     if(*b == NULL)
1812     {
1813         MemFree(prod);
1814         return ret;
1815     }
1816 
1817     end = start + StringLen(*b);
1818     for(p = end; *p != '\0'; p++)
1819         if(*p == '(' || *p == ')')
1820             *p = ' ';
1821     ShrinkSpaces(prod);
1822 
1823     if(start == prod && *end == '\0')
1824     {
1825         if(remove != NULL && !digits)
1826             *remove = 1;
1827         MemFree(prod);
1828         return ret;
1829     }
1830 
1831     first = NULL;
1832     second = NULL;
1833     third = NULL;
1834     fourth = NULL;
1835     for(p = end; *p == ' ' || *p == ')' || *p == '(';)
1836         p++;
1837     q = p;
1838     if(StringNCmp(p, "F MET", 5) == 0)
1839         p += 5;
1840     else if(StringNCmp(p, "F MT", 4) == 0)
1841         p += 4;
1842     while(*p >= 'A' && *p <= 'Z')
1843         p++;
1844     if(p > q)
1845     {
1846         if(*p != '\0')
1847             *p++ = '\0';
1848         second = q;
1849     }
1850     while(*p == ' ' || *p == ')' || *p == '(')
1851         p++;
1852     for(q = p; *p >= 'A' && *p <= 'Z';)
1853         p++;
1854     if(p > q)
1855     {
1856         if(*p != '\0')
1857             *p++ = '\0';
1858         if(q[1] == '\0')
1859         {
1860             while(*p == ' ' || *p == ')' || *p == '(')
1861                 p++;
1862             for(q = p; *p >= 'A' && *p <= 'Z';)
1863                 p++;
1864             if(p > q)
1865             {
1866                 if(*p != '\0')
1867                     *p++ = '\0';
1868                 third = q;
1869             }
1870         }
1871         else
1872             third = q;
1873 
1874         while(*p == ' ' || *p == '(' || *p == ')')
1875             p++;
1876         if(*p != '\0')
1877             fourth = p;
1878     }
1879     if(start > prod)
1880     {
1881         for(p = start - 1; *p == ' ' || *p == ')' || *p == '('; p--)
1882             if(p == prod)
1883                 break;
1884 
1885         if(p > prod && p[1] == ')')
1886         {
1887             for(p--; *p != '('; p--)
1888                 if(p == prod)
1889                     break;
1890             if(p > prod)
1891             {
1892                 for(p--; *p == ' ' || *p == '(' || *p == '('; p--)
1893                     if(p == prod)
1894                         break;
1895             }
1896         }
1897         if(p > prod)
1898         {
1899             for(q = p++; *q >= 'A' && *q <= 'Z'; q--)
1900                 if(q == prod)
1901                     break;
1902             if(*q < 'A' || *q > 'Z')
1903                 q++;
1904             if(p > q)
1905             {
1906                 *p = '\0';
1907                 first = q;
1908             }
1909         }
1910     }
1911 
1912     fmet = false;
1913     if(second != NULL)
1914     {
1915         if(StringCmp(second, "F MET") == 0 ||
1916            StringCmp(second, "FMET") == 0 ||
1917            StringCmp(second, "F MT") == 0)
1918         {
1919             StringCpy(second, "FMET");
1920             fmet = true;
1921         }
1922 
1923         ret->SetAa().SetNcbieaa(fta_get_aa_from_string(second));
1924         if (get_aa_from_trna(*ret) != 0)
1925             second = NULL;
1926     }
1927 
1928     if (get_aa_from_trna(*ret) == 0 && first != NULL)
1929     {
1930         ret->SetAa().SetNcbieaa(fta_get_aa_from_string(first));
1931         if (get_aa_from_trna(*ret) != 0 && first == prod)
1932             first = NULL;
1933     }
1934 
1935     if(first == NULL && second == NULL && third == NULL && fourth == NULL &&
1936        remove != NULL && !digits)
1937         *remove = 1;
1938     MemFree(prod);
1939 
1940     if (!fmet)
1941         return ret;
1942 
1943     if (!feat.IsSetComment())
1944         feat.SetComment("fMet");
1945     else if (StringIStr(feat.GetComment().c_str(), "fmet") == NULL)
1946     {
1947         std::string& comment = feat.SetComment();
1948         comment += "; fMet";
1949     }
1950 
1951     return ret;
1952 }
1953 
1954 /**********************************************************/
fta_get_trna_from_comment(const Char * comment,unsigned char * remove)1955 static CRef<objects::CTrna_ext> fta_get_trna_from_comment(const Char* comment, unsigned char* remove)
1956 {
1957     char* comm;
1958     char* p;
1959     char* q;
1960 
1961     CRef<objects::CTrna_ext> ret(new objects::CTrna_ext);
1962 
1963     *remove = 0;
1964     if(comment == NULL)
1965         return ret;
1966 
1967     comm = StringSave(comment);
1968     for(p = comm, q = comm; *p != '\0'; p++)
1969     {
1970         if(*p >= 'a' && *p <= 'z')
1971             *p &= ~040;
1972         else if(*p < 'A' || *p > 'Z')
1973             *p = ' ';
1974     }
1975     ShrinkSpaces(comm);
1976 
1977     if(StringNCmp(comm, "CODON RECOGNIZED ", 17) == 0)
1978     {
1979         p = comm + 17;
1980         q = StringChr(p, ' ');
1981         if(q != NULL && StringCmp(q + 1, "PUTATIVE") == 0)
1982             *q = '\0';
1983         if(StringChr(p, ' ') == NULL && StringLen(p) == 3)
1984         {
1985             MemFree(comm);
1986             *remove = (q == NULL) ? 1 : 2;
1987             return ret;
1988         }
1989     }
1990 
1991     if(StringNCmp(comm, "PUTATIVE ", 9) == 0 && comm[10] == ' ' &&
1992        comm[14] == ' ' && StringNCmp(&comm[15], "TRNA", 4) == 0)
1993     {
1994         ret->SetAa().SetNcbieaa(fta_get_aa_from_symbol(comm[9]));
1995         if (get_aa_from_trna(*ret) != 0)
1996         {
1997             MemFree(comm);
1998             return ret;
1999         }
2000     }
2001 
2002     for(q = comm, p = q; p != NULL;)
2003     {
2004         p = StringChr(p, ' ');
2005         if(p != NULL)
2006             *p++ = '\0';
2007 
2008         ret->SetAa().SetNcbieaa(fta_get_aa_from_string(q));
2009         if (get_aa_from_trna(*ret) != 0)
2010             break;
2011         q = p;
2012     }
2013 
2014     MemFree(comm);
2015     return ret;
2016 }
2017 
2018 /**********************************************************/
get_first_codon_from_trna(const objects::CTrna_ext & trna)2019 static int get_first_codon_from_trna(const objects::CTrna_ext& trna)
2020 {
2021     int ret = 255;
2022     if (trna.IsSetCodon() && !trna.GetCodon().empty())
2023         ret = *trna.GetCodon().begin();
2024 
2025     return ret;
2026 }
2027 
2028 /**********************************************************/
GetRnaRef(objects::CSeq_feat & feat,objects::CBioseq & bioseq,Parser::ESource source,bool accver)2029 static void GetRnaRef(objects::CSeq_feat& feat, objects::CBioseq& bioseq,
2030                       Parser::ESource source, bool accver)
2031 {
2032     char*    qval;
2033     char*    p;
2034 
2035     Uint1      remove;
2036 
2037     Int2       type;
2038 
2039     if (!feat.GetData().IsImp())
2040         return;
2041 
2042     const objects::CImp_feat& imp_feat = feat.GetData().GetImp();
2043 
2044     CRef<objects::CRNA_ref> rna_ref(new objects::CRNA_ref);
2045 
2046     type = MatchArrayString(ParFlat_RNA_array, imp_feat.GetKey().c_str());
2047     if (type < 0)
2048         type = 255;
2049     else
2050         ++type;
2051 
2052     rna_ref->SetType(static_cast<objects::CRNA_ref::EType>(type));
2053 
2054     feat.SetData().SetRna(*rna_ref);
2055 
2056     if (type == objects::CRNA_ref::eType_rRNA)
2057     {
2058         fta_parse_rrna_feat(feat, *rna_ref);
2059         return;
2060     }
2061 
2062     CRef<objects::CRNA_gen> rna_gen;
2063     CRef<objects::CRNA_qual_set> rna_quals;
2064 
2065     if (type == objects::CRNA_ref::eType_ncRNA)
2066     {
2067         p = GetTheQualValue(feat.SetQual(), "ncRNA_class");
2068         if(p != NULL)
2069         {
2070             rna_gen.Reset(new objects::CRNA_gen);
2071             rna_gen->SetClass(p);
2072         }
2073     }
2074     else if (type == objects::CRNA_ref::eType_tmRNA)
2075     {
2076         p = GetTheQualValue(feat.SetQual(), "tag_peptide");
2077         if (p != NULL)
2078         {
2079             CRef<objects::CRNA_qual> rna_qual(new objects::CRNA_qual);
2080             rna_qual->SetQual("tag_peptide");
2081             rna_qual->SetVal(p);
2082 
2083             rna_quals.Reset(new objects::CRNA_qual_set);
2084             rna_quals->Set().push_back(rna_qual);
2085 
2086             rna_gen.Reset(new objects::CRNA_gen);
2087             rna_gen->SetQuals(*rna_quals);
2088         }
2089     }
2090 
2091     if (type != objects::CRNA_ref::eType_premsg && type != objects::CRNA_ref::eType_tRNA)    /* mRNA, snRNA, scRNA or other */
2092     {
2093         qval = GetTheQualValue(feat.SetQual(), "product");
2094         if(qval != NULL)
2095         {
2096             p = GetTheQualValue(feat.SetQual(), "product");
2097             if(p != NULL && p[0] != 0)
2098             {
2099                 if (!feat.IsSetComment())
2100                     feat.SetComment(p);
2101                 else
2102                 {
2103                     std::string& comment = feat.SetComment();
2104                     comment += "; ";
2105                     comment += p;
2106                 }
2107             }
2108         }
2109 
2110         if (qval == NULL && type == objects::CRNA_ref::eType_mRNA &&
2111            source != Parser::ESource::EMBL && source != Parser::ESource::DDBJ)
2112            qval = GetTheQualValue(feat.SetQual(), "standard_name");
2113 
2114         if (qval == NULL && feat.IsSetComment() && type == objects::CRNA_ref::eType_mRNA)
2115         {
2116             const Char* c_p = feat.GetComment().c_str();
2117             const Char* c_q = NULL;
2118             for ( ; ; c_p += 5, c_q = c_p)
2119             {
2120                 c_p = StringIStr(c_p, " mRNA");
2121                 if (c_p == NULL)
2122                     break;
2123             }
2124 
2125             const Char* c_r = NULL;
2126             for (c_p = feat.GetComment().c_str(); ; c_p += 4, c_r = c_p)
2127             {
2128                 c_p = StringIStr(c_p, " RNA");
2129                 if (c_p == NULL)
2130                     break;
2131             }
2132 
2133             if (c_q != NULL && c_r != NULL)
2134             {
2135                 c_p = (c_q > c_r) ? c_q : c_r;
2136             }
2137             else if (c_q != NULL)
2138                 c_p = c_q;
2139             else
2140                 c_p = c_r;
2141 
2142             if (c_p != NULL)
2143             {
2144                 while (*c_p == ' ' || *c_p == '\t' || *c_p == ',' || *c_p == ';')
2145                     ++c_p;
2146 
2147                 if (*c_p == '\0')
2148                 {
2149                     qval = StringSave(feat.GetComment().c_str());
2150                     feat.ResetComment();
2151                 }
2152             }
2153         }
2154 
2155         if (qval != NULL)
2156         {
2157             if(StringLen(qval) > 511)
2158             {
2159                 qval[510] = '>';
2160                 qval[511] = '\0';
2161                 p = StringSave(qval);
2162                 MemFree(qval);
2163                 qval = p;
2164             }
2165 
2166             if (type > objects::CRNA_ref::eType_snoRNA && type <= objects::CRNA_ref::eType_miscRNA)
2167             {
2168                 if (rna_gen.Empty())
2169                     rna_gen.Reset(new objects::CRNA_gen);
2170 
2171                 rna_gen->SetProduct(qval);
2172             }
2173             else
2174             {
2175                 rna_ref->SetExt().SetName(qval);
2176             }
2177         }
2178     }
2179 
2180     if (feat.GetQual().empty())
2181         feat.ResetQual();
2182 
2183     if (rna_gen.NotEmpty())
2184     {
2185         rna_ref->SetExt().SetGen(*rna_gen);
2186     }
2187 
2188     if (type != objects::CRNA_ref::eType_tRNA)                  /* if tRNA and codon value exist */
2189         return;
2190 
2191     qval = GetTheQualValue(feat.SetQual(), "anticodon");
2192     CRef<objects::CTrna_ext> trnaa;
2193     if (qval != NULL)
2194     {
2195         bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_na);
2196 
2197         CRef<objects::CSeq_loc> anticodon = GetTrnaAnticodon(feat, qval, bioseq.GetId(), accver);
2198         if (anticodon.NotEmpty())
2199         {
2200             trnaa.Reset(new objects::CTrna_ext);
2201 
2202             /* value has format: (pos:base_range, aa:amino_acid)
2203              */
2204             trnaa->SetAa().SetNcbieaa(GetQualValueAa(qval, true));
2205             trnaa->SetAnticodon(*anticodon);
2206             rna_ref->SetExt().SetTRNA(*trnaa);
2207         }
2208 
2209         MemFree(qval);
2210     }
2211 
2212     qval = CpTheQualValue(feat.SetQual(), "product");
2213 
2214     CRef<objects::CTrna_ext> trnap;
2215     if (qval != NULL)
2216     {
2217         trnap = fta_get_trna_from_product(feat, qval, NULL);
2218         MemFree(qval);
2219     }
2220 
2221     if (feat.IsSetComment() && feat.GetComment().empty())
2222     {
2223         feat.ResetComment();
2224     }
2225 
2226     remove = 0;
2227     CRef<objects::CTrna_ext> trnac;
2228     if (feat.IsSetComment())
2229     {
2230         trnac = fta_get_trna_from_product(feat, feat.GetComment().c_str(), &remove);
2231 
2232         if (get_aa_from_trna(*trnac) == 0)
2233         {
2234             trnac = fta_get_trna_from_comment(feat.GetComment().c_str(), &remove);
2235         }
2236 
2237         if (get_aa_from_trna(*trnac) == 0 && get_first_codon_from_trna(*trnac) == 255)
2238         {
2239             trnac.Reset();
2240         }
2241     }
2242 
2243     if (trnaa.Empty())
2244     {
2245         if (trnap.Empty())
2246         {
2247             if (trnac.NotEmpty() && get_aa_from_trna(*trnac) != 0)
2248             {
2249                 rna_ref->SetExt().SetTRNA(*trnac);
2250                 if(remove != 0)
2251                 {
2252                     feat.ResetComment();
2253                 }
2254             }
2255         }
2256         else
2257         {
2258             rna_ref->SetExt().SetTRNA(*trnap);
2259 
2260             if (get_aa_from_trna(*trnap) == 0)
2261             {
2262                 if (trnac.NotEmpty() && get_aa_from_trna(*trnac) != 0)
2263                     rna_ref->SetExt().SetTRNA(*trnac);
2264             }
2265             else if (trnac.NotEmpty())
2266             {
2267                 if (get_aa_from_trna(*trnac) == 0 && get_first_codon_from_trna(*trnac) != 255 &&
2268                     get_first_codon_from_trna(*trnap) == 255 && remove != 0)
2269                 {
2270                     trnap->SetCodon().assign(trnac->GetCodon().begin(), trnac->GetCodon().end());
2271 
2272                     feat.ResetComment();
2273                     if(remove == 2)
2274                         feat.SetComment("putative");
2275                 }
2276 
2277                 if (get_aa_from_trna(*trnac) == get_aa_from_trna(*trnap) && remove != 0)
2278                 {
2279                     feat.ResetComment();
2280                 }
2281             }
2282         }
2283     }
2284     else
2285     {
2286         if(trnap.NotEmpty())
2287         {
2288             trnap.Reset();
2289         }
2290 
2291         if (trnac.NotEmpty() && get_aa_from_trna(*trnac) != 0)
2292         {
2293             if (get_aa_from_trna(*trnac) == get_aa_from_trna(*trnaa) || get_aa_from_trna(*trnaa) == 88)
2294             {
2295                 trnac->SetAnticodon(trnaa->SetAnticodon());
2296                 trnaa->ResetAnticodon();
2297 
2298                 if (get_first_codon_from_trna(*trnac) == 255)
2299                 {
2300                     trnac->SetCodon().assign(trnaa->GetCodon().begin(), trnaa->GetCodon().end());
2301                 }
2302 
2303                 rna_ref->SetExt().SetTRNA(*trnac);
2304                 if(remove != 0)
2305                 {
2306                     feat.ResetComment();
2307                 }
2308             }
2309         }
2310     }
2311 
2312     if (feat.GetQual().empty())
2313         feat.ResetQual();
2314 
2315     if (rna_ref->IsSetExt() && rna_ref->GetExt().IsTRNA())
2316     {
2317         const objects::CTrna_ext& trna = rna_ref->GetExt().GetTRNA();
2318         if (get_aa_from_trna(trna) == 0 && !trna.IsSetAnticodon())
2319         {
2320             rna_ref->ResetExt();
2321         }
2322     }
2323 }
2324 
2325 /**********************************************************
2326  *
 *   static void GetImpFeat(feat, fbp, locmap):
2328  *
2329  *      'replace' in loc will be changed later
2330  *   in SeqEntryToAsn3Ex.
2331  *
2332  *                                              01/07/97
2333  *
2334  **********************************************************/
GetImpFeat(objects::CSeq_feat & feat,FeatBlkPtr fbp,bool locmap)2335 static void GetImpFeat(objects::CSeq_feat& feat, FeatBlkPtr fbp, bool locmap)
2336 {
2337     CRef<objects::CImp_feat> imp_feat(new objects::CImp_feat);
2338     imp_feat->SetKey(fbp->key);
2339 
2340     if (locmap)
2341         imp_feat->SetLoc(fbp->location);
2342 
2343     feat.SetData().SetImp(*imp_feat);
2344 }
2345 
2346 /**********************************************************/
fta_sort_biosource(objects::CBioSource & bio)2347 void fta_sort_biosource(objects::CBioSource& bio)
2348 {
2349     if(bio.CanGetOrg() && !bio.GetOrg().GetDb().empty())
2350     {
2351         NON_CONST_ITERATE(objects::COrg_ref::TDb, db, bio.SetOrg().SetDb())
2352         {
2353             if (!(*db)->CanGetDb())
2354                 continue;
2355 
2356             objects::COrg_ref::TDb::iterator tdb = db;
2357             for (++tdb; tdb != bio.SetOrg().SetDb().end(); ++tdb)
2358             {
2359                 if (!(*tdb)->IsSetDb())
2360                     continue;
2361 
2362                 if ((*db)->GetDb() < (*tdb)->GetDb())
2363                     continue;
2364 
2365                 if ((*db)->GetDb() == (*tdb)->GetDb())
2366                 {
2367                     const objects::CObject_id& db_id = (*db)->GetTag();
2368                     const objects::CObject_id& tdb_id = (*tdb)->GetTag();
2369 
2370                     if (!db_id.IsStr() && tdb_id.IsStr())
2371                         continue;
2372 
2373                     if (db_id.IsStr() && tdb_id.IsStr() &&
2374                         db_id.GetStr() <= tdb_id.GetStr())
2375                         continue;
2376 
2377                     if (!db_id.IsStr() && !tdb_id.IsStr() &&
2378                         db_id.GetId() <= tdb_id.GetId())
2379                         continue;
2380                 }
2381 
2382                 db->Swap(*tdb);
2383             }
2384         }
2385 
2386         if (bio.GetOrg().IsSetOrgname() && bio.GetOrg().GetOrgname().IsSetMod())
2387         {
2388             NON_CONST_ITERATE(objects::COrgName::TMod, mod, bio.SetOrg().SetOrgname().SetMod())
2389             {
2390                 objects::COrgName::TMod::iterator tmod = mod;
2391                 for (++tmod; tmod != bio.SetOrg().SetOrgname().SetMod().end(); ++tmod)
2392                 {
2393                     if ((*mod)->GetSubtype() < (*tmod)->GetSubtype())
2394                         continue;
2395 
2396                     if ((*mod)->GetSubtype() == (*tmod)->GetSubtype() &&
2397                         (*mod)->GetSubname() <= (*tmod)->GetSubname())
2398                         continue;
2399 
2400                     mod->Swap(*tmod);
2401                 }
2402             }
2403         }
2404     }
2405 
2406     if (!bio.IsSetSubtype())
2407         return;
2408 
2409     NON_CONST_ITERATE(objects::CBioSource::TSubtype, sub, bio.SetSubtype())
2410     {
2411         objects::CBioSource::TSubtype::iterator tsub = sub;
2412         for (++tsub; tsub != bio.SetSubtype().end(); ++tsub)
2413         {
2414             if ((*sub)->GetSubtype() < (*tsub)->GetSubtype())
2415                 continue;
2416 
2417             if ((*sub)->GetSubtype() == (*tsub)->GetSubtype() &&
2418                 (*sub)->GetName() <= (*tsub)->GetName())
2419                 continue;
2420 
2421             sub->Swap(*tsub);
2422         }
2423     }
2424 }
2425 
2426 /**********************************************************/
ConvertQualifierValue(CRef<objects::CGb_qual> & qual)2427 static void ConvertQualifierValue(CRef<objects::CGb_qual>& qual)
2428 {
2429     std::string val = qual->GetVal();
2430     bool has_comma = val.find(',') != std::string::npos;
2431 
2432     if (has_comma)
2433     {
2434         std::replace(val.begin(), val.end(), ',', ';');
2435         qual->SetVal(val);
2436     }
2437 
2438     if (has_comma)
2439         ErrPostEx(SEV_WARNING, ERR_QUALIFIER_MultRptUnitComma,
2440         "Converting commas to semi-colons due to format conventions for multiple /rpt_unit qualifiers.");
2441 }
2442 
2443 /**********************************************************/
fta_parse_rpt_units(FeatBlkPtr fbp)2444 static void fta_parse_rpt_units(FeatBlkPtr fbp)
2445 {
2446     char*   p;
2447 
2448     if(fbp == NULL || fbp->quals.empty())
2449         return;
2450 
2451     TQualVector::iterator first = fbp->quals.end();
2452     size_t len = 0, count = 0;
2453 
2454     for (TQualVector::iterator qual = fbp->quals.begin(); qual != fbp->quals.end();)
2455     {
2456         if ((*qual)->GetQual() != "rpt_unit")
2457         {
2458             ++qual;
2459             continue;
2460         }
2461 
2462         ErrPostEx(SEV_ERROR, ERR_QUALIFIER_ObsoleteRptUnit,
2463                   "Obsolete /rpt_unit qualifier found on feature \"%s\" at location \"%s\".",
2464                   (fbp->key == NULL) ? "Unknown" : fbp->key,
2465                   (fbp->location == NULL) ? "unknown" : fbp->location);
2466 
2467         if ((*qual)->GetVal().empty())
2468         {
2469             qual = fbp->quals.erase(qual);
2470             continue;
2471         }
2472 
2473         count++;
2474         len += (*qual)->GetVal().size();
2475         if (first == fbp->quals.end())
2476             first = qual;
2477 
2478         if (count == 1)
2479         {
2480             ++qual;
2481             continue;
2482         }
2483 
2484         if(count == 2)
2485             ConvertQualifierValue(*first);
2486 
2487         ConvertQualifierValue(*qual);
2488         ++qual;
2489     }
2490 
2491     if(count == 0)
2492         return;
2493 
2494     if(count == 1)
2495     {
2496         const std::string& val = (*first)->GetVal();
2497         if(*val.begin() == '(' && *val.rbegin() == ')')
2498         {
2499             ConvertQualifierValue(*first);
2500         }
2501         return;
2502     }
2503 
2504     p = (char*) MemNew(len + count + 2);
2505     StringCpy(p, "(");
2506     StringCat(p, (*first)->GetVal().c_str());
2507 
2508     for (TQualVector::iterator qual = first; qual != fbp->quals.end();)
2509     {
2510         if ((*qual)->GetQual() != "rpt_unit")
2511         {
2512             ++qual;
2513             continue;
2514         }
2515 
2516         StringCat(p, ",");
2517         StringCat(p, (*qual)->GetVal().c_str());
2518         qual = fbp->quals.erase(qual);
2519     }
2520     StringCat(p, ")");
2521     (*first)->SetVal(p);
2522 }
2523 
2524 /**********************************************************/
fta_check_evidence(objects::CSeq_feat & feat,FeatBlkPtr fbp)2525 static bool fta_check_evidence(objects::CSeq_feat& feat, FeatBlkPtr fbp)
2526 {
2527     Int4      evi_exp;
2528     Int4      evi_not;
2529     Int4      exp_good;
2530     Int4      exp_bad;
2531     Int4      inf_good;
2532     Int4      inf_bad;
2533     Char      ch;
2534 
2535     if (fbp == NULL || fbp->quals.empty())
2536         return true;
2537 
2538     evi_exp = 0;
2539     evi_not = 0;
2540     exp_good = 0;
2541     exp_bad = 0;
2542     inf_good = 0;
2543     inf_bad = 0;
2544 
2545     for (TQualVector::iterator qual = fbp->quals.begin(); qual != fbp->quals.end();)
2546     {
2547         const std::string& qual_str = (*qual)->IsSetQual() ? (*qual)->GetQual() : "";
2548         const std::string& val_str = (*qual)->IsSetVal() ? (*qual)->GetVal() : "";
2549         if (qual_str == "experiment")
2550         {
2551             if (val_str == "experimental evidence, no additional details recorded")
2552             {
2553                 exp_good++;
2554                 qual = fbp->quals.erase(qual);
2555             }
2556             else
2557             {
2558                 exp_bad++;
2559                 ++qual;
2560             }
2561             continue;
2562         }
2563 
2564         if (qual_str == "inference")
2565         {
2566             if (val_str == "non-experimental evidence, no additional details recorded")
2567             {
2568                 inf_good++;
2569                 qual = fbp->quals.erase(qual);
2570             }
2571             else
2572             {
2573                 inf_bad++;
2574                 ++qual;
2575             }
2576             continue;
2577         }
2578 
2579         if (qual_str != "evidence")
2580         {
2581             ++qual;
2582             continue;
2583         }
2584 
2585         if (StringICmp(val_str.c_str(), "not_experimental") == 0)
2586             evi_not++;
2587         else if (StringICmp(val_str.c_str(), "experimental") == 0)
2588             evi_exp++;
2589         else
2590         {
2591             if(fbp->location != NULL && StringLen(fbp->location) > 50)
2592             {
2593                 ch = fbp->location[50];
2594                 fbp->location[50] = '\0';
2595             }
2596             else
2597                 ch = '\0';
2598             ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidEvidence,
2599                       "Illegal value \"%s\" for /evidence qualifier on the \"%s\" feature at \"%s\". Qualifier dropped.",
2600                       (val_str.empty()) ? "Unknown" : val_str.c_str(),
2601                       (fbp->key == NULL) ? "Unknown" : fbp->key,
2602                       (fbp->location == NULL) ? "unknown location" : fbp->location);
2603             if(ch != '\0')
2604                 fbp->location[50] = ch;
2605         }
2606 
2607         qual = fbp->quals.erase(qual);
2608     }
2609 
2610     if(evi_exp + evi_not > 0 && exp_good + exp_bad + inf_good + inf_bad > 0)
2611     {
2612         if(fbp->location != NULL && StringLen(fbp->location) > 50)
2613         {
2614             ch = fbp->location[50];
2615             fbp->location[50] = '\0';
2616         }
2617         else
2618             ch = '\0';
2619         ErrPostEx(SEV_REJECT, ERR_QUALIFIER_Conflict,
2620                   "Old /evidence and new /experiment or /inference qualifiers both exist on the \"%s\" feature at \"%s\". This is currently unsupported.",
2621                   (fbp->key == NULL) ? "Unknown" : fbp->key,
2622                   (fbp->location == NULL) ? "unknown location" : fbp->location);
2623         if(ch != '\0')
2624             fbp->location[50] = ch;
2625         return false;
2626     }
2627 
2628     if(evi_exp + exp_good > 0 && evi_not + inf_good > 0)
2629     {
2630         if(fbp->location != NULL && StringLen(fbp->location) > 50)
2631         {
2632             ch = fbp->location[50];
2633             fbp->location[50] = '\0';
2634         }
2635         else
2636             ch = '\0';
2637         ErrPostEx(SEV_REJECT, ERR_QUALIFIER_Conflict,
2638                   "The special \"no additional details recorded\" values for both /experiment and /inference exist on the \"%s\" feature at \"%s\". This is currently unsupported.",
2639                   (fbp->key == NULL) ? "Unknown" : fbp->key,
2640                   (fbp->location == NULL) ? "unknown location" : fbp->location);
2641         if(ch != '\0')
2642             fbp->location[50] = ch;
2643         return false;
2644     }
2645 
2646     if((exp_good > 0 && exp_bad > 0) || (inf_good > 0 && inf_bad > 0))
2647     {
2648         if(fbp->location != NULL && StringLen(fbp->location) > 50)
2649         {
2650             ch = fbp->location[50];
2651             fbp->location[50] = '\0';
2652         }
2653         else
2654             ch = '\0';
2655         ErrPostEx(SEV_REJECT, ERR_QUALIFIER_Conflict,
2656                   "The special \"no additional details recorded\" value for /experiment or /inference exists in conjunction with other /experiment or /inference qualifiers on the \"%s\" feature at \"%s\". This is currently unsupported.",
2657                   (fbp->key == NULL) ? "Unknown" : fbp->key,
2658                   (fbp->location == NULL) ? "unknown location" : fbp->location);
2659         if(ch != '\0')
2660             fbp->location[50] = ch;
2661         return false;
2662     }
2663 
2664     if(exp_good + evi_exp > 0)
2665         feat.SetExp_ev(objects::CSeq_feat::eExp_ev_experimental);
2666     else if (inf_good + evi_not > 0)
2667         feat.SetExp_ev(objects::CSeq_feat::eExp_ev_not_experimental);
2668     return true;
2669 }
2670 
2671 /**********************************************************
2672  *
2673  *   static CRef<objects::CSeq_feat> ProcFeatBlk(pp, fbp, seqids):
2674  *
2675  *      Process each feature sub-block.
 *      The location is converted to a SeqLocPtr by calling
 *   Karl's routine, Nml_gbparseint, which returns
 *   locmap = TRUE if the location mapping rules do not work
 *   (then SeqLocPtr->whole = seqids[0]), sitesmap = TRUE if
 *   a "(sites" string was found, and num_errs > 0 if any
 *   errors occurred.
 *      If there is an illegal location, the qualifier is
 *   assigned to be an Imp-feat.
2683  *
2684  **********************************************************/
ProcFeatBlk(ParserPtr pp,FeatBlkPtr fbp,TSeqIdList & seqids)2685 static CRef<objects::CSeq_feat> ProcFeatBlk(ParserPtr pp, FeatBlkPtr fbp, TSeqIdList& seqids)
2686 {
2687     const char **b;
2688 
2689     char* loc = NULL;
2690 
2691     bool    locmap = false;
2692     bool    err = false;
2693 
2694     CRef<objects::CSeq_feat> feat;
2695 
2696     if (fbp->location != NULL)
2697     {
2698         loc = fbp->location;
2699         DelCharBtwData(loc);
2700         if(pp->buf != NULL)
2701             MemFree(pp->buf);
2702         pp->buf = (char*) MemNew(StringLen(fbp->key) + StringLen(loc) + 4);
2703         StringCpy(pp->buf, fbp->key);
2704         StringCat(pp->buf, " : ");
2705         StringCat(pp->buf, loc);
2706 
2707         feat.Reset(new objects::CSeq_feat);
2708         locmap = GetSeqLocation(*feat, loc, seqids, &err, pp, fbp->key);
2709 
2710         if(pp->buf != NULL)
2711             MemFree(pp->buf);
2712         pp->buf = NULL;
2713     }
2714     if(err)
2715     {
2716         if(pp->debug == false)
2717         {
2718             ErrPostEx(SEV_ERROR, ERR_FEATURE_Dropped,
2719                       "%s|%s| range check detects problems", fbp->key, loc);
2720             feat.Reset();
2721             return feat;
2722         }
2723         ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck,
2724                   "%s|%s| range check detects problems", fbp->key, loc);
2725     }
2726 
2727     if (!fbp->quals.empty()) {
2728         if (DeleteQual(fbp->quals, "partial"))
2729             feat->SetPartial(true);
2730     }
2731 
2732     if (StringStr(loc, "order") != NULL)
2733         feat->SetPartial(true);
2734 
2735     if (!fbp->quals.empty())
2736     {
2737         if (DeleteQual(fbp->quals, "pseudo"))
2738             feat->SetPseudo(true);
2739     }
2740 
2741     if (!fbp->quals.empty())
2742         DeleteQual(fbp->quals, "gsdb_id");
2743 
2744     if (!fbp->quals.empty())
2745         fta_parse_rpt_units(fbp);
2746 
2747     if (!fbp->quals.empty())
2748     {
2749         for(b = TransSplicingFeats; *b != NULL; b++)
2750             if(StringCmp(fbp->key, *b) == 0)
2751                 break;
2752         if (*b != NULL && DeleteQual(fbp->quals, "trans_splicing"))
2753         {
2754             feat->SetExcept(true);
2755             if (!feat->IsSetExcept_text())
2756                 feat->SetExcept_text("trans-splicing");
2757             else
2758             {
2759                 std::string& exc_text = feat->SetExcept_text();
2760                 exc_text += ", trans-splicing";
2761             }
2762         }
2763     }
2764 
2765     if(!fta_check_evidence(*feat, fbp))
2766     {
2767         pp->entrylist[pp->curindx]->drop = 1;
2768         return feat;
2769     }
2770 
2771     if ((!feat->IsSetPartial() || !feat->GetPartial()) && StringCmp(fbp->key, "gap") != 0) {
2772         if (SeqLocHaveFuzz(feat->GetLocation()))
2773             feat->SetPartial(true);
2774     }
2775 
2776     if (!fbp->quals.empty())
2777     {
2778         const Char* comment = GetTheQualValue(fbp->quals, "note");
2779 
2780         if (comment && comment[0])
2781             feat->SetComment(comment);
2782     }
2783 
2784     /* assume all imp for now
2785      */
2786     if (StringStr(fbp->key, "source") == NULL)
2787         GetImpFeat(*feat, fbp, locmap);
2788 
2789     ITERATE(TQualVector, cur, fbp->quals)
2790     {
2791         const std::string& qual_str = (*cur)->GetQual();
2792         if (qual_str == "pseudogene")
2793             feat->SetPseudo(true);
2794 
2795         // Do nothing for 'translation' qualifier in case of its value is empty
2796         if (qual_str == "translation" && (!(*cur)->IsSetVal() || (*cur)->GetVal().empty()))
2797             continue;
2798 
2799         if (!qual_str.empty())
2800             feat->SetQual().push_back(*cur);
2801     }
2802 
2803     return feat;
2804 }
2805 
2806 /**********************************************************/
fta_get_gcode_from_biosource(const objects::CBioSource & bio_src,IndexblkPtr ibp)2807 static void fta_get_gcode_from_biosource(const objects::CBioSource& bio_src, IndexblkPtr ibp)
2808 {
2809     if (!bio_src.IsSetOrg() || !bio_src.GetOrg().IsSetOrgname())
2810         return;
2811 
2812     ibp->gc_genomic = bio_src.GetOrg().GetOrgname().IsSetGcode() ? bio_src.GetOrg().GetOrgname().GetGcode() : 0;
2813     ibp->gc_mito = bio_src.GetOrg().GetOrgname().IsSetMgcode() ? bio_src.GetOrg().GetOrgname().GetMgcode() : 0;
2814 }
2815 
2816 /**********************************************************/
fta_sort_quals(FeatBlkPtr fbp,bool qamode)2817 static void fta_sort_quals(FeatBlkPtr fbp, bool qamode)
2818 {
2819     if(fbp == NULL)
2820         return;
2821 
2822     NON_CONST_ITERATE(TQualVector, q, fbp->quals)
2823     {
2824         if((*q)->GetQual() == "gene" ||
2825            (!qamode && (*q)->GetQual() == "product"))
2826             continue;
2827 
2828         TQualVector::iterator tq = q;
2829         for (++tq; tq != fbp->quals.end(); ++tq)
2830         {
2831             const std::string& q_qual = (*q)->GetQual();
2832             const std::string& tq_qual = (*tq)->GetQual();
2833 
2834             if (!tq_qual.empty())
2835             {
2836                 if (q_qual == "gene")
2837                     continue;
2838 
2839                 Int4 i = StringICmp(q_qual.c_str(), tq_qual.c_str());
2840                 if(i < 0)
2841                     continue;
2842                 if(i == 0)
2843                 {
2844                     /* Do not sort /gene qualifiers
2845                      */
2846                     const std::string q_val = (*q)->GetVal();
2847                     const std::string tq_val = (*tq)->GetVal();
2848 
2849                     if (q_val.empty())
2850                         continue;
2851 
2852                     if(!tq_val.empty())
2853                     {
2854                         if(q_val[0] >= '0' && q_val[0] <= '9' &&
2855                            tq_val[0] >= '0' && tq_val[0] <= '9')
2856                         {
2857                             if(atoi(q_val.c_str()) <= atoi(tq_val.c_str()))
2858                                 continue;
2859                         }
2860                         else if(q_val <= tq_val)
2861                             continue;
2862                     }
2863                 }
2864             }
2865 
2866             q->Swap(*tq);
2867         }
2868     }
2869 }
2870 
2871 /**********************************************************/
fta_qual_a_in_b(const TQualVector & qual1,const TQualVector & qual2)2872 static bool fta_qual_a_in_b(const TQualVector& qual1, const TQualVector& qual2)
2873 {
2874     bool found = false;
2875 
2876     ITERATE(TQualVector, gbqp1, qual1)
2877     {
2878         found = false;
2879         ITERATE(TQualVector, gbqp2, qual2)
2880         {
2881             const Char* qual_a = (*gbqp1)->IsSetQual() ? (*gbqp1)->GetQual().c_str() : NULL;
2882             const Char* qual_b = (*gbqp2)->IsSetQual() ? (*gbqp2)->GetQual().c_str() : NULL;
2883 
2884             const Char* val_a = (*gbqp1)->IsSetVal() ? (*gbqp1)->GetVal().c_str() : NULL;
2885             const Char* val_b = (*gbqp2)->IsSetVal() ? (*gbqp2)->GetVal().c_str() : NULL;
2886 
2887             if (fta_strings_same(qual_a, qual_b) && fta_strings_same(val_a, val_b))
2888             {
2889                 found = true;
2890                 break;
2891             }
2892         }
2893         if (!found)
2894             break;
2895     }
2896 
2897     if (!found)
2898         return false;
2899 
2900     return true;
2901 }
2902 
2903 /**********************************************************/
fta_feats_same(FeatBlkPtr fbp1,FeatBlkPtr fbp2)2904 static bool fta_feats_same(FeatBlkPtr fbp1, FeatBlkPtr fbp2)
2905 {
2906     if(fbp1 == NULL && fbp2 == NULL)
2907         return true;
2908     if(fbp1 == NULL || fbp2 == NULL ||
2909        fta_strings_same(fbp1->key, fbp2->key) == false ||
2910        fta_strings_same(fbp1->location, fbp2->location) == false)
2911         return false;
2912 
2913     if (fta_qual_a_in_b(fbp1->quals, fbp2->quals) && fta_qual_a_in_b(fbp2->quals, fbp1->quals))
2914         return true;
2915 
2916     return false;
2917 }
2918 
2919 /**********************************************************/
/* Scans the run of decimal digits starting at *pp and advances *pp past it.
 *
 * Returns true and stores the number in *value when it does not exceed
 * 'cap'. Returns false once the number would exceed 'cap'; the remaining
 * digits are still consumed, and *value then holds the last partial result
 * that still fit. The arithmetic cannot overflow, however long the digit
 * run is. A run of zero digits yields 0 and leaves *pp unchanged.
 */
static bool fta_scan_capped_number(const char** pp, size_t cap, size_t* value)
{
    const char* p = *pp;
    size_t      v = 0;
    bool        fits = true;

    for (; *p >= '0' && *p <= '9'; p++)
    {
        size_t digit = (size_t) (*p - '0');

        if (!fits)
            continue;

        /* v * 10 + digit <= cap, rearranged so nothing can wrap */
        if (digit > cap || v > (cap - digit) / 10)
            fits = false;
        else
            v = v * 10 + digit;
    }

    *pp = p;
    *value = v;
    return fits;
}

/* Validates a /rpt_unit_range value of the form "<from>..<to>".
 *
 * val    - the qualifier value; NULL or empty is rejected.
 * length - upper bound for <to>.
 *
 * Returns true only when val consists of one or more decimal digits, the
 * two characters "..", and one or more decimal digits up to the end of the
 * string, with 1 <= from <= to <= length. Leading zeros are accepted.
 * Numbers too large to be valid are rejected instead of being wrapped by
 * an integer conversion.
 */
static bool fta_check_rpt_unit_span(const char* val, size_t length)
{
    const char* p;
    const char* to_start;
    size_t      from = 0;
    size_t      to = 0;
    bool        from_fits;
    bool        to_fits;

    if (val == NULL || *val == '\0')
        return false;

    p = val;
    from_fits = fta_scan_capped_number(&p, length, &from);
    if (p == val || p[0] != '.' || p[1] != '.')
        return false;

    p += 2;
    to_start = p;
    to_fits = fta_scan_capped_number(&p, length, &to);
    if (p == to_start || *p != '\0')
        return false;

    /* Anything above 'length' cannot satisfy from <= to <= length. */
    if (!from_fits || !to_fits)
        return false;

    return from != 0 && from <= to;
}
2947 
2948 /**********************************************************/
fta_check_rpt_unit_range(FeatBlkPtr fbp,size_t length)2949 static void fta_check_rpt_unit_range(FeatBlkPtr fbp, size_t length)
2950 {
2951     Char      ch;
2952 
2953     if (fbp == NULL || fbp->quals.empty())
2954         return;
2955 
2956     for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end();)
2957     {
2958         if (!(*cur)->IsSetQual() || !(*cur)->IsSetVal())
2959         {
2960             ++cur;
2961             continue;
2962         }
2963 
2964         const std::string& qual_str = (*cur)->GetQual();
2965         const std::string& val_str = (*cur)->GetVal();
2966 
2967         if (qual_str != "rpt_unit_range" || fta_check_rpt_unit_span(val_str.c_str(), length))
2968         {
2969             ++cur;
2970             continue;
2971         }
2972 
2973         if(fbp->location != NULL && StringLen(fbp->location) > 20)
2974         {
2975             ch = fbp->location[20];
2976             fbp->location[20] = '\0';
2977         }
2978         else
2979             ch = '\0';
2980         ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidRptUnitRange,
2981                   "/rpt_unit_range qualifier \"%s\" on feature \"%s\" at location \"%s%s\" is not a valid basepair range. Qualifier dropped.",
2982                   val_str.empty() ? "(EMPTY)" : val_str.c_str(),
2983                   (fbp->key == NULL) ? "Unknown" : fbp->key,
2984                   (fbp->location == NULL) ? "unknown" : fbp->location,
2985                   (ch == '\0') ? "" : "...");
2986         if(ch != '\0')
2987             fbp->location[20] = ch;
2988 
2989         cur = fbp->quals.erase(cur);
2990     }
2991 }
2992 
2993 /**********************************************************/
fta_remove_dup_feats(DataBlkPtr dbp)2994 static void fta_remove_dup_feats(DataBlkPtr dbp)
2995 {
2996     DataBlkPtr tdbp;
2997     DataBlkPtr tdbpprev;
2998     DataBlkPtr tdbpnext;
2999     FeatBlkPtr fbp1;
3000     FeatBlkPtr fbp2;
3001     Char       ch;
3002 
3003     if(dbp == NULL || dbp->next == NULL)
3004         return;
3005 
3006     for(; dbp != NULL; dbp = dbp->next)
3007     {
3008         if(dbp->data == NULL)
3009             continue;
3010 
3011         fbp1 = (FeatBlkPtr) dbp->data;
3012         tdbpprev = dbp;
3013         for(tdbp = dbp->next; tdbp != NULL; tdbp = tdbpnext)
3014         {
3015             tdbpnext = tdbp->next;
3016             if(tdbp->data == NULL)
3017             {
3018                 tdbpprev->next = tdbpnext;
3019                 MemFree(tdbp);
3020                 continue;
3021             }
3022 
3023             fbp2 = (FeatBlkPtr) tdbp->data;
3024 
3025             if(fbp1->location != NULL && fbp2->location != NULL &&
3026                StringCmp(fbp1->location, fbp2->location) < 0)
3027                 break;
3028 
3029             if(!fta_feats_same(fbp1, fbp2))
3030             {
3031                 tdbpprev = tdbp;
3032                 continue;
3033             }
3034 
3035             if(fbp2->location != NULL && StringLen(fbp2->location) > 20)
3036             {
3037                 ch = fbp2->location[20];
3038                 fbp2->location[20] = '\0';
3039             }
3040             else
3041                 ch = '\0';
3042             ErrPostEx(SEV_WARNING, ERR_FEATURE_DuplicateRemoved,
3043                       "Duplicated feature \"%s\" at location \"%s%s\" removed.",
3044                       (fbp2->key == NULL) ? "???" : fbp2->key,
3045                       (fbp2->location == NULL) ? "???" : fbp2->location,
3046                       (ch == '\0') ? "" : "...");
3047 
3048             FreeFeatBlkQual(fbp2);
3049             tdbpprev->next = tdbpnext;
3050             MemFree(tdbp);
3051         }
3052     }
3053 }
3054 
3055 /**********************************************************/
3056 class PredIsGivenQual
3057 {
3058 public:
PredIsGivenQual(const std::string & qual)3059     PredIsGivenQual(const std::string& qual) : qual_(qual) {}
3060 
operator ()(const CRef<objects::CGb_qual> & qual)3061     bool operator()(const CRef<objects::CGb_qual>& qual)
3062     {
3063         return qual->GetQual() == qual_;
3064     }
3065 
3066 private:
3067     std::string qual_;
3068 };
3069 
fta_check_multiple_locus_tag(DataBlkPtr dbp,unsigned char * drop)3070 static void fta_check_multiple_locus_tag(DataBlkPtr dbp, unsigned char* drop)
3071 {
3072     FeatBlkPtr fbp;
3073     Char       ch;
3074 
3075     for(; dbp != NULL; dbp = dbp->next)
3076     {
3077         fbp = (FeatBlkPtr) dbp->data;
3078         if(fbp == NULL)
3079             continue;
3080 
3081         size_t i = std::count_if(fbp->quals.begin(), fbp->quals.end(), PredIsGivenQual("locus_tag"));
3082         if(i < 2)
3083             continue;
3084 
3085         if(fbp->location != NULL && StringLen(fbp->location) > 50)
3086         {
3087             ch = fbp->location[50];
3088             fbp->location[50] = '\0';
3089         }
3090         else
3091             ch = '\0';
3092         ErrPostEx(SEV_REJECT, ERR_FEATURE_MultipleLocusTags,
3093                   "Multiple /locus_tag values for \"%s\" feature at \"%s\".",
3094                   (fbp->key == NULL) ? "Unknown" : fbp->key,
3095                   (fbp->location == NULL) ? "unknown location" : fbp->location);
3096         if(ch != '\0')
3097             fbp->location[50] = ch;
3098         *drop = 1;
3099         break;
3100     }
3101 }
3102 
3103 /**********************************************************/
fta_check_old_locus_tags(DataBlkPtr dbp,unsigned char * drop)3104 static void fta_check_old_locus_tags(DataBlkPtr dbp, unsigned char* drop)
3105 {
3106     Int4       i;
3107 
3108     PredIsGivenQual isOldLocusTag("old_locus_tag"),
3109                     isLocusTag("locus_tag");
3110 
3111     for(; dbp != NULL; dbp = dbp->next)
3112     {
3113         FeatBlkPtr fbp = (FeatBlkPtr)dbp->data;
3114         if(fbp == NULL)
3115             continue;
3116         size_t olt = std::count_if(fbp->quals.begin(), fbp->quals.end(), isOldLocusTag);
3117         size_t lt = std::count_if(fbp->quals.begin(), fbp->quals.end(), isLocusTag);
3118 
3119         if(olt == 0)
3120             continue;
3121 
3122         if(lt == 0)
3123         {
3124             ErrPostEx(SEV_REJECT, ERR_FEATURE_OldLocusTagWithoutNew,
3125                       "Feature \"%s\" at \"%s\" has an /old_locus_tag qualifier but lacks a /locus_tag qualifier. Entry dropped.",
3126                       (fbp->key == NULL) ? "Unknown" : fbp->key,
3127                       (fbp->location == NULL) ? "unknown location" : fbp->location);
3128             *drop = 1;
3129         }
3130         else
3131         {
3132             i = 0;
3133             ITERATE(TQualVector, gbqp1, fbp->quals)
3134             {
3135                 if (!(*gbqp1)->IsSetQual() || !(*gbqp1)->IsSetVal() || !isLocusTag(*gbqp1))
3136                     continue;
3137 
3138                 i++;
3139 
3140                 const std::string& gbqp1_val = (*gbqp1)->GetVal();
3141                 if (gbqp1_val.empty())
3142                     continue;
3143 
3144                 ITERATE(TQualVector, gbqp2, fbp->quals)
3145                 {
3146                     if (!(*gbqp2)->IsSetQual() || !(*gbqp2)->IsSetVal())
3147                         continue;
3148 
3149                     const std::string& gbqp2_val = (*gbqp2)->GetVal();
3150 
3151                     if (!isOldLocusTag(*gbqp2) || !NStr::EqualNocase(gbqp1_val, gbqp2_val))
3152                         continue;
3153 
3154                     ErrPostEx(SEV_REJECT, ERR_FEATURE_MatchingOldNewLocusTag,
3155                               "Feature \"%s\" at \"%s\" has an /old_locus_tag qualifier with a value that is identical to that of a /locus_tag qualifier: \"%s\". Entry dropped.",
3156                               (fbp->key == NULL) ? "Unknown" : fbp->key,
3157                               (fbp->location == NULL) ? "unknown location" : fbp->location,
3158                               gbqp1_val.c_str());
3159                     *drop = 1;
3160                 }
3161             }
3162         }
3163 
3164         if(olt == 1)
3165             continue;
3166 
3167         ITERATE(TQualVector, gbqp1, fbp->quals)
3168         {
3169             const std::string& gbqp1_val = (*gbqp1)->GetVal();
3170             if (isOldLocusTag(*gbqp1) || gbqp1_val.empty())
3171                 continue;
3172 
3173             TQualVector::const_iterator gbqp2 = gbqp1;
3174             for (++gbqp2; gbqp2 != fbp->quals.end(); ++gbqp2)
3175             {
3176                 const std::string& gbqp2_val = (*gbqp2)->GetVal();
3177                 if (isOldLocusTag(*gbqp2) || gbqp2_val.empty())
3178                     continue;
3179 
3180                 if (StringICmp(gbqp1_val.c_str(), gbqp2_val.c_str()) == 0)
3181                 {
3182                     ErrPostEx(SEV_ERROR, ERR_FEATURE_RedundantOldLocusTag,
3183                               "Feature \"%s\" at \"%s\" has redundant /old_locus_tag qualifiers. Dropping all but the first.",
3184                               (fbp->key == NULL) ? "Unknown" : fbp->key,
3185                               (fbp->location == NULL) ? "unknown location" : fbp->location);
3186                     break;
3187                 }
3188             }
3189 
3190             if (gbqp2 != fbp->quals.end())
3191                 break;
3192         }
3193     }
3194 }
3195 
3196 /**********************************************************/
fta_check_pseudogene_qual(DataBlkPtr dbp)3197 static void fta_check_pseudogene_qual(DataBlkPtr dbp)
3198 {
3199     FeatBlkPtr fbp;
3200     bool    got_pseudogene;
3201     bool    got_pseudo;
3202 
3203     for(; dbp != NULL; dbp = dbp->next)
3204     {
3205         fbp = (FeatBlkPtr) dbp->data;
3206         if(fbp == NULL)
3207             continue;
3208 
3209         got_pseudo = false;
3210         got_pseudogene = false;
3211 
3212         for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end(); )
3213         {
3214             const std::string& qual_str = (*cur)->GetQual();
3215             const std::string& val_str = (*cur)->IsSetVal() ? (*cur)->GetVal() : "";
3216 
3217             if (qual_str != "pseudogene")
3218             {
3219                 if(!got_pseudo && qual_str == "pseudo")
3220                     got_pseudo = true;
3221                 ++cur;
3222                 continue;
3223             }
3224 
3225             if(got_pseudogene)
3226             {
3227                 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_MultiplePseudoGeneQuals,
3228                           "Dropping a /pseudogene qualifier because multiple /pseudogene qualifiers are present : <%s> : Feature key <%s> : Feature location <%s>.",
3229                           val_str.empty() ? "[empty]" : val_str.c_str(),
3230                           fbp->key, fbp->location);
3231 
3232                 cur = fbp->quals.erase(cur);
3233                 continue;
3234             }
3235 
3236             got_pseudogene = true;
3237 
3238             if (val_str.empty())
3239             {
3240                 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidPseudoGeneValue,
3241                           "Dropping a /pseudogene qualifier because its value is empty : Feature key <%s> : Feature location <%s>.",
3242                           fbp->key, fbp->location);
3243 
3244                 cur = fbp->quals.erase(cur);
3245                 continue;
3246             }
3247 
3248             if(MatchArrayString(PseudoGeneValues, val_str.c_str()) >= 0)
3249             {
3250                 ++cur;
3251                 continue;
3252             }
3253 
3254             ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidPseudoGeneValue,
3255                       "Dropping a /pseudogene qualifier because its value is invalid : <%s> : Feature key <%s> : Feature location <%s>.",
3256                       val_str.c_str(), fbp->key, fbp->location);
3257 
3258             cur = fbp->quals.erase(cur);
3259         }
3260 
3261         if(!got_pseudogene || !got_pseudo)
3262             continue;
3263 
3264         ErrPostEx(SEV_ERROR, ERR_QUALIFIER_OldPseudoWithPseudoGene,
3265                   "A legacy /pseudo qualifier and a /pseudogene qualifier are present on the same feature : Dropping /pseudo : Feature key <%s> : Feature location <%s>.",
3266                   fbp->key, fbp->location);
3267         DeleteQual(fbp->quals, "pseudo");
3268     }
3269 }
3270 
3271 /**********************************************************/
fta_check_compare_qual(DataBlkPtr dbp,bool is_tpa)3272 static void fta_check_compare_qual(DataBlkPtr dbp, bool is_tpa)
3273 {
3274     FeatBlkPtr fbp;
3275     char*    p;
3276     char*    q;
3277     bool       badcom;
3278     Char       ch;
3279     Int4       com_count;
3280     Int4       cit_count;
3281 
3282     for(; dbp != NULL; dbp = dbp->next)
3283     {
3284         fbp = (FeatBlkPtr) dbp->data;
3285         if(fbp == NULL)
3286             continue;
3287 
3288         com_count = 0;
3289         cit_count = 0;
3290 
3291         for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end();)
3292         {
3293             const std::string& qual_str = (*cur)->GetQual();
3294             const std::string& val_str = (*cur)->IsSetVal() ? (*cur)->GetVal() : "";
3295 
3296             if (qual_str == "compare")
3297             {
3298                 badcom = true;
3299                 if (!val_str.empty())
3300                 {
3301                     q = StringChr(val_str.c_str(), '.');
3302                     if(q != NULL && q[1] != '\0')
3303                     {
3304                         for(p = q + 1; *p >= '0' && *p <= '9';)
3305                             p++;
3306                         if(*p == '\0')
3307                         {
3308                             *q = '\0';
3309                             if (GetNucAccOwner(val_str.c_str(), is_tpa) > 0)
3310                                 badcom = false;
3311                             *q = '.';
3312                         }
3313                     }
3314                 }
3315 
3316                 if(badcom)
3317                 {
3318                     ErrPostEx(SEV_ERROR, ERR_QUALIFIER_IllegalCompareQualifier,
3319                               "/compare qualifier value is not a legal Accession.Version : feature \"%s\" at \"%s\" : value \"%s\" : qualifier has been dropped.",
3320                               fbp->key, fbp->location,
3321                               val_str.empty() ? "[empty]" : val_str.c_str());
3322 
3323                     cur = fbp->quals.erase(cur);
3324                     continue;
3325                 }
3326                 com_count++;
3327             }
3328             else if (qual_str == "citation")
3329                 cit_count++;
3330 
3331             ++cur;
3332         }
3333 
3334         if(com_count > 0 || cit_count > 0 ||
3335            (StringCmp(fbp->key, "old_sequence") != 0 &&
3336             StringCmp(fbp->key, "conflict") != 0))
3337             continue;
3338 
3339         ch = '\0';
3340         if(StringLen(fbp->location) > 30)
3341         {
3342             ch = fbp->location[30];
3343             fbp->location[30] = '\0';
3344         }
3345         ErrPostEx(SEV_ERROR, ERR_FEATURE_RequiredQualifierMissing,
3346                   "Feature \"%s\" at \"%s\" lacks required /citation and/or /compare qualifier : feature has been dropped.",
3347                   fbp->key, fbp->location);
3348         if(ch != '\0')
3349             fbp->location[30] = ch;
3350         dbp->drop = 1;
3351     }
3352 }
3353 
3354 /**********************************************************/
fta_check_non_tpa_tsa_tls_locations(DataBlkPtr dbp,IndexblkPtr ibp)3355 static void fta_check_non_tpa_tsa_tls_locations(DataBlkPtr dbp,
3356                                                 IndexblkPtr ibp)
3357 {
3358     FeatBlkPtr fbp;
3359     char*    location;
3360     char*    p;
3361     char*    q;
3362     char*    r;
3363     Uint1      i;
3364 
3365     location = NULL;
3366     for(; dbp != NULL; dbp = dbp->next)
3367     {
3368         fbp = (FeatBlkPtr) dbp->data;
3369         if(fbp == NULL || fbp->location == NULL)
3370             continue;
3371         location = StringSave(fbp->location);
3372         for(p = location, q = p; *p != '\0'; p++)
3373             if(*p != ' ' && *p != '\t' && *p != '\n')
3374                 *q++ = *p;
3375         *q = '\0';
3376         if(q == location)
3377         {
3378             MemFree(location);
3379             location = NULL;
3380             continue;
3381         }
3382 
3383         for(p = location + 1; *p != '\0'; p++)
3384         {
3385             if(*p != ':')
3386                 continue;
3387             for(r = NULL, q = p - 1;; q--)
3388             {
3389                 if(q == location)
3390                 {
3391                     if(*q != '_' && (*q < '0' || *q > '9') &&
3392                        (*q < 'a' || *q > 'z') && (*q < 'A' || *q > 'Z'))
3393                         q++;
3394                     break;
3395                 }
3396                 if(*q == '.')
3397                 {
3398                     if(r == NULL)
3399                     {
3400                         r = q;
3401                         continue;
3402                     }
3403                     q++;
3404                     break;
3405                 }
3406                 if(*q != '_' && (*q < '0' || *q > '9') &&
3407                    (*q < 'a' || *q > 'z') && (*q < 'A' || *q > 'Z'))
3408                 {
3409                     q++;
3410                     break;
3411                 }
3412             }
3413             if(q == p)
3414                 continue;
3415             if(r != NULL)
3416                 *r = '\0';
3417             else
3418                 *p = '\0';
3419             i = GetNucAccOwner(q, ibp->is_tpa);
3420             if(r != NULL)
3421                 *r = '.';
3422             else
3423                 *p = ':';
3424 
3425 
3426             if (i == objects::CSeq_id::e_Genbank && (q[0] == 'e' || q[0] == 'E') &&
3427                (q[1] == 'z' || q[1] == 'Z') && ibp->is_tpa == false)
3428                 continue;
3429             if (ibp->is_tpa && (i == objects::CSeq_id::e_Tpg || i == objects::CSeq_id::e_Tpd ||
3430                 i == objects::CSeq_id::e_Tpe))
3431                 continue;
3432             break;
3433         }
3434         if(*p != '\0')
3435             break;
3436         if(location != NULL)
3437         {
3438             MemFree(location);
3439             location = NULL;
3440         }
3441     }
3442     if(dbp == NULL)
3443         return;
3444 
3445     ibp->drop = 1;
3446     if(location != NULL && StringLen(location) > 45)
3447     {
3448         location[40] = '\0';
3449         StringCat(location, "...");
3450     }
3451     if(ibp->is_tsa)
3452         ErrPostEx(SEV_REJECT, ERR_LOCATION_AccessionNotTSA,
3453                   "Feature \"%s\" at \"%s\" on a TSA record cannot point to a non-TSA record.",
3454                   fbp->key, (location == NULL) ? "empty_location" : location);
3455     else if(ibp->is_tls)
3456         ErrPostEx(SEV_REJECT, ERR_LOCATION_AccessionNotTLS,
3457                   "Feature \"%s\" at \"%s\" on a TLS record cannot point to a non-TLS record.",
3458                   fbp->key, (location == NULL) ? "empty_location" : location);
3459     else
3460         ErrPostEx(SEV_REJECT, ERR_LOCATION_AccessionNotTPA,
3461                   "Feature \"%s\" at \"%s\" on a TPA record cannot point to a non-TPA record.",
3462                   fbp->key, (location == NULL) ? "empty_location" : location);
3463     if(location != NULL)
3464         MemFree(location);
3465 }
3466 
3467 /**********************************************************/
fta_perform_operon_checks(ParserPtr pp,TSeqFeatList & feats,IndexblkPtr ibp)3468 static bool fta_perform_operon_checks(ParserPtr pp, TSeqFeatList& feats, IndexblkPtr ibp)
3469 {
3470     FTAOperonPtr fophead;
3471     FTAOperonPtr fop;
3472     FTAOperonPtr tfop;
3473 
3474     char*      p;
3475     bool         got;
3476     Int4         count;
3477 
3478     if(pp == NULL)
3479         return true;
3480 
3481     if (feats.empty())
3482     {
3483         if(ibp->segnum == ibp->segtotal && pp->operon != NULL)
3484             return(pp->operon->ret);
3485         return true;
3486     }
3487 
3488     if(ibp->segnum < 2 && pp->operon != NULL)
3489     {
3490         fta_operon_free(pp->operon);
3491         pp->operon = NULL;
3492     }
3493 
3494     if(pp->operon == NULL)
3495     {
3496         fop = new FTAOperon;
3497         fop->ret = true;
3498         pp->operon = fop;
3499     }
3500     else
3501         for(fop = pp->operon; fop->next != NULL;)
3502             fop = fop->next;
3503 
3504     fophead = NULL;
3505     ITERATE(TSeqFeatList, feat, feats)
3506     {
3507         if (!(*feat)->GetData().IsImp())
3508             continue;
3509 
3510         const objects::CImp_feat& imp_feat = (*feat)->GetData().GetImp();
3511 
3512         count = 0;
3513         ITERATE(objects::CSeq_feat::TQual, qual, (*feat)->GetQual())
3514         {
3515             if (!(*qual)->IsSetQual() || (*qual)->GetQual() != "operon" ||
3516                 !(*qual)->IsSetVal() || (*qual)->GetVal().empty())
3517                 continue;
3518 
3519             tfop = new FTAOperon;
3520             tfop->location = &(*feat)->GetLocation();
3521             tfop->operon = (*qual)->GetVal().c_str();
3522             tfop->featname = imp_feat.IsSetKey() ? imp_feat.GetKey().c_str() : "Unknown";
3523 
3524             tfop->operon_feat = false;
3525             tfop->ret = true;
3526             tfop->strloc = NULL;
3527             tfop->next = NULL;
3528             if(StringCmp(tfop->featname, "operon") == 0)
3529                 tfop->operon_feat = true;
3530 
3531             if(fophead == NULL)
3532                 fophead = tfop;
3533 
3534             fop->next = tfop;
3535             fop = fop->next;
3536 
3537             count++;
3538 
3539             if(fop->operon_feat == false || fop == fophead)
3540                 continue;
3541 
3542             for(tfop = fophead; tfop->next != NULL; tfop = tfop->next)
3543             {
3544                 if(tfop->operon_feat == false ||
3545                    StringCmp(tfop->operon, fop->operon) != 0)
3546                     continue;
3547 
3548                 if(tfop->strloc == NULL)
3549                     tfop->strloc = location_to_string_or_unknown(*tfop->location);
3550 
3551                 if(fop->strloc == NULL)
3552                     fop->strloc = location_to_string_or_unknown(*fop->location);
3553 
3554                 ErrPostEx(SEV_REJECT, ERR_FEATURE_OperonQualsNotUnique,
3555                           "The operon features at \"%s\" and \"%s\" utilize the same /operon qualifier : \"%s\".",
3556                           tfop->strloc, fop->strloc, fop->operon);
3557                 pp->operon->ret = false;
3558             }
3559         }
3560 
3561         if(count > 1)
3562         {
3563             if(fop->strloc == NULL)
3564                 fop->strloc = location_to_string_or_unknown(*fop->location);
3565 
3566             ErrPostEx(SEV_REJECT, ERR_FEATURE_MultipleOperonQuals,
3567                       "Feature \"%s\" at \"%s\" has more than one operon qualifier.",
3568                       fop->featname, fop->strloc);
3569             pp->operon->ret = false;
3570         }
3571 
3572         if (count == 0 && imp_feat.IsSetKey() && imp_feat.GetKey() == "operon")
3573         {
3574             p = location_to_string_or_unknown((*feat)->GetLocation());
3575 
3576             ErrPostEx(SEV_REJECT, ERR_FEATURE_MissingOperonQual,
3577                       "The operon feature at \"%s\" lacks an /operon qualifier.",
3578                       p);
3579 
3580             MemFree(p);
3581             pp->operon->ret = false;
3582         }
3583     }
3584 
3585     if(ibp->segnum != 0 && ibp->segnum != ibp->segtotal)
3586         return true;
3587 
3588     if(pp->operon->next == NULL || pp->operon->next->next == NULL)
3589         return(pp->operon->ret);
3590 
3591     for(fop = pp->operon->next; fop != NULL; fop = fop->next)
3592     {
3593         if(fop->operon_feat)
3594             continue;
3595 
3596         got = false;
3597         for(tfop = pp->operon->next; tfop != NULL; tfop = tfop->next)
3598         {
3599             if(tfop->operon_feat == false ||
3600                StringCmp(fop->operon, tfop->operon) != 0)
3601                 continue;
3602 
3603             got = true;
3604             objects::sequence::ECompare cmp_res = objects::sequence::Compare(*fop->location, *tfop->location, nullptr, objects::sequence::fCompareOverlapping);
3605             if (cmp_res == objects::sequence::eContained || cmp_res == objects::sequence::eSame)
3606                 continue;
3607 
3608             if(fop->strloc == NULL)
3609                 fop->strloc = location_to_string_or_unknown(*fop->location);
3610 
3611             if(tfop->strloc == NULL)
3612                 tfop->strloc = location_to_string_or_unknown(*tfop->location);
3613 
3614             ErrPostEx(SEV_REJECT, ERR_FEATURE_OperonLocationMisMatch,
3615                       "Feature \"%s\" at \"%s\" with /operon qualifier \"%s\" does not fall within the span of the operon feature at \"%s\".",
3616                       fop->featname, fop->strloc, fop->operon, tfop->strloc);
3617             pp->operon->ret = false;
3618         }
3619 
3620         if(!got)
3621         {
3622             if(fop->strloc == NULL)
3623                 fop->strloc = location_to_string_or_unknown(*fop->location);
3624 
3625             ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidOperonQual,
3626                       "/operon qualifier \"%s\" on feature \"%s\" at \"%s\" has a value that does not match any of the /operon qualifiers on operon features.",
3627                       fop->operon, fop->featname, fop->strloc);
3628             pp->operon->ret = false;
3629         }
3630     }
3631 
3632     got = pp->operon->ret;
3633     fta_operon_free(pp->operon);
3634     pp->operon = NULL;
3635     return(got);
3636 }
3637 
3638 /**********************************************************/
fta_remove_dup_quals(FeatBlkPtr fbp)3639 static void fta_remove_dup_quals(FeatBlkPtr fbp)
3640 {
3641     Char      ch;
3642 
3643     if(fbp == NULL || fbp->quals.empty())
3644         return;
3645 
3646     NON_CONST_ITERATE(TQualVector, cur, fbp->quals)
3647     {
3648         const char* cur_qual = (*cur)->IsSetQual() ? (*cur)->GetQual().c_str() : NULL;
3649         const char* cur_val = (*cur)->IsSetVal() ? (*cur)->GetVal().c_str() : NULL;
3650 
3651         TQualVector::iterator next = cur;
3652         for (++next; next != fbp->quals.end();)
3653         {
3654             const char* next_qual = (*next)->IsSetQual() ? (*next)->GetQual().c_str() : NULL;
3655             const char* next_val = (*next)->IsSetVal() ? (*next)->GetVal().c_str() : NULL;
3656 
3657             if (!fta_strings_same(cur_qual, next_qual) || !fta_strings_same(cur_val, next_val))
3658             {
3659                 ++next;
3660                 continue;
3661             }
3662 
3663             if(fbp->location != NULL && StringLen(fbp->location) > 20)
3664             {
3665                 ch = fbp->location[20];
3666                 fbp->location[20] = '\0';
3667             }
3668             else
3669                 ch = '\0';
3670 
3671             ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DuplicateRemoved,
3672                       "Duplicated qualifier \"%s\" in feature \"%s\" at location \"%s%s\" removed.",
3673                       (cur_qual == NULL) ? "???" : cur_qual,
3674                       (fbp->key == NULL) ? "???" : fbp->key,
3675                       (fbp->location == NULL) ? "???" : fbp->location,
3676                       (ch == '\0') ? "" : "...");
3677 
3678             if(ch != '\0')
3679                 fbp->location[20] = ch;
3680 
3681             next = fbp->quals.erase(next);
3682         }
3683     }
3684 }
3685 
3686 /**********************************************************/
CollectGapFeats(DataBlkPtr entry,DataBlkPtr dbp,ParserPtr pp,Int2 type)3687 static void CollectGapFeats(DataBlkPtr entry, DataBlkPtr dbp,
3688                             ParserPtr pp, Int2 type)
3689 {
3690     IndexblkPtr        ibp;
3691     GapFeatsPtr        gfp = NULL;
3692     GapFeatsPtr        tgfp;
3693     DataBlkPtr         tdbp;
3694     FeatBlkPtr         fbp;
3695 
3696     objects::CLinkage_evidence::TLinkage_evidence asn_linkage_evidence;
3697     std::list<std::string> linkage_evidence_names;
3698 
3699     StrNumPtr          snp;
3700     char*            p;
3701     char*            q;
3702     const char*    gap_type;
3703     bool               finished_gap;
3704     ErrSev             sev;
3705     Int4               estimated_length;
3706     Int4               is_htg;
3707     Int4               from;
3708     Int4               to;
3709     Int4               prev_gap;        /* 0 - initial, 1 - "gap",
3710                                            2 - "assembly_gap" */
3711     Int4               curr_gap;        /* 0 - initial, 1 - "gap",
3712                                            2 - "assembly_gap" */
3713     Int4               asn_gap_type;
3714 
3715     ibp = pp->entrylist[pp->curindx];
3716 
3717     if(ibp->keywords.empty())
3718     {
3719         if(pp->format == Parser::EFormat::GenBank)
3720             GetSequenceOfKeywords(entry, ParFlat_KEYWORDS,
3721                                   ParFlat_COL_DATA, ibp->keywords);
3722         else if(pp->format == Parser::EFormat::EMBL)
3723             GetSequenceOfKeywords(entry, ParFlat_KW, ParFlat_COL_DATA_EMBL,
3724                                   ibp->keywords);
3725         else if(pp->format == Parser::EFormat::XML)
3726             XMLGetKeywords(entry->offset, ibp->xip, ibp->keywords);
3727     }
3728 
3729     is_htg = -1;
3730     ITERATE(TKeywordList, key, ibp->keywords)
3731     {
3732         if(is_htg >= 0 && is_htg <= 2)
3733             break;
3734         if(*key == "HTG")
3735             is_htg = 3;
3736         else if(*key == "HTGS_PHASE0")
3737             is_htg = 0;
3738         else if(*key == "HTGS_PHASE1")
3739             is_htg = 1;
3740         else if(*key == "HTGS_PHASE2")
3741             is_htg = 2;
3742         else if(*key == "HTGS_PHASE3")
3743             is_htg = 3;
3744     }
3745 
3746     prev_gap = 0;
3747     curr_gap = 0;
3748     finished_gap = false;
3749     for(ibp->gaps = NULL; dbp != NULL; dbp = dbp->next)
3750     {
3751         if(ibp->drop != 0)
3752             break;
3753         if(dbp->type != type)
3754             continue;
3755 
3756         linkage_evidence_names.clear();
3757         asn_linkage_evidence.clear();
3758 
3759         for(tdbp = (DataBlkPtr) dbp->data; tdbp != NULL; tdbp = tdbp->next)
3760         {
3761             if(ibp->drop != 0)
3762                 break;
3763             fbp = (FeatBlkPtr) tdbp->data;
3764             if(fbp == NULL || fbp->key == NULL)
3765                 continue;
3766             if(StringCmp(fbp->key, "gap") == 0)
3767             {
3768                 prev_gap = curr_gap;
3769                 curr_gap = 1;
3770             }
3771             else if(StringCmp(fbp->key, "assembly_gap") == 0)
3772             {
3773                 prev_gap = curr_gap;
3774                 curr_gap = 2;
3775             }
3776             else
3777                 continue;
3778 
3779             from = 0;
3780             to = 0;
3781             estimated_length = 0;
3782             gap_type = NULL;
3783             linkage_evidence_names.clear();
3784             asn_gap_type = -1;
3785             asn_linkage_evidence.clear();
3786             estimated_length = -1;
3787 
3788             ITERATE(TQualVector, cur, fbp->quals)
3789             {
3790                 if (!(*cur)->IsSetQual() || !(*cur)->IsSetVal())
3791                     continue;
3792 
3793                 const std::string& cur_qual = (*cur)->GetQual();
3794                 const std::string& cur_val = (*cur)->GetVal();
3795 
3796                 if (cur_qual.empty() || cur_val.empty())
3797                     continue;
3798 
3799                 if (cur_qual == "estimated_length")
3800                 {
3801                     if (cur_val == "unknown")
3802                         estimated_length = -100;
3803                     else
3804                     {
3805                         const char* cp = cur_val.c_str();
3806                         for (; *cp >= '0' && *cp <= '9';)
3807                             ++cp;
3808                         if(*cp == '\0')
3809                             estimated_length = atoi(cur_val.c_str());
3810                     }
3811                 }
3812                 else if (cur_qual == "gap_type")
3813                     gap_type = cur_val.c_str();
3814                 else if (cur_qual == "linkage_evidence")
3815                 {
3816                     linkage_evidence_names.push_back(cur_val);
3817                 }
3818             }
3819 
3820             if(fbp->location != NULL)
3821             {
3822                 p = fbp->location;
3823                 if(*p == '<')
3824                     p++;
3825                 for(q = p; *p >= '0' && *p <= '9';)
3826                     p++;
3827                 if(*p == '\0')
3828                 {
3829                     from = atoi(q);
3830                     to = from;
3831                 }
3832                 else if(*p == '.')
3833                 {
3834                     *p = '\0';
3835                     from = atoi(q);
3836                     *p++ = '.';
3837                     if(*fbp->location == '<' && from != 1)
3838                         from = 0;
3839                     else if(*p == '.')
3840                     {
3841                         if(*++p == '>')
3842                            p++;
3843                         for(q = p; *p >= '0' && *p <= '9';)
3844                             p++;
3845                         if(*p == '\0')
3846                             to = atoi(q);
3847                         if(*(q - 1) == '>' && to != (int) ibp->bases)
3848                             to = 0;
3849                     }
3850                 }
3851             }
3852 
3853             if(from == 0 || to == 0 || from > to)
3854             {
3855                 if(curr_gap == 1)
3856                     ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidGapLocation,
3857                               "Invalid gap feature location : \"%s\" : all gap features must have a simple X..Y location on the plus strand.",
3858                               (fbp->location == NULL) ? "unknown" : fbp->location);
3859                 else
3860                     ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidAssemblyGapLocation,
3861                               "Invalid assembly_gap location : \"%s\".",
3862                               (fbp->location == NULL) ? "unknown" : fbp->location);
3863                 ibp->drop = 1;
3864                 break;
3865             }
3866 
3867             if(curr_gap == 2)           /* "assembly_gap" feature */
3868             {
3869                 if(gap_type != NULL && is_htg > -1 &&
3870                    StringCmp(gap_type, "within scaffold") != 0 &&
3871                    StringCmp(gap_type, "repeat within scaffold") != 0)
3872                     ErrPostEx(SEV_ERROR, ERR_QUALIFIER_UnexpectedGapTypeForHTG,
3873                               "assembly_gap has /gap_type of \"%s\", but clone-based HTG records are only expected to have \"within scaffold\" or \"repeat within scaffold\" gaps. assembly_gap feature located at \"%d..%d\".",
3874                               gap_type, from, to);
3875 
3876                 if(is_htg == 0 || is_htg == 1)
3877                 {
3878                     ITERATE(std::list<std::string>, evidence, linkage_evidence_names)
3879                     {
3880                         if (*evidence != LinkageEvidenceValues[objects::CLinkage_evidence_Base::eType_unspecified].str)
3881                         {
3882                             ErrPostEx(SEV_ERROR, ERR_QUALIFIER_LinkageShouldBeUnspecified,
3883                                       "assembly gap has /linkage_evidence of \"%s\", but unoriented and unordered Phase0/Phase1 HTG records are expected to have \"unspecified\" evidence. assembly_gap feature located at \"%d..%d\".",
3884                                       evidence->c_str(), from, to);
3885                         }
3886                     }
3887                 }
3888                 else if(is_htg == 2 || is_htg == 3)
3889                 {
3890                     ITERATE(std::list<std::string>, evidence, linkage_evidence_names)
3891                     {
3892                         if (*evidence != LinkageEvidenceValues[objects::CLinkage_evidence_Base::eType_unspecified].str)
3893                             continue;
3894 
3895                         ErrPostEx(SEV_ERROR, ERR_QUALIFIER_LinkageShouldNotBeUnspecified,
3896                                   "assembly gap has /linkage_evidence of \"unspecified\", but ordered and oriented HTG records are expected to have some level of linkage for their gaps. assembly_gap feature located at \"%d..%d\".",
3897                                   from, to);
3898                     }
3899                 }
3900 
3901                 if(is_htg == 3 && !finished_gap)
3902                 {
3903                     ErrPostEx(SEV_ERROR, ERR_FEATURE_FinishedHTGHasAssemblyGap,
3904                               "Finished Phase-3 HTG records are not expected to have any gaps. First assembly_gap feature encountered at \"%d..%d\".",
3905                               from, to);
3906                     finished_gap = true;
3907                 }
3908 
3909                 if(gap_type == NULL)
3910                 {
3911                     ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MissingGapType,
3912                               "assembly_gap feature at \"%d..%d\" lacks the required /gap_type qualifier.",
3913                               from, to);
3914                     ibp->drop = 1;
3915                     break;
3916                 }
3917 
3918                 for(snp = GapTypeValues; snp->str != NULL; snp++)
3919                     if(StringCmp(snp->str, gap_type) == 0)
3920                         break;
3921                 if(snp->str == NULL)
3922                 {
3923                     ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidGapType,
3924                               "assembly_gap feature at \"%d..%d\" has an invalid gap type : \"%s\".",
3925                               from, to, gap_type);
3926                     ibp->drop = 1;
3927                     break;
3928                 }
3929                 asn_gap_type = snp->num;
3930 
3931                 if(linkage_evidence_names.empty() &&
3932                    (StringCmp(gap_type, "within scaffold") == 0 ||
3933                    StringCmp(gap_type, "repeat within scaffold") == 0))
3934                 {
3935                     ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MissingLinkageEvidence,
3936                               "assembly_gap feature at \"%d..%d\" with gap type \"%s\" lacks a /linkage_evidence qualifier.",
3937                               from, to, gap_type);
3938                     ibp->drop = 1;
3939                     break;
3940                 }
3941                 if (!linkage_evidence_names.empty())
3942                 {
3943                     if (StringCmp(gap_type, "unknown") != 0 &&
3944                         StringCmp(gap_type, "within scaffold") != 0 &&
3945                         StringCmp(gap_type, "repeat within scaffold") != 0)
3946                     {
3947                         ErrPostEx(SEV_REJECT,
3948                                   ERR_QUALIFIER_InvalidGapTypeForLinkageEvidence,
3949                                   "The /linkage_evidence qualifier is not legal for the assembly_gap feature at \"%d..%d\" with /gap_type \"%s\".",
3950                                   from, to, gap_type);
3951                         ibp->drop = 1;
3952                         break;
3953                     }
3954 
3955                     ITERATE(std::list<std::string>, evidence, linkage_evidence_names)
3956                     {
3957                         for(snp = LinkageEvidenceValues; snp->str != NULL; snp++)
3958                             if (*evidence == snp->str)
3959                                 break;
3960                         if(snp->str == NULL)
3961                         {
3962                             ErrPostEx(SEV_REJECT,
3963                                       ERR_QUALIFIER_InvalidLinkageEvidence,
3964                                       "assembly_gap feature at \"%d..%d\" has an invalid linkage evidence : \"%s\".",
3965                                       from, to, evidence->c_str());
3966                             ibp->drop = 1;
3967                             break;
3968                         }
3969 
3970                         CRef<objects::CLinkage_evidence> new_evidence(new objects::CLinkage_evidence);
3971                         new_evidence->SetType(snp->num);
3972                         asn_linkage_evidence.push_back(new_evidence);
3973                     }
3974                 }
3975             }
3976 
3977             if(prev_gap + curr_gap == 3)
3978             {
3979                 if(curr_gap == 1)
3980                     ErrPostEx(SEV_REJECT, ERR_FEATURE_AssemblyGapAndLegacyGap,
3981                               "Legacy gap feature at \"%d..%d\" co-exists with a new AGP 2.0 assembly_gap feature at \"%d..%d\".",
3982                               from, to, gfp->from, gfp->to);
3983                 else
3984                     ErrPostEx(SEV_REJECT, ERR_FEATURE_AssemblyGapAndLegacyGap,
3985                               "Legacy gap feature at \"%d..%d\" co-exists with a new AGP 2.0 assembly_gap feature at \"%d..%d\".",
3986                               gfp->from, gfp->to, from, to);
3987                 ibp->drop = 1;
3988                 break;
3989             }
3990 
3991             if(estimated_length == -1)  /* missing qual */
3992             {
3993                 ErrPostEx(SEV_REJECT, ERR_FEATURE_RequiredQualifierMissing,
3994                           "The gap feature at \"%d..%d\" lacks the required /estimated_length qualifier.",
3995                           from, to);
3996                 ibp->drop = 1;
3997             }
3998             else if(estimated_length == 0)
3999             {
4000                 ErrPostEx(SEV_REJECT, ERR_FEATURE_IllegalEstimatedLength,
4001                           "Gap feature at \"%d..%d\" has an illegal /estimated_length qualifier : \"%s\" : should be \"unknown\" or an integer.",
4002 //                          from, to, gbqp->val); // at this point gbqp is definitely = NULL
4003                           from, to, "");
4004                 ibp->drop = 1;
4005             }
4006             else if(estimated_length == -100)
4007             {
4008                 if(is_htg >= 0 && to - from != 99)
4009                 {
4010                     ErrPostEx(SEV_ERROR, ERR_FEATURE_UnknownGapNot100,
4011                               "Gap feature at \"%d..%d\" has /estimated_length \"unknown\" but the gap size is not 100 bases.",
4012                               from, to);
4013                 }
4014             }
4015             else if(estimated_length != to - from + 1)
4016             {
4017                 if(pp->source == Parser::ESource::EMBL || pp->source == Parser::ESource::DDBJ)
4018                     sev = SEV_ERROR;
4019                 else
4020                 {
4021                     sev = SEV_REJECT;
4022                     ibp->drop = 1;
4023                 }
4024 
4025                 ErrPostEx(sev, ERR_FEATURE_GapSizeEstLengthMissMatch,
4026                           "Gap feature at \"%d..%d\" has a size that does not match the /estimated_length : %d.",
4027                           from, to, estimated_length);
4028             }
4029 
4030             for(gfp = ibp->gaps; gfp != NULL; gfp = gfp->next)
4031             {
4032                 if((gfp->from >= from && gfp->from <= to) ||
4033                    (gfp->to >= from && gfp->to <= to) ||
4034                    (gfp->from <= from && gfp->to >= to))
4035                 {
4036                     ErrPostEx(SEV_REJECT, ERR_FEATURE_OverlappingGaps,
4037                               "Gap features at \"%d..%d\" and \"%d..%d\" overlap.",
4038                               from, to, gfp->from, gfp->to);
4039                     ibp->drop = 1;
4040                 }
4041                 else if(to + 1 == gfp->from || from - 1 == gfp->to)
4042                 {
4043                     if(pp->source == Parser::ESource::EMBL)
4044                         sev = SEV_ERROR;
4045                     else
4046                     {
4047                         sev = SEV_REJECT;
4048                         ibp->drop = 1;
4049                     }
4050 
4051                     ErrPostEx(sev, ERR_FEATURE_ContiguousGaps,
4052                               "Gap features at \"%d..%d\" and \"%d..%d\" are contiguous, and should probably be represented by a single gap that spans both.",
4053                               from, to, gfp->from, gfp->to);
4054                 }
4055             }
4056             if(ibp->drop != 0)
4057                 break;
4058 
4059             gfp = new GapFeats;
4060             gfp->from = from;
4061             gfp->to = to;
4062             gfp->estimated_length = estimated_length;
4063             if(curr_gap == 2)           /* /assembly_gap feature */
4064                 gfp->assembly_gap = true;
4065             if(gap_type != NULL)
4066             {
4067                gfp->gap_type = StringSave(gap_type);
4068                gfp->asn_gap_type = asn_gap_type;
4069             }
4070             if(!asn_linkage_evidence.empty())
4071             {
4072                gfp->asn_linkage_evidence.swap(asn_linkage_evidence);
4073                asn_linkage_evidence.clear();
4074             }
4075             gfp->next = NULL;
4076 
4077             if(ibp->gaps == NULL)
4078             {
4079                 ibp->gaps = gfp;
4080                 continue;
4081             }
4082 
4083             if(ibp->gaps->from > from)
4084             {
4085                 gfp->next = ibp->gaps;
4086                 ibp->gaps = gfp;
4087                 continue;
4088             }
4089 
4090             if(ibp->gaps->next == NULL)
4091             {
4092                 ibp->gaps->next = gfp;
4093                 continue;
4094             }
4095 
4096             for(tgfp = ibp->gaps; tgfp != NULL; tgfp = tgfp->next)
4097             {
4098                 if(tgfp->next != NULL && tgfp->next->from < from)
4099                     continue;
4100                 gfp->next = tgfp->next;
4101                 tgfp->next = gfp;
4102                 break;
4103             }
4104         }
4105         if(ibp->drop != 0)
4106         {
4107             linkage_evidence_names.clear();
4108             asn_linkage_evidence.clear();
4109         }
4110     }
4111 
4112     if(ibp->gaps == NULL)
4113         return;
4114 
4115     if(ibp->drop != 0)
4116     {
4117         GapFeatsFree(ibp->gaps);
4118         ibp->gaps = NULL;
4119     }
4120 }
4121 
4122 /**********************************************************/
XMLGetQuals(char * entry,XmlIndexPtr xip,TQualVector & quals)4123 static void XMLGetQuals(char* entry, XmlIndexPtr xip, TQualVector& quals)
4124 {
4125     XmlIndexPtr xipqual;
4126 
4127     if(entry == NULL || xip == NULL)
4128         return;
4129 
4130     for(; xip != NULL; xip = xip->next)
4131     {
4132         if(xip->subtags == NULL)
4133             continue;
4134 
4135         CRef<objects::CGb_qual> qual(new objects::CGb_qual);
4136         for(xipqual = xip->subtags; xipqual != NULL; xipqual = xipqual->next)
4137         {
4138             if(xipqual->tag == INSDQUALIFIER_NAME)
4139                 qual->SetQual(XMLGetTagValue(entry, xipqual));
4140             else if(xipqual->tag == INSDQUALIFIER_VALUE)
4141                 qual->SetVal(XMLGetTagValue(entry, xipqual));
4142         }
4143 
4144         if (qual->GetQual() == "replace" && !qual->IsSetVal())
4145         {
4146             qual->SetVal("");
4147         }
4148 
4149         if (qual->IsSetQual() && !qual->GetQual().empty())
4150             quals.push_back(qual);
4151     }
4152 }
4153 
4154 /**********************************************************/
XMLLoadFeatBlk(char * entry,XmlIndexPtr xip)4155 static DataBlkPtr XMLLoadFeatBlk(char* entry, XmlIndexPtr xip)
4156 {
4157     XmlIndexPtr xipfeat;
4158     DataBlkPtr  headdbp;
4159     DataBlkPtr  dbp;
4160     DataBlkPtr  ret;
4161     FeatBlkPtr  fbp;
4162 
4163     if(entry == NULL || xip == NULL)
4164         return(NULL);
4165 
4166     for(; xip != NULL; xip = xip->next)
4167         if(xip->tag == INSDSEQ_FEATURE_TABLE)
4168             break;
4169 
4170     if(xip == NULL || xip->subtags == NULL)
4171         return(NULL);
4172 
4173     headdbp = NULL;
4174     for(xip = xip->subtags; xip != NULL; xip = xip->next)
4175     {
4176         if(xip->subtags == NULL)
4177             continue;
4178         fbp = new FeatBlk;
4179         for(xipfeat = xip->subtags; xipfeat != NULL; xipfeat = xipfeat->next)
4180         {
4181             if(xipfeat->tag == INSDFEATURE_KEY)
4182                 fbp->key = XMLGetTagValue(entry, xipfeat);
4183             else if(xipfeat->tag == INSDFEATURE_LOCATION)
4184                 fbp->location = XMLGetTagValue(entry, xipfeat);
4185             else if(xipfeat->tag == INSDFEATURE_QUALS)
4186                 XMLGetQuals(entry, xipfeat->subtags, fbp->quals);
4187         }
4188         if(headdbp == NULL)
4189         {
4190             headdbp = (DataBlkPtr) MemNew(sizeof(DataBlk));
4191             dbp = headdbp;
4192         }
4193         else
4194         {
4195             dbp->next = (DataBlkPtr) MemNew(sizeof(DataBlk));
4196             dbp = dbp->next;
4197         }
4198         dbp->data = fbp;
4199     }
4200     ret = (DataBlkPtr) MemNew(sizeof(DataBlk));
4201     ret->type = XML_FEATURES;
4202     ret->data = headdbp;
4203     ret->next = NULL;
4204     return(ret);
4205 }
4206 
4207 /**********************************************************
4208  *
4209  *   static FeatBlkPtr MergeNoteQual(fbp):
4210  *
4211  *      Only one note on every key feature block,
4212  *   not complete.
4213  *
4214  *                                              5-28-93
4215  *
4216  **********************************************************/
MergeNoteQual(FeatBlkPtr fbp)4217 static FeatBlkPtr MergeNoteQual(FeatBlkPtr fbp)
4218 {
4219     char*   note;
4220     char*   p;
4221     char*   q;
4222 
4223     size_t size = 0;
4224 
4225     NON_CONST_ITERATE(TQualVector, cur, fbp->quals)
4226     {
4227         if (!(*cur)->IsSetQual() || !(*cur)->IsSetVal())
4228             continue;
4229 
4230         const std::string& cur_qual = (*cur)->GetQual();
4231         const std::string& cur_val = (*cur)->GetVal();
4232 
4233         if (cur_qual != "note" || cur_val.empty())
4234             continue;
4235 
4236         size += 2;
4237         std::vector<Char> buf(cur_val.size() + 1);
4238 
4239         const char* cp = cur_val.c_str();
4240         for(q = &buf[0]; *cp != '\0'; ++cp)
4241         {
4242             *q++ = *cp;
4243             if (*cp == ';' && (cp[1] == ' ' || cp[1] == ';'))
4244             {
4245                 for(++cp; *cp == ' ' || *cp == ';';)
4246                     ++cp;
4247                 if(*cp != '\0')
4248                     *q++ = ' ';
4249                 --cp;
4250             }
4251         }
4252 
4253         *q = '\0';
4254         (*cur)->SetVal(&buf[0]);
4255 
4256         size += (*cur)->GetVal().size();
4257         for (cp = (*cur)->GetVal().c_str(); *cp != '\0'; ++cp)
4258             if(*cp == '~')
4259                 ++size;
4260     }
4261 
4262     if(size == 0)
4263         return(fbp);
4264 
4265     note = (char*) MemNew(size);
4266     p = note;
4267 
4268     for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end();)
4269     {
4270         if (!(*cur)->IsSetQual() || !(*cur)->IsSetVal())
4271         {
4272             ++cur;
4273             continue;
4274         }
4275 
4276         const std::string& cur_qual = (*cur)->GetQual();
4277         const std::string& cur_val = (*cur)->GetVal();
4278 
4279         if (cur_qual != "note")
4280         {
4281             ++cur;
4282             continue;
4283         }
4284 
4285         if (!cur_val.empty())
4286         {
4287             /* sometime we get note qual w/o value
4288              */
4289             if(p > note)
4290             {
4291                 *p++ = ';';
4292                 *p++ = '~';
4293             }
4294 
4295             for (const char* cq = cur_val.c_str(); *cq != '\0'; *p++ = *cq++)
4296                 if(*cq == '~')
4297                     *p++ = '~';
4298         }
4299 
4300         cur = fbp->quals.erase(cur);
4301     }
4302     *p = '\0';
4303 
4304     CRef<objects::CGb_qual> qual_new(new objects::CGb_qual);
4305     qual_new->SetQual("note");
4306     qual_new->SetVal(note);
4307 
4308     fbp->quals.push_back(qual_new);
4309 
4310     return(fbp);
4311 }
4312 
4313 /**********************************************************/
CheckLegalQual(const Char * val,Char ch,std::string * qual)4314 static bool CheckLegalQual(const Char* val, Char ch, std::string* qual)
4315 {
4316     std::string qual_name;
4317     for (; *val && *val != ch && (isalpha(*val) || *val == '_'); ++val)
4318         qual_name += *val;
4319 
4320     objects::CSeqFeatData::EQualifier type = objects::CSeqFeatData::GetQualifierType(qual_name);
4321     if (type == objects::CSeqFeatData::eQual_bad)
4322         return false;
4323 
4324     if (qual != nullptr)
4325         *qual = qual_name;
4326 
4327     return true;
4328 }
4329 
4330 /**********************************************************/
fta_set_merge_marks(char * val,size_t quallen,size_t vallen)4331 static void fta_set_merge_marks(char* val, size_t quallen, size_t vallen)
4332 {
4333     char* start;
4334     char* p;
4335     char* q;
4336     bool    first;
4337 
4338     if(val == NULL || *val == '\0')
4339         return;
4340 
4341     p = StringChr(val, '\n');
4342     if(p == NULL)
4343         return;
4344 
4345     for(first = true, start = val; p != NULL;)
4346     {
4347         if((p - 1) >= start && *(p - 1) == '-' &&
4348            (p - 2) >= start && *(p - 2) != ' ')
4349         {
4350             *p = '\t';
4351             start = ++p;
4352             p = StringChr(p, '\n');
4353             continue;
4354         }
4355         if((p - 3) >= start && StringNCmp(p - 3, "(EC", 3) == 0 &&
4356            p[1] >= '0' && p[1] <= '9')
4357         {
4358             start = ++p;
4359             p = StringChr(p, '\n');
4360             continue;
4361         }
4362         if(p[1] == '(' || ((p - 1) >= start && *(p - 1) == ','))
4363         {
4364             start = ++p;
4365             p = StringChr(p, '\n');
4366             continue;
4367         }
4368         *p = '\0';
4369         q = StringChr(start, ' ');
4370         size_t len = StringLen(start);
4371         if(first)
4372         {
4373             first = false;
4374             len += quallen;
4375         }
4376         *p = (q == NULL && len == vallen) ? '\t' : '\n';
4377         start = ++p;
4378         p = StringChr(p, '\n');
4379     }
4380 }
4381 
4382 /**********************************************************/
/* fta_convert_to_lower_case:
 *   Lower-cases the ASCII letters 'A'..'Z' of a NUL-terminated string in
 *   place; every other character is left as is. NULL is ignored.
 */
static void fta_convert_to_lower_case(char* str)
{
    if (str == NULL)
        return;

    for (; *str != '\0'; ++str)
    {
        if (*str < 'A' || *str > 'Z')
            continue;
        /* setting bit 040 turns an ASCII capital into its small letter */
        *str = (char) (*str | 040);
    }
}
4394 
4395 /**********************************************************/
/* fta_process_con_slice:
 *   Inserts a blank after every ',' that is followed by something other
 *   than a blank or the end of the text.
 *
 *   val_buf - buffer holding a NUL-terminated string. When at least one
 *             blank has to be inserted, it is replaced by a new buffer of
 *             val_buf.size() + <number of inserted blanks> + 1 bytes
 *             holding the rewritten, NUL-terminated string; otherwise it
 *             is left untouched.
 *
 *   An empty buffer is ignored, and the scan never reads past the end of
 *   the buffer even if the terminating NUL is missing.
 */
static void fta_process_con_slice(std::vector<char>& val_buf)
{
    if (val_buf.empty())
        return;

    /* Length of the text: up to the first NUL or the end of the buffer.
     */
    const size_t total = val_buf.size();
    size_t text_len = 0;
    while (text_len < total && val_buf[text_len] != '\0')
        ++text_len;

    /* Count the commas that need a blank after them. The last character
     * of the text is followed by the terminator, so it never qualifies.
     */
    size_t extra = 0;
    for (size_t i = 0; i + 1 < text_len; ++i)
        if (val_buf[i] == ',' && val_buf[i + 1] != ' ')
            ++extra;

    if (extra == 0)
        return;

    std::vector<char> buf(total + extra + 1);
    size_t out = 0;
    for (size_t i = 0; i < text_len; ++i)
    {
        buf[out++] = val_buf[i];
        if (val_buf[i] == ',' && i + 1 < text_len && val_buf[i + 1] != ' ')
            buf[out++] = ' ';
    }
    buf[out] = '\0';
    val_buf.swap(buf);
}
4419 
4420 
4421 /**********************************************************
4422  *
4423  *   static void ParseQualifiers(fbp, bptr, eptr,
4424  *                               format):
4425  *
4426  *      Parsing qualifier and put into link list fbp->qual.
4427  *      Some qualifiers may not have value.
4428  *      genbank qualifier format:  /qualifier=value
4429  *      embl qualifier format:     /qualifier= value
4430  *
4431  *                                              10-12-93
4432  *
4433  **********************************************************/
ParseQualifiers(FeatBlkPtr fbp,char * bptr,char * eptr,Parser::EFormat format)4434 static void ParseQualifiers(FeatBlkPtr fbp, char* bptr, char* eptr,
4435                             Parser::EFormat format)
4436 {
4437     const char **b;
4438 
4439     char*    ptr;
4440     char*    str;
4441     char*    qstr;
4442     char*    p;
4443     char*    q;
4444     char*    r;
4445     Char       ch;
4446     Int4       vallen;
4447     Int4       count;
4448     Int2       got;
4449     Int2       quotes;
4450     Int2       reject;
4451 
4452     vallen = (format == Parser::EFormat::EMBL) ? 59 : 58;
4453 
4454     qstr = (char*) MemNew(eptr - bptr + 2);
4455     ch = *eptr;
4456     *eptr = '\0';
4457 
4458     for(p = bptr; *p == ' ' || *p == '\n';)
4459         p++;
4460     for(q = qstr; *p != '\0';)
4461     {
4462         if(*p != ' ' && *p != '\n')
4463         {
4464             *q++ = *p++;
4465             continue;
4466         }
4467 
4468         for(got = 0, r = p; *r == ' ' || *r == '\n'; r++)
4469             if(*r == '\n')
4470                 got = 1;
4471         if(got == 1)
4472         {
4473             *q++ = '\n';
4474             p = r;
4475         }
4476         else
4477             while(*p == ' ')
4478                 *q++ = *p++;
4479     }
4480     if(q == qstr || *(q - 1) != '\n')
4481         *q++ = '\n';
4482     *q = '\0';
4483     *eptr = ch;
4484 
4485     for(str = qstr + 1; *str != '\0';)
4486     {
4487         reject = 0;
4488 
4489         CRef<objects::CGb_qual> qual_new(new objects::CGb_qual);
4490         for(ptr = str; *str != '/' && *str != '=' && *str != '\0' && *str != '\n';)
4491             str++;
4492 
4493         std::string qual_str(ptr, str);
4494         size_t quallen = qual_str.size() + 1;
4495 
4496         NStr::ReplaceInPlace(qual_str, "\n", " ");
4497         NStr::TruncateSpacesInPlace(qual_str, NStr::eTrunc_End);
4498 
4499         if (qual_str == "specific_host")
4500             qual_str = "host";
4501         qual_new->SetQual(qual_str);
4502 
4503         quotes = 0;
4504         if(*str == '=')                 /* get gbq->val */
4505         {
4506             quallen++;
4507             while(*str == '=' || *str == ' ' || *str == '\n')
4508                 str++;
4509 
4510             if(*str == '\"')            /* found open double quote */
4511             {
4512                 quallen++;
4513                 quotes = 1;
4514                 str++;
4515                 ptr = str;
4516 
4517                 /* search first close double quote
4518                  */
4519                 if (qual_str == "note")
4520                 {
4521                     for(;;)
4522                     {
4523                         str = StringChr(str, '\n');
4524                         if(str[1] == '\0')
4525                         {
4526                             if(*(str - 1) == '\"')
4527                             {
4528                                 quotes++;
4529                                 str--;
4530                             }
4531                             break;
4532                         }
4533                         if (str[1] != '/' || !CheckLegalQual(str + 2, '\n', nullptr))
4534                         {
4535                             str++;
4536                             continue;
4537                         }
4538                         if(*(str - 1) == '\"')
4539                         {
4540                             quotes++;
4541                             str--;
4542                         }
4543                         break;
4544                     }
4545                 }
4546                 else
4547                 {
4548                     while(*str != '\"' && *str != '\0')
4549                         str++;
4550                 }
4551             }
4552             else
4553             {
4554                 for(ptr = str; *str != '\0'; str++)
4555                     if(*str == '\n' && str[1] == '/')
4556                     {
4557                         str++;
4558                         break;
4559                     }
4560             }
4561 
4562             std::vector<Char> val_buf(ptr, str);
4563             val_buf.push_back(0);
4564 
4565             if (!val_buf.empty())
4566             {
4567                 fta_set_merge_marks(&val_buf[0], quallen, vallen);
4568 
4569                 std::replace(val_buf.begin(), val_buf.end(), '\n', ' ');
4570                 val_buf.erase(std::remove(val_buf.begin(), val_buf.end(), '\t'), val_buf.end());
4571 
4572                 std::string aux(&val_buf[0]);
4573                 NStr::TruncateSpacesInPlace(aux, NStr::eTrunc_End);
4574                 val_buf.assign(aux.begin(), aux.end());
4575                 val_buf.push_back(0);
4576 
4577                 if(qual_str == "translation" ||
4578                    qual_str == "replace")
4579                 {
4580                     /* delete blanks in the middle of the data
4581                      */
4582                     val_buf.erase(std::remove(val_buf.begin(), val_buf.end(), ' '), val_buf.end());
4583                 }
4584                 else if(qual_str == "rpt_unit")
4585                 {
4586                     fta_convert_to_lower_case(&val_buf[0]);
4587                 }
4588                 else if (qual_str == "cons_splice")
4589                 {
4590                     fta_process_con_slice(val_buf);
4591                 }
4592                 else if (qual_str ==  "note")
4593                 {
4594                     if(quotes == 1)
4595                     {
4596                         if (val_buf.size() > 30)
4597                         {
4598                             ch = val_buf[30];
4599                             val_buf[30] = '\0';
4600                         }
4601                         else
4602                             ch = '\0';
4603                         ErrPostEx(SEV_WARNING,
4604                                   ERR_QUALIFIER_MissingTerminalDoubleQuote,
4605                                   "/note qualifier is not terminated with double quote : [%s%s].",
4606                                   &val_buf[0], (ch == '\0') ? "" : " ...");
4607                         if(ch != '\0')
4608                             val_buf[30] = ch;
4609                     }
4610                     for (quotes = 0, p = &val_buf[0]; *p != '\0'; p++)
4611                     {
4612                         if(*p != '\"')
4613                             continue;
4614 
4615                         if(p[1] != '\"')
4616                         {
4617                             quotes = 1;
4618                             break;
4619                         }
4620                         quotes = !quotes;
4621                         p++;
4622                     }
4623                     if(quotes != 0)
4624                     {
4625                         if (val_buf.size() > 30)
4626                         {
4627                             ch = val_buf[30];
4628                             val_buf[30] = '\0';
4629                         }
4630                         else
4631                             ch = '\0';
4632                         ErrPostEx(SEV_ERROR, ERR_QUALIFIER_UnbalancedQuotes,
4633                                   "/note qualifier value contains unbalanced double-quotes, and has been discarded : [%s%s].",
4634                                   &val_buf[0], (ch == '\0') ? "" : " ...");
4635                         if(ch != '\0')
4636                             val_buf[30] = ch;
4637                         reject = 1;
4638                     }
4639 
4640                     if(fbp != NULL && fbp->key != NULL &&
4641                        StringCmp(fbp->key, "misc_feature") != 0)
4642                     {
4643                         std::string qual;
4644                         for (count = 0, p = &val_buf[0]; ; p++)
4645                         {
4646                             p = StringChr(p, '/');
4647                             if(p == NULL)
4648                                 break;
4649 
4650                             std::string cur_qual;
4651                             if (CheckLegalQual(p + 1, ' ', &cur_qual))
4652                             {
4653                                 if (qual.empty())
4654                                     qual = cur_qual;
4655                                 else
4656                                     count++;
4657                             }
4658                         }
4659 
4660                         if (!qual.empty())
4661                         {
4662                             FtaDeletePrefix(PREFIX_FEATURE);
4663                             if(count == 0)
4664                                 ErrPostEx(SEV_WARNING, ERR_QUALIFIER_EmbeddedQual,
4665                                           "/note contains /%s : FEAT=%s[%s] : %s.",
4666                                           qual.c_str(), fbp->key, fbp->location, &val_buf[0]);
4667                             else
4668                                 ErrPostEx(SEV_WARNING, ERR_QUALIFIER_EmbeddedQual,
4669                                           "/note contains /%s and %d other embedded qualifiers : FEAT=%s[%s] : %s.",
4670                                           qual.c_str(), count, fbp->key, fbp->location, &val_buf[0]);
4671                             FtaInstallPrefix(PREFIX_FEATURE, fbp->key,
4672                                              fbp->location);
4673                         }
4674                     }
4675                 }
4676 
4677                 qual_new->SetVal(&val_buf[0]);
4678             }
4679 
4680             while(*str == ' ' || *str == '\"' || *str == '\n')
4681                 str++;
4682 
4683             /* check any truncated data
4684              */
4685             if(*str != '\0' && *str != '/')
4686             {
4687                 for(ptr = str; *str != '/' && *str != '\0';)
4688                     str++;
4689 
4690                 std::string aux(ptr, str);
4691                 if(str - ptr > 50)
4692                     aux.resize(50);
4693                 NStr::ReplaceInPlace(aux, "\n", " ");
4694 
4695                 ErrPostEx(SEV_WARNING, ERR_FEATURE_DiscardData, "%s", aux.c_str());
4696             }
4697         } /* if, = */
4698 
4699         while(*str == ' ' || *str == '/' || *str == '\"' || *str == '\n')
4700             str++;
4701 
4702         if(reject != 0)
4703             continue;
4704 
4705         if (qual_new->IsSetVal())
4706         {
4707             const std::string& val_str = qual_new->GetVal();
4708             const char* cp = val_str.c_str();
4709             for(; *cp == '\"' || *cp == ' ' || *cp == '\t';)
4710                 ++cp;
4711             if(*cp == '\0')
4712             {
4713                 if(qual_str == "replace")
4714                     qual_new->SetVal("");
4715                 else
4716                     qual_new->ResetVal();
4717             }
4718         }
4719 
4720         for(b = EmptyQuals; *b != NULL; b++)
4721             if (qual_str == *b)
4722                 break;
4723 
4724         if(*b == NULL)
4725         {
4726             if (!qual_new->IsSetVal())
4727             {
4728                 if (qual_str == "old_locus_tag")
4729                     ErrPostEx(SEV_ERROR, ERR_FEATURE_EmptyOldLocusTag,
4730                               "Feature \"%s\" at \"%s\" has an /old_locus_tag qualifier with no value. Qualifier has been dropped.",
4731                               (fbp->key == NULL) ? "Unknown" : fbp->key,
4732                               (fbp->location == NULL) ? "Empty" : fbp->location);
4733                 else
4734                     ErrPostEx(SEV_WARNING, ERR_QUALIFIER_EmptyQual,
4735                               "Qualifier /%s ignored because it lacks a data value. Feature \"%s\", location \"%s\".",
4736                               qual_str.c_str(),
4737                               (fbp->key == NULL) ? "Unknown" : fbp->key,
4738                               (fbp->location == NULL) ? "Empty" : fbp->location);
4739                 continue;
4740             }
4741         }
4742         else if (qual_new->IsSetVal())
4743         {
4744             if (qual_str != "artificial_location" &&
4745                 qual_str != "mobile_element_type")
4746             {
4747                 ErrPostEx(SEV_WARNING, ERR_QUALIFIER_ShouldNotHaveValue,
4748                           "Qualifier /%s should not have data value. Qualifier value has been ignored. Feature \"%s\", location \"%s\".",
4749                           qual_str.c_str(), (fbp->key == NULL) ? "Unknown" : fbp->key,
4750                           (fbp->location == NULL) ? "Empty" : fbp->location);
4751                 qual_new->ResetVal();
4752             }
4753         }
4754 
4755         if (qual_new->IsSetVal() && qual_str == "note")
4756         {
4757             std::string val = qual_new->GetVal();
4758             std::replace(val.begin(), val.end(), '\"', '\'');
4759             qual_new->SetVal(val);
4760         }
4761 
4762         if (qual_new->IsSetQual() && !qual_new->GetQual().empty())
4763             fbp->quals.push_back(qual_new);
4764     }
4765 
4766     MemFree(qstr);
4767 }
4768 
4769 /**********************************************************/
fta_check_satellite(char * str,unsigned char * drop)4770 static void fta_check_satellite(char* str, unsigned char* drop)
4771 {
4772     char* p;
4773     Int2    i;
4774 
4775     if(str == NULL || *str == '\0')
4776         return;
4777 
4778     p = StringChr(str, ':');
4779     if(p != NULL)
4780         *p = '\0';
4781 
4782     i = MatchArrayString(SatelliteValues, str);
4783     if(p != NULL)
4784         *p = ':';
4785     if(i < 0)
4786     {
4787         ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidSatelliteType,
4788                   "/satellite qualifier \"%s\" does not begin with a valid satellite type.",
4789                   str);
4790         *drop = 1;
4791     }
4792     else if(p != NULL && p[1] == '\0')
4793     {
4794         ErrPostEx(SEV_REJECT, ERR_FEATURE_NoSatelliteClassOrIdentifier,
4795                   "/satellite qualifier \"%s\" does not include a class or identifier after the satellite type.",
4796                   str);
4797         *drop = 1;
4798     }
4799 }
4800 
4801 /**********************************************************
4802  *
4803  *   int ParseFeatureBlock(ibp, deb, dbp, source, format):
4804  *
4805  *      Parsing each feature sub-block, dbp, to
4806  *   FeatBlkPtr, fbp.
4807  *      Put warning message if bad qualifier's value or
4808  *   unknown feature key found.
4809  *      dbp->drop = 1, if found unknown feature key, or
4810  *   do not go through 2nd time of qualifiers semantic
4811  *   check (i.e. drop bad qualifier if the value is in illegal
4812  *   format in the 1st time)
4813  *
4814  *                                              11-22-93
4815  *
4816  *      The location begins at column 22, and qualifier
4817  *   begin on subsequent lines at column 22, they may
4818  *   extend from column 22-80.
4819  *      Qualifiers take the form of a slash, "/", followed
4820  *   by the qualifier name and, if applicable, an equal
4821  *   sign, "=", and a value (i.e. some qualifiers only
4822  *   have name w/o value, s.t. /pseudo).
4823  *
4824  *                                              5-4-93
4825  *
4826  **********************************************************/
ParseFeatureBlock(IndexblkPtr ibp,bool deb,DataBlkPtr dbp,Parser::ESource source,Parser::EFormat format)4827 int ParseFeatureBlock(IndexblkPtr ibp, bool deb, DataBlkPtr dbp,
4828                       Parser::ESource source, Parser::EFormat format)
4829 {
4830     char*    bptr;
4831     char*    eptr;
4832     char*    ptr1;
4833     char*    ptr2;
4834     char*    p;
4835     char*    q;
4836     Char       loc[100];
4837     Char       ch;
4838 
4839     FeatBlkPtr fbp;
4840     Int4       num;
4841     size_t     i;
4842     int        retval = GB_FEAT_ERR_NONE;
4843     int        ret;
4844 
4845     if(ibp->is_mga)
4846         sprintf(loc, "1..%ld", ibp->bases);
4847     for(num = 0; dbp != NULL; dbp = dbp->next, num++)
4848     {
4849         fbp = new FeatBlk;
4850         fbp->num = num;
4851         dbp->data = fbp;
4852 
4853         bptr = dbp->offset;
4854         eptr = bptr + dbp->len;
4855 
4856         for(p = bptr; *p != '\n';)
4857             p++;
4858         *p = '\0';
4859         FtaInstallPrefix(PREFIX_FEATURE, (char *) "Parsing FT line: ", bptr);
4860         *p = '\n';
4861         ptr1 = bptr + ParFlat_COL_FEATKEY;
4862         if(*ptr1 == ' ')
4863         {
4864             ErrPostStr(SEV_WARNING, ERR_FEATURE_FeatureKeyReplaced,
4865                        "Empty featkey");
4866         }
4867         for(ptr1 = bptr; *ptr1 == ' ';)
4868             ptr1++;
4869 
4870         for(ptr2 = ptr1; *ptr2 != ' ' && *ptr2 != '\n';)
4871             ptr2++;
4872 
4873         if(StringNCmp(ptr1, "- ", 2) == 0)
4874         {
4875             ErrPostStr(SEV_WARNING, ERR_FEATURE_FeatureKeyReplaced,
4876                        "Featkey '-' is replaced by 'misc_feature'");
4877             fbp->key = StringSave("misc_feature");
4878         }
4879         else
4880             fbp->key = StringSave(std::string(ptr1, ptr2).c_str());
4881 
4882         for(ptr1 = ptr2; *ptr1 == ' ';)
4883             ptr1++;
4884         if(*ptr1 == '\n')
4885         {
4886             if(ibp->is_mga == false)
4887             {
4888                 ErrPostEx(SEV_WARNING, ERR_FEATURE_LocationParsing,
4889                           "Location missing");
4890                 dbp->drop = 1;
4891                 retval = GB_FEAT_ERR_DROP;
4892                 continue;
4893             }
4894         }
4895         else
4896         {
4897             i = ptr1 - bptr;
4898             if(i < ParFlat_COL_FEATDAT)
4899                 ErrPostEx(SEV_WARNING, ERR_FEATURE_LocationParsing,
4900                           "Location data is shifted to the left");
4901             else if(i > ParFlat_COL_FEATDAT)
4902                 ErrPostEx(SEV_WARNING, ERR_FEATURE_LocationParsing,
4903                           "Location data is shifted to the right");
4904         }
4905 
4906         for(ptr2 = ptr1; *ptr2 != '/' && ptr2 < eptr;)
4907             ptr2++;
4908         ch = *ptr2;
4909         *ptr2 = '\0';
4910         fbp->location = StringSave(ptr1);
4911         if(ibp->is_prot)
4912             fta_strip_aa(fbp->location);
4913         *ptr2 = ch;
4914         for(p = fbp->location, q = p; *p != '\0'; p++)
4915             if(*p != ' ' && *p != '\n')
4916                 *q++ = *p;
4917         *q = '\0';
4918 
4919         if(fbp->location[0] == '\0' && ibp->is_mga)
4920         {
4921             MemFree(fbp->location);
4922             fbp->location = StringSave(loc);
4923         }
4924 
4925         FtaInstallPrefix(PREFIX_FEATURE, fbp->key, fbp->location);
4926         if(StringCmp(fbp->key, "allele") == 0 ||
4927            StringCmp(fbp->key, "mutation") == 0)
4928         {
4929             ErrPostEx(SEV_ERROR, ERR_FEATURE_ObsoleteFeature,
4930                       "Obsolete feature \"%s\" found. Replaced with \"variation\".",
4931                       fbp->key);
4932             MemFree(fbp->key);
4933             fbp->key = StringSave("variation");
4934         }
4935 
4936         objects::CSeqFeatData::ESubtype subtype = objects::CSeqFeatData::SubtypeNameToValue(fbp->key);
4937 
4938         if (subtype == objects::CSeqFeatData::eSubtype_bad && !deb)
4939         {
4940             ErrPostEx(SEV_ERROR, ERR_FEATURE_UnknownFeatKey, fbp->key,
4941                       "Feature dropped");
4942             dbp->drop = 1;
4943             retval = GB_FEAT_ERR_DROP;
4944             continue;
4945         }
4946 
4947         if(*ptr2 == '/')                /* qualifier start in first "/" */
4948         {
4949             ParseQualifiers(fbp, ptr2, eptr, format);
4950 
4951             if(StringCmp(fbp->key, "assembly_gap") != 0)
4952             {
4953                 ITERATE(TQualVector, cur, fbp->quals)
4954                 {
4955                     const std::string& cur_qual = (*cur)->GetQual();
4956                     if (cur_qual == "gap_type" ||
4957                         cur_qual == "assembly_evidence")
4958                     {
4959                         ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidQualifier,
4960                                   "Qualifier /%s is invalid for the feature \"%s\" at \"%s\".",
4961                                   cur_qual.c_str(), fbp->key, (fbp->location == NULL) ? "Unknown" : fbp->location);
4962                         ibp->drop = 1;
4963                     }
4964                 }
4965             }
4966 
4967             if(StringCmp(fbp->key, "source") != 0)
4968             {
4969                 ITERATE(TQualVector, cur, fbp->quals)
4970                 {
4971                     const std::string& cur_qual = (*cur)->GetQual();
4972                     if (cur_qual == "submitter_seqid" )
4973                     {
4974                         ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidQualifier,
4975                                   "Qualifier /%s is invalid for the feature \"%s\" at \"%s\".",
4976                                   cur_qual.c_str(), fbp->key, (fbp->location == NULL) ? "Unknown" : fbp->location);
4977                         ibp->drop = 1;
4978                     }
4979                 }
4980             }
4981 
4982             fbp = MergeNoteQual(fbp);   /* allow more than one
4983                                            notes w/i a key */
4984 
4985             if (subtype == objects::CSeqFeatData::eSubtype_bad)
4986             {
4987                 ErrPostStr(SEV_ERROR, ERR_FEATURE_UnknownFeatKey, fbp->key);
4988                 ret = GB_FEAT_ERR_REPAIRABLE;
4989             }
4990             else
4991             {
4992                 /* last argument is perform_corrections if debug
4993                  * mode is FALSE
4994                  */
4995                 ret = XGBFeatKeyQualValid(subtype, fbp->quals, true, (source == Parser::ESource::Flybase ? false : !deb));
4996             }
4997             if(ret > retval)
4998                 retval = ret;
4999 
5000             if(ret > GB_FEAT_ERR_REPAIRABLE &&
5001                StringCmp(fbp->key, "ncRNA") != 0)
5002                 dbp->drop = 1;
5003         }
5004         else if (subtype == objects::CSeqFeatData::eSubtype_bad && !objects::CSeqFeatData::GetMandatoryQualifiers(subtype).empty())
5005         {
5006             if(StringCmp(fbp->key, "mobile_element") != 0)
5007             {
5008                 auto qual_idx = *objects::CSeqFeatData::GetMandatoryQualifiers(subtype).begin();
5009                 std::string str1 = objects::CSeqFeatData::GetQualifierAsString(qual_idx);
5010                 const char *str = str1.c_str();
5011                 if((StringCmp(fbp->key, "old_sequence") != 0 &&
5012                     StringCmp(fbp->key, "conflict") != 0) ||
5013                    StringCmp(str, "citation") != 0)
5014                 {
5015                     ErrPostEx(SEV_ERROR, ERR_FEATURE_RequiredQualifierMissing,
5016                               "lacks required /%s qualifier : feature has been dropped.",
5017                               str);
5018                     if(!deb)
5019                     {
5020                         dbp->drop = 1;
5021                         retval = GB_FEAT_ERR_DROP;
5022                     }
5023                 }
5024             }
5025         }
5026         else if(StringCmp(fbp->key, "misc_feature") == 0 && fbp->quals.empty())
5027         {
5028             if (!deb)
5029             {
5030                 dbp->drop = 1;
5031                 retval = GB_FEAT_ERR_DROP;
5032                 ErrPostStr(SEV_WARNING, ERR_FEATURE_Dropped,
5033                            "Empty 'misc_feature' dropped");
5034             }
5035             else
5036                 retval = GB_FEAT_ERR_REPAIRABLE;
5037         }
5038 
5039         NON_CONST_ITERATE(TQualVector, cur, fbp->quals)
5040         {
5041             if (!(*cur)->IsSetQual() || !(*cur)->IsSetVal())
5042                 continue;
5043 
5044             const std::string& qual_str = (*cur)->GetQual();
5045             const std::string& val_str = (*cur)->GetVal();
5046 
5047             std::vector<Char> val_buf(val_str.begin(), val_str.end());
5048             val_buf.push_back(0);
5049 
5050             p = &val_buf[0];
5051             ShrinkSpaces(p);
5052             if (*p == '\0' && qual_str != "replace")
5053             {
5054                 (*cur)->ResetVal();
5055                 val_buf[0] = 0;
5056             }
5057             else
5058             {
5059                 if (qual_str == "replace")
5060                     fta_convert_to_lower_case(p);
5061                 (*cur)->SetVal(p);
5062             }
5063 
5064             if (qual_str == "satellite")
5065                 fta_check_satellite(&val_buf[0], &ibp->drop);
5066         }
5067     } /* for, each sub-block, or each feature key */
5068     FtaDeletePrefix(PREFIX_FEATURE);
5069     return(retval);
5070 }
5071 
5072 /**********************************************************/
XMLCheckQualifiers(FeatBlkPtr fbp)5073 static void XMLCheckQualifiers(FeatBlkPtr fbp)
5074 {
5075     const char **b;
5076     char*    p;
5077     Char       ch;
5078 
5079     if(fbp == NULL || fbp->quals.empty())
5080         return;
5081 
5082     for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end();)
5083     {
5084         const std::string& qual_str = (*cur)->GetQual();
5085 
5086         if ((*cur)->IsSetVal())
5087         {
5088             const std::string& val_str = (*cur)->GetVal();
5089             std::vector<Char> val_buf(val_str.begin(), val_str.end());
5090             val_buf.push_back(0);
5091 
5092             if (qual_str == "translation")
5093             {
5094                 DelCharBtwData(&val_buf[0]);
5095             }
5096             else if (qual_str == "rpt_unit")
5097             {
5098                 fta_convert_to_lower_case(&val_buf[0]);
5099             }
5100             else if (qual_str == "cons_splice")
5101             {
5102                 fta_process_con_slice(val_buf);
5103             }
5104             else if (qual_str == "note")
5105             {
5106                 for(p = &val_buf[0];;)
5107                 {
5108                     p = StringChr(p, '/');
5109                     if(p == NULL)
5110                         break;
5111                     p++;
5112                     if (!CheckLegalQual(p, ' ', nullptr))
5113                         continue;
5114 
5115                     if (val_buf.size() > 30)
5116                     {
5117                         ch = val_buf[30];
5118                         val_buf[30] = '\0';
5119                     }
5120                     else
5121                         ch = '\0';
5122                     ErrPostEx(SEV_WARNING, ERR_QUALIFIER_EmbeddedQual,
5123                               "/note qualifier value appears to contain other qualifiers : [%s%s].",
5124                               &val_buf[0], (ch == '\0') ? "" : " ...");
5125                     if(ch != '\0')
5126                         val_buf[30] = ch;
5127                 }
5128             }
5129 
5130             for (p = &val_buf[0]; *p == '\"' || *p == ' ' || *p == '\t';)
5131                 p++;
5132 
5133             if(*p == '\0')
5134             {
5135                 if (qual_str == "replace")
5136                 {
5137                     (*cur)->SetVal("");
5138                 }
5139                 else
5140                     (*cur)->ResetVal();
5141             }
5142             else
5143                 (*cur)->SetVal(&val_buf[0]);
5144         }
5145 
5146         for (b = EmptyQuals; *b != NULL; b++)
5147             if (qual_str == *b)
5148                 break;
5149 
5150         if (*b == NULL)
5151         {
5152             if (!(*cur)->IsSetVal())
5153             {
5154                 if (qual_str == "old_locus_tag")
5155                     ErrPostEx(SEV_ERROR, ERR_FEATURE_EmptyOldLocusTag,
5156                               "Feature \"%s\" at \"%s\" has an /old_locus_tag qualifier with no value. Qualifier has been dropped.",
5157                               (fbp->key == NULL) ? "Unknown" : fbp->key,
5158                               (fbp->location == NULL) ? "Empty" : fbp->location);
5159                 else
5160                     ErrPostEx(SEV_WARNING, ERR_QUALIFIER_EmptyQual,
5161                               "Qualifier /%s ignored because it lacks a data value. Feature \"%s\", location \"%s\".",
5162                               qual_str.c_str(),
5163                               (fbp->key == NULL) ? "Unknown" : fbp->key,
5164                               (fbp->location == NULL) ? "Empty" : fbp->location);
5165 
5166                 cur = fbp->quals.erase(cur);
5167                 continue;
5168             }
5169         }
5170         else if ((*cur)->IsSetVal())
5171         {
5172             ErrPostEx(SEV_WARNING, ERR_QUALIFIER_ShouldNotHaveValue,
5173                       "Qualifier /%s should not have data value. Qualifier value has been ignored. Feature \"%s\", location \"%s\".",
5174                       qual_str.c_str(), (fbp->key == NULL) ? "Unknown" : fbp->key,
5175                       (fbp->location == NULL) ? "Empty" : fbp->location);
5176 
5177             (*cur)->ResetVal();
5178         }
5179 
5180         if ((*cur)->IsSetVal() && qual_str == "note")
5181         {
5182             std::string val = (*cur)->GetVal();
5183             std::replace(val.begin(), val.end(), '\"', '\'');
5184             (*cur)->SetVal(val);
5185         }
5186 
5187         ++cur;
5188     }
5189 }
5190 
5191 /**********************************************************/
XMLParseFeatureBlock(bool deb,DataBlkPtr dbp,Parser::ESource source)5192 static int XMLParseFeatureBlock(bool deb, DataBlkPtr dbp, Parser::ESource source)
5193 {
5194     FeatBlkPtr fbp;
5195     char*    p;
5196     Int4       num;
5197     int        retval = GB_FEAT_ERR_NONE;
5198     int        ret;
5199 
5200     for(num = 0; dbp != NULL; dbp = dbp->next, num++)
5201     {
5202         if(dbp->data == NULL)
5203             continue;
5204         fbp = (FeatBlkPtr) dbp->data;
5205         fbp->num = num;
5206         FtaInstallPrefix(PREFIX_FEATURE, fbp->key, fbp->location);
5207 
5208         if(fbp->key[0] == '-' && fbp->key[1] == '\0')
5209         {
5210             ErrPostStr(SEV_WARNING, ERR_FEATURE_FeatureKeyReplaced,
5211                        "Featkey '-' is replaced by 'misc_feature'");
5212             MemFree(fbp->key);
5213             fbp->key = StringSave("misc_feature");
5214         }
5215 
5216         if(StringCmp(fbp->key, "allele") == 0 ||
5217            StringCmp(fbp->key, "mutation") == 0)
5218         {
5219             ErrPostEx(SEV_ERROR, ERR_FEATURE_ObsoleteFeature,
5220                       "Obsolete feature \"%s\" found. Replaced with \"variation\".",
5221                       fbp->key);
5222             MemFree(fbp->key);
5223             fbp->key = StringSave("variation");
5224         }
5225 
5226         objects::CSeqFeatData::ESubtype subtype = objects::CSeqFeatData::SubtypeNameToValue(fbp->key);
5227 
5228         if (subtype == objects::CSeqFeatData::eSubtype_bad && !deb)
5229         {
5230             ErrPostEx(SEV_ERROR, ERR_FEATURE_UnknownFeatKey, fbp->key,
5231                       "Feature dropped");
5232             dbp->drop = 1;
5233             retval = GB_FEAT_ERR_DROP;
5234             continue;
5235         }
5236 
5237         if (!fbp->quals.empty())
5238         {
5239             XMLCheckQualifiers(fbp);
5240             fbp = MergeNoteQual(fbp);   /* allow more than one
5241                                            notes w/i a key */
5242 
5243             if (subtype == objects::CSeqFeatData::eSubtype_bad)
5244             {
5245                 ErrPostStr(SEV_ERROR, ERR_FEATURE_UnknownFeatKey, fbp->key);
5246                 ret = GB_FEAT_ERR_REPAIRABLE;
5247             }
5248             else
5249             {
5250                 /* last argument is perform_corrections if debug
5251                  * mode is FALSE
5252                  */
5253                 ret = XGBFeatKeyQualValid(subtype, fbp->quals, true, ((source == Parser::ESource::Flybase) ? false : !deb));
5254             }
5255             if(ret > retval)
5256                 retval = ret;
5257 
5258             if(ret > GB_FEAT_ERR_REPAIRABLE &&
5259                StringCmp(fbp->key, "ncRNA") != 0)
5260                 dbp->drop = 1;
5261         }
5262         else if (subtype == objects::CSeqFeatData::eSubtype_bad && !objects::CSeqFeatData::GetMandatoryQualifiers(subtype).empty())
5263         {
5264             if(StringCmp(fbp->key, "mobile_element") != 0)
5265             {
5266                 auto qual_idx = *objects::CSeqFeatData::GetMandatoryQualifiers(subtype).begin();
5267                 std::string str1 = objects::CSeqFeatData::GetQualifierAsString(qual_idx);
5268                 const char *str = str1.c_str();
5269                 if((StringCmp(fbp->key, "old_sequence") != 0 &&
5270                     StringCmp(fbp->key, "conflict") != 0) ||
5271                    StringCmp(str, "citation") != 0)
5272                 {
5273                     ErrPostEx(SEV_ERROR, ERR_FEATURE_RequiredQualifierMissing,
5274                               "lacks required /%s qualifier : feature has been dropped.",
5275                               str);
5276                     if(!deb)
5277                     {
5278                         dbp->drop = 1;
5279                         retval = GB_FEAT_ERR_DROP;
5280                     }
5281                 }
5282             }
5283         }
5284         else if(StringCmp(fbp->key, "misc_feature") == 0 && fbp->quals.empty())
5285         {
5286             if (!deb)
5287             {
5288                 dbp->drop = 1;
5289                 retval = GB_FEAT_ERR_DROP;
5290                 ErrPostStr(SEV_WARNING, ERR_FEATURE_Dropped,
5291                            "Empty 'misc_feature' dropped");
5292             }
5293             else
5294                 retval = GB_FEAT_ERR_REPAIRABLE;
5295         }
5296 
5297         NON_CONST_ITERATE(TQualVector, cur, fbp->quals)
5298         {
5299             if (!(*cur)->IsSetQual() || !(*cur)->IsSetVal())
5300                 continue;
5301 
5302             const std::string& qual_str = (*cur)->GetQual();
5303             const std::string& val_str = (*cur)->GetVal();
5304 
5305             std::vector<Char> val_buf(val_str.begin(), val_str.end());
5306             val_buf.push_back(0);
5307 
5308             p = &val_buf[0];
5309             ShrinkSpaces(p);
5310             if (*p == '\0' && qual_str != "replace")
5311             {
5312                 (*cur)->ResetVal();
5313                 val_buf[0] = 0;
5314             }
5315             else
5316             {
5317                 if (qual_str == "replace")
5318                     fta_convert_to_lower_case(p);
5319                 (*cur)->SetVal(p);
5320             }
5321         }
5322     } /* for, each sub-block, or each feature key */
5323     FtaDeletePrefix(PREFIX_FEATURE);
5324     return(retval);
5325 }
5326 
5327 /**********************************************************/
fta_check_ncrna(const objects::CSeq_feat & feat)5328 static bool fta_check_ncrna(const objects::CSeq_feat& feat)
5329 {
5330     char*   p;
5331     Int4      count = 0;
5332 
5333     bool stop = false;
5334     ITERATE(objects::CSeq_feat::TQual, qual, feat.GetQual())
5335     {
5336         if (!(*qual)->IsSetQual() || (*qual)->GetQual().empty() ||
5337             (*qual)->GetQual() != "ncRNA_class")
5338             continue;
5339 
5340         count++;
5341 
5342         if (!(*qual)->IsSetVal() || (*qual)->GetVal().empty())
5343         {
5344             p = location_to_string_or_unknown(feat.GetLocation());
5345 
5346             ErrPostEx(SEV_REJECT, ERR_FEATURE_ncRNA_class,
5347                       "Feature \"ncRNA\" at location \"%s\" has an empty /ncRNA_class qualifier.",
5348                       (p == NULL) ? "unknown" : p);
5349 
5350             if(p != NULL)
5351                 MemFree(p);
5352 
5353             stop = true;
5354             break;
5355         }
5356 
5357         if (MatchArrayString(ncRNA_class_values, (*qual)->GetVal().c_str()) < 0)
5358         {
5359             p = location_to_string_or_unknown(feat.GetLocation());
5360 
5361             ErrPostEx(SEV_REJECT, ERR_FEATURE_ncRNA_class,
5362                       "Feature \"ncRNA\" at location \"%s\" has an invalid /ncRNA_class qualifier: \"%s\".",
5363                       (p == NULL) ? "unknown" : p, (*qual)->GetVal().c_str());
5364 
5365             if (p != NULL)
5366                 MemFree(p);
5367 
5368             stop = true;
5369             break;
5370         }
5371     }
5372 
5373     if (stop)
5374         return false;
5375 
5376     if (count == 1)
5377         return true;
5378 
5379     p = location_to_string_or_unknown(feat.GetLocation());
5380 
5381     ErrPostEx(SEV_REJECT, ERR_FEATURE_ncRNA_class,
5382               "Feature \"ncRNA\" at location \"%s\" %s /ncRNA_class qualifier.",
5383               (p == NULL) ? "unknown" : p,
5384               (count == 0) ? "lacks the mandatory" : "has more than one");
5385 
5386     if(p != NULL)
5387         MemFree(p);
5388 
5389     return false;
5390 }
5391 
5392 /**********************************************************/
fta_check_artificial_location(objects::CSeq_feat & feat,char * key)5393 static void fta_check_artificial_location(objects::CSeq_feat& feat, char* key)
5394 {
5395     NON_CONST_ITERATE(objects::CSeq_feat::TQual, qual, feat.SetQual())
5396     {
5397         if (!(*qual)->IsSetQual() || (*qual)->GetQual() != "artificial_location")
5398             continue;
5399 
5400         if ((*qual)->IsSetVal())
5401         {
5402             const Char* p_val = (*qual)->GetVal().c_str();
5403             for (; *p_val == '\"';)
5404                 ++p_val;
5405 
5406             if (*p_val == '\0')
5407                 (*qual)->ResetVal();
5408         }
5409 
5410         std::string val = (*qual)->IsSetVal() ? (*qual)->GetVal() : "";
5411 
5412         if (val == "heterogenous population sequenced" ||
5413             val == "low-quality sequence region")
5414         {
5415             feat.SetExcept(true);
5416 
5417             if (!feat.IsSetExcept_text())
5418                 feat.SetExcept_text(val);
5419             else
5420             {
5421                 std::string& except_text = feat.SetExcept_text();
5422                 except_text += ", ";
5423                 except_text += val;
5424             }
5425         }
5426         else
5427         {
5428             Char* cstr = location_to_string_or_unknown(feat.GetLocation());
5429             std::string loc_str = cstr;
5430             MemFree(cstr);
5431 
5432             if (val.empty())
5433                 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidArtificialLoc,
5434                           "Encountered empty /artificial_location qualifier : Feature \"%s\" : Location \"%s\". Qualifier dropped.",
5435                           (key == NULL || *key == '\0') ? "unknown" : key,
5436                           loc_str.empty() ? "unknown" : loc_str.c_str());
5437             else
5438                 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidArtificialLoc,
5439                           "Value \"%s\" is not legal for the /artificial_location qualifier : Feature \"%s\" : Location \"%s\". Qualifier dropped.",
5440                           val.c_str(),
5441                           (key == NULL || *key == '\0') ? "unknown" : key,
5442                           loc_str.empty() ? "unknown" : loc_str.c_str());
5443         }
5444 
5445         feat.SetQual().erase(qual);
5446         break;
5447     }
5448 }
5449 
5450 /**********************************************************/
fta_check_mobile_element(const objects::CSeq_feat & feat)5451 static bool fta_check_mobile_element(const objects::CSeq_feat& feat)
5452 {
5453     bool found = false;
5454     ITERATE(objects::CSeq_feat::TQual, qual, feat.GetQual())
5455     {
5456         if ((*qual)->IsSetQual() && (*qual)->GetQual() == "mobile_element_type" &&
5457             (*qual)->IsSetVal() && !(*qual)->GetVal().empty())
5458         {
5459             const Char* p_val = (*qual)->GetVal().c_str();
5460             for (; *p_val == '\"';)
5461                 ++p_val;
5462 
5463             if (*p_val != '\0')
5464             {
5465                 found = true;
5466                 break;
5467             }
5468         }
5469     }
5470 
5471     if (found)
5472         return true;
5473 
5474     Char* cstr = location_to_string_or_unknown(feat.GetLocation());
5475     std::string loc_str = cstr;
5476     MemFree(cstr);
5477 
5478     ErrPostEx(SEV_REJECT, ERR_FEATURE_RequiredQualifierMissing,
5479               "Mandatory qualifier /mobile_element_type is absent or has no value : Feature \"mobile_element\" : Location \"%s\". Entry dropped.",
5480               loc_str.empty() ? "unknown" : loc_str.c_str());
5481 
5482     return false;
5483 }
5484 
5485 /**********************************************************/
SortFeaturesByLoc(const DataBlkPtr & sp1,const DataBlkPtr & sp2)5486 static bool SortFeaturesByLoc(const DataBlkPtr& sp1, const DataBlkPtr& sp2)
5487 {
5488     FeatBlkPtr      fbp1;
5489     FeatBlkPtr      fbp2;
5490     Int4            status;
5491 
5492     fbp1 = (FeatBlkPtr) sp1->data;
5493     fbp2 = (FeatBlkPtr) sp2->data;
5494 
5495     if(fbp1->location == NULL && fbp2->location != NULL)
5496         return false;
5497     if(fbp1->location != NULL && fbp2->location == NULL)
5498         return false;
5499 
5500     if (fbp1->location != NULL && fbp2->location != NULL)
5501     {
5502         status = StringCmp(fbp1->location, fbp2->location);
5503         if (status != 0)
5504             return status < 0;
5505     }
5506 
5507     if(fbp1->key == NULL && fbp2->key != NULL)
5508         return false;
5509     if (fbp1->key != NULL && fbp2->key == NULL)
5510         return false;
5511     if (fbp1->key != NULL && fbp2->key != NULL)
5512     {
5513         status = StringCmp(fbp1->key, fbp2->key);
5514         if (status != 0)
5515             return status < 0;
5516     }
5517 
5518     return false;
5519 }
5520 
5521 /**********************************************************/
SortFeaturesByOrder(const DataBlkPtr & sp1,const DataBlkPtr & sp2)5522 static bool SortFeaturesByOrder(const DataBlkPtr& sp1, const DataBlkPtr& sp2)
5523 {
5524     FeatBlkPtr      fbp1;
5525     FeatBlkPtr      fbp2;
5526 
5527     fbp1 = (FeatBlkPtr) sp1->data;
5528     fbp2 = (FeatBlkPtr) sp2->data;
5529 
5530     return fbp1->num < fbp2->num;
5531 }
5532 
5533 /**********************************************************/
fta_sort_features(DataBlkPtr dbp,bool order)5534 static DataBlkPtr fta_sort_features(DataBlkPtr dbp, bool order)
5535 {
5536     DataBlkPtr* temp;
5537     DataBlkPtr      tdbp;
5538     Int4            total;
5539     Int4            i;
5540 
5541     for(total = 0, tdbp = dbp; tdbp != NULL; tdbp = tdbp->next)
5542         total++;
5543 
5544     temp = (DataBlkPtr*) MemNew(total * sizeof(DataBlkPtr));
5545 
5546     for(i = 0, tdbp = dbp; tdbp != NULL; tdbp = tdbp->next)
5547         temp[i++] = tdbp;
5548 
5549     std::sort(temp, temp + i, (order ? SortFeaturesByOrder : SortFeaturesByLoc));
5550 
5551     dbp = tdbp = temp[0];
5552     for(i = 0; i < total - 1; tdbp = tdbp->next, i++)
5553         tdbp->next = temp[i+1];
5554 
5555     tdbp = temp[total-1];
5556     tdbp->next = NULL;
5557 
5558     MemFree(temp);
5559 
5560     return(dbp);
5561 }
5562 
5563 /**********************************************************/
fta_convert_to_regulatory(FeatBlkPtr fbp,const char * rclass)5564 static void fta_convert_to_regulatory(FeatBlkPtr fbp, const char *rclass)
5565 {
5566     if(fbp == NULL || fbp->key == NULL || rclass == NULL)
5567         return;
5568 
5569     if(fbp->key != NULL)
5570         MemFree(fbp->key);
5571     fbp->key = StringSave("regulatory");
5572 
5573     CRef<objects::CGb_qual> qual(new objects::CGb_qual);
5574     qual->SetQual("regulatory_class");
5575     qual->SetVal(rclass);
5576     fbp->quals.push_back(qual);
5577 }
5578 
5579 /**********************************************************/
fta_check_replace_regulatory(DataBlkPtr dbp,unsigned char * drop)5580 static void fta_check_replace_regulatory(DataBlkPtr dbp, unsigned char* drop)
5581 {
5582     FeatBlkPtr fbp;
5583     const char **b;
5584     char*    p;
5585     bool       got_note;
5586     bool       other_class;
5587     Int4       count;
5588     Char       ch;
5589 
5590     for(; dbp != NULL; dbp = dbp->next)
5591     {
5592         fbp = (FeatBlkPtr) dbp->data;
5593         if(fbp == NULL || fbp->key == NULL)
5594             continue;
5595 
5596         if(StringCmp(fbp->key, "attenuator") == 0)
5597             fta_convert_to_regulatory(fbp, "attenuator");
5598         else if(StringCmp(fbp->key, "CAAT_signal") == 0)
5599             fta_convert_to_regulatory(fbp, "CAAT_signal");
5600         else if(StringCmp(fbp->key, "enhancer") == 0)
5601             fta_convert_to_regulatory(fbp, "enhancer");
5602         else if(StringCmp(fbp->key, "GC_signal") == 0)
5603             fta_convert_to_regulatory(fbp, "GC_signal");
5604         else if(StringCmp(fbp->key, "-35_signal") == 0)
5605             fta_convert_to_regulatory(fbp, "minus_35_signal");
5606         else if(StringCmp(fbp->key, "-10_signal") == 0)
5607             fta_convert_to_regulatory(fbp, "minus_10_signal");
5608         else if(StringCmp(fbp->key, "polyA_signal") == 0)
5609             fta_convert_to_regulatory(fbp, "polyA_signal_sequence");
5610         else if(StringCmp(fbp->key, "promoter") == 0)
5611             fta_convert_to_regulatory(fbp, "promoter");
5612         else if(StringCmp(fbp->key, "RBS") == 0)
5613             fta_convert_to_regulatory(fbp, "ribosome_binding_site");
5614         else if(StringCmp(fbp->key, "TATA_signal") == 0)
5615             fta_convert_to_regulatory(fbp, "TATA_box");
5616         else if(StringCmp(fbp->key, "terminator") == 0)
5617             fta_convert_to_regulatory(fbp, "terminator");
5618         else if(StringCmp(fbp->key, "regulatory") != 0)
5619             continue;
5620 
5621         got_note = false;
5622         other_class = false;
5623         count = 0;
5624 
5625         ITERATE(TQualVector, cur, fbp->quals)
5626         {
5627             if (!(*cur)->IsSetQual() || !(*cur)->IsSetVal())
5628                 continue;
5629 
5630             const std::string& qual_str = (*cur)->GetQual();
5631 
5632             if (qual_str != "regulatory_class")
5633             {
5634                 if (qual_str == "note")
5635                     got_note = true;
5636                 continue;
5637             }
5638 
5639             count++;
5640             if (!(*cur)->IsSetVal() || (*cur)->GetVal().empty())
5641             {
5642                 ch = '\0';
5643                 if(fbp->location == NULL || *fbp->location == '\0')
5644                     p = (char*) "(empty)";
5645                 else
5646                 {
5647                     p = fbp->location;
5648                     if(StringLen(p) > 50)
5649                     {
5650                         ch = p[50];
5651                         p[50] = '\0';
5652                     }
5653                 }
5654                 ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidRegulatoryClass,
5655                           "Empty /regulatory_class qualifier value in regulatory feature at location %s.",
5656                           p);
5657                 if(ch != '\0')
5658                     p[50] = ch;
5659                 *drop = 1;
5660                 continue;
5661             }
5662 
5663             const std::string& val_str = (*cur)->GetVal();
5664 
5665             for (b = RegulatoryClassValues; *b != NULL; b++)
5666                 if (val_str == *b)
5667                     break;
5668 
5669             if(*b != NULL)
5670             {
5671                 if (val_str == "other")
5672                     other_class = true;
5673                 continue;
5674             }
5675 
5676             ch = '\0';
5677             if(fbp->location == NULL || *fbp->location == '\0')
5678                 p = (char*) "(empty)";
5679             else
5680             {
5681                 p = fbp->location;
5682                 if(StringLen(p) > 50)
5683                 {
5684                     ch = p[50];
5685                     p[50] = '\0';
5686                 }
5687             }
5688             ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidRegulatoryClass,
5689                       "Invalid /regulatory_class qualifier value %s provided in regulatory feature at location %s.",
5690                       val_str.c_str(), p);
5691             if(ch != '\0')
5692                 p[50] = ch;
5693             *drop = 1;
5694         }
5695 
5696         if(count == 0)
5697         {
5698             ch = '\0';
5699             if(fbp->location == NULL || *fbp->location == '\0')
5700                 p = (char*) "(empty)";
5701             else
5702             {
5703                 p = fbp->location;
5704                 if(StringLen(p) > 50)
5705                 {
5706                     ch = p[50];
5707                     p[50] = '\0';
5708                 }
5709             }
5710             ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MissingRegulatoryClass,
5711                       "The regulatory feature is missing mandatory /regulatory_class qualifier at location %s.",
5712                       p);
5713             if(ch != '\0')
5714                 p[50] = ch;
5715             *drop = 1;
5716         }
5717         else if(count > 1)
5718         {
5719             ch = '\0';
5720             if(fbp->location == NULL || *fbp->location == '\0')
5721                 p = (char*) "(empty)";
5722             else
5723             {
5724                 p = fbp->location;
5725                 if(StringLen(p) > 50)
5726                 {
5727                     ch = p[50];
5728                     p[50] = '\0';
5729                 }
5730             }
5731             ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MultipleRegulatoryClass,
5732                       "Multiple /regulatory_class qualifiers were encountered in regulatory feature at location %s.",
5733                       p);
5734             if(ch != '\0')
5735                 p[50] = ch;
5736             *drop = 1;
5737         }
5738 
5739         if(other_class && !got_note)
5740         {
5741             ch = '\0';
5742             if(fbp->location == NULL || *fbp->location == '\0')
5743                 p = (char*) "(empty)";
5744             else
5745             {
5746                 p = fbp->location;
5747                 if(StringLen(p) > 50)
5748                 {
5749                     ch = p[50];
5750                     p[50] = '\0';
5751                 }
5752             }
5753             ErrPostEx(SEV_REJECT, ERR_QUALIFIER_NoNoteForOtherRegulatory,
5754                       "The regulatory feature of class other is lacking required /note qualifier at location %s.",
5755                       p);
5756             if(ch != '\0')
5757                 p[50] = ch;
5758             *drop = 1;
5759         }
5760     }
5761 }
5762 
5763 /**********************************************************/
fta_create_wgs_dbtag(objects::CBioseq & bioseq,char * submitter_seqid,char * prefix,Int4 seqtype)5764 static void fta_create_wgs_dbtag(objects::CBioseq &bioseq,
5765                                  char* submitter_seqid,
5766                                  char* prefix, Int4 seqtype)
5767 {
5768     char* dbname;
5769 
5770     dbname = (char*) MemNew(11);
5771     if(seqtype == 0 || seqtype == 1 || seqtype == 7)
5772         StringCpy(dbname, "WGS:");
5773     else if(seqtype == 4 || seqtype == 5 || seqtype == 8 || seqtype == 9)
5774         StringCpy(dbname, "TSA:");
5775     else
5776         StringCpy(dbname, "TLS:");
5777     StringCat(dbname, prefix);
5778 
5779     CRef<objects::CSeq_id> gen_id(new objects::CSeq_id);
5780     objects::CDbtag &tag = gen_id->SetGeneral();
5781     tag.SetTag().SetStr(submitter_seqid);
5782     tag.SetDb(dbname);
5783     bioseq.SetId().push_back(gen_id);
5784 }
5785 
5786 /**********************************************************/
fta_create_wgs_seqid(objects::CBioseq & bioseq,IndexblkPtr ibp,Parser::ESource source)5787 static void fta_create_wgs_seqid(objects::CBioseq &bioseq,
5788                                  IndexblkPtr ibp, Parser::ESource source)
5789 {
5790     TokenBlkPtr tbp;
5791     char*     prefix;
5792     char*     p;
5793     Int4        seqtype;
5794     Int4        i;
5795 
5796     if(!ibp || !ibp->submitter_seqid)
5797         return;
5798 
5799     prefix = NULL;
5800 
5801     seqtype = fta_if_wgs_acc(ibp->acnum);
5802     if(seqtype == 0 || seqtype == 3 || seqtype == 4 || seqtype == 6 ||
5803        seqtype == 10 || seqtype == 12)
5804     {
5805         ErrPostEx(SEV_REJECT, ERR_SOURCE_SubmitterSeqidNotAllowed,
5806                  "WGS/TLS/TSA master records are not allowed to have /submitter_seqid qualifiers, only contigs and scaffolds. Entry dropped.");
5807         ibp->drop = 1;
5808         return;
5809     }
5810 
5811     if(seqtype == 1 || seqtype == 5 || seqtype == 7 || seqtype == 8 ||
5812        seqtype == 9 || seqtype == 11)
5813     {
5814         prefix = StringSave(ibp->acnum);
5815         if(prefix[4] >= '0' && prefix[4] <= '9')
5816             prefix[6] = '\0';
5817         else
5818             prefix[8] = '\0';
5819         fta_create_wgs_dbtag(bioseq, ibp->submitter_seqid, prefix, seqtype);
5820         MemFree(prefix);
5821         return;
5822     }
5823 
5824     for(tbp = ibp->secaccs; tbp != NULL; tbp = tbp->next)
5825     {
5826         if(tbp->str[0] == '-')
5827             continue;
5828 
5829         if(prefix == NULL)
5830             prefix = StringSave(tbp->str);
5831         else
5832         {
5833             i = (prefix[4] >= '0' && prefix[4] <= '9') ? 6 : 8;
5834             if(StringNCmp(prefix, tbp->str, i) != 0)
5835                 break;
5836         }
5837     }
5838 
5839     if(tbp == NULL && prefix != NULL)
5840     {
5841         seqtype = fta_if_wgs_acc(prefix);
5842         if(seqtype == 0 || seqtype == 1 || seqtype == 4 || seqtype == 5 ||
5843            seqtype == 7 || seqtype == 8 || seqtype == 9 || seqtype == 10 ||
5844            seqtype == 11)
5845         {
5846             if(prefix[4] >= '0' && prefix[4] <= '9')
5847                 prefix[6] = '\0';
5848             else
5849                 prefix[8] = '\0';
5850             fta_create_wgs_dbtag(bioseq, ibp->submitter_seqid, prefix,
5851                                  seqtype);
5852             MemFree(prefix);
5853             return;
5854         }
5855     }
5856 
5857     if(prefix != NULL)
5858     {
5859         MemFree(prefix);
5860         prefix = NULL;
5861     }
5862 
5863     if(bioseq.GetInst().IsSetExt() && bioseq.GetInst().GetExt().IsDelta())
5864     {
5865         objects::CDelta_ext::Tdata deltas =
5866             bioseq.GetInst().GetExt().GetDelta();
5867         objects::CDelta_ext::Tdata::iterator delta;
5868 
5869         for(delta = deltas.begin(); delta != deltas.end(); delta++)
5870         {
5871             const objects::CSeq_id *id = nullptr;
5872 
5873             if(!(*delta)->IsLoc())
5874                 continue;
5875 
5876             const objects::CSeq_loc &locs = (*delta)->GetLoc();
5877             objects::CSeq_loc_CI ci(locs);
5878 
5879             for(; ci; ++ci)
5880             {
5881                 CConstRef<objects::CSeq_loc> loc =
5882                     ci.GetRangeAsSeq_loc();
5883                 if(!loc->IsInt())
5884                     continue;
5885                 id = &ci.GetSeq_id();
5886                 if(!id)
5887                     break;
5888                 if(!id->IsGenbank() && !id->IsEmbl() && !id->IsDdbj() &&
5889                    !id->IsOther() && !id->IsTpg() && !id->IsTpe() &&
5890                    !id->IsTpd())
5891                     break;
5892 
5893                 const objects::CTextseq_id *text_id =
5894                     id->GetTextseq_Id();
5895                 if(text_id == nullptr || !text_id->IsSetAccession() ||
5896                    text_id->GetAccession().empty())
5897                     break;
5898 
5899                 p = (char *) text_id->GetAccession().c_str();
5900                 if(prefix == NULL)
5901                     prefix = StringSave(p);
5902                 else
5903                 {
5904                     i = (prefix[4] >= '0' && prefix[4] <= '9') ? 6 : 8;
5905                     if(StringNCmp(prefix, p, i) != 0)
5906                         break;
5907                 }
5908             }
5909             if(ci)
5910                 break;
5911         }
5912 
5913         if(delta == deltas.end() && prefix != NULL)
5914         {
5915             seqtype = fta_if_wgs_acc(prefix);
5916             if(seqtype == 0 || seqtype == 1 || seqtype == 4 || seqtype == 5 ||
5917                seqtype == 7 || seqtype == 8 || seqtype == 9 || seqtype == 10 ||
5918                seqtype == 11)
5919             {
5920                 if(prefix[4] >= '0' && prefix[4] <= '9')
5921                     prefix[6] = '\0';
5922                 else
5923                     prefix[8] = '\0';
5924                 fta_create_wgs_dbtag(bioseq, ibp->submitter_seqid, prefix,
5925                                      seqtype);
5926                 MemFree(prefix);
5927                 return;
5928             }
5929         }
5930 
5931         if(prefix != NULL)
5932         {
5933              MemFree(prefix);
5934              prefix = NULL;
5935         }
5936 
5937         ErrPostEx(SEV_ERROR, ERR_SOURCE_SubmitterSeqidDropped,
5938                   "Could not determine project code for what appears to be a WGS/TLS/TSA scaffold record. /submitter_seqid dropped.");
5939         return;
5940     }
5941 
5942     if((source == Parser::ESource::EMBL || source == Parser::ESource::DDBJ) && ibp->is_tsa)
5943     {
5944         ErrPostEx(SEV_ERROR, ERR_SOURCE_SubmitterSeqidIgnored,
5945                   "Submitter sequence identifiers for non-project-based TSA records are not supported. /submitter_seqid \"%s\" has been dropped.",
5946                   ibp->submitter_seqid);
5947         return;
5948     }
5949 
5950     ErrPostEx(SEV_REJECT, ERR_SOURCE_SubmitterSeqidNotAllowed,
5951               "Only WGS/TLS/TSA related records (contigs and scaffolds) are allowed to have /submitter_seqid qualifier. This \"%s\" is not one of them. Entry dropped.",
5952               ibp->acnum);
5953     ibp->drop = 1;
5954 }
5955 
/**********************************************************
 *
 *   void LoadFeat(pp, entry, bioseq):
 *
 *                                              5-4-93
 *
 **********************************************************/
LoadFeat(ParserPtr pp,DataBlkPtr entry,objects::CBioseq & bioseq)5963 void LoadFeat(ParserPtr pp, DataBlkPtr entry, objects::CBioseq& bioseq)
5964 {
5965     DataBlkPtr  dab;
5966     DataBlkPtr  dabnext;
5967     DataBlkPtr  dbp;
5968     DataBlkPtr  tdbp;
5969     FeatBlkPtr  fbp;
5970 
5971     IndexblkPtr ibp;
5972     Int4        col_data;
5973     Int2        type;
5974     Int4        i = 0;
5975     CRef<objects::CSeq_id> pat_seq_id;
5976 
5977     xinstall_gbparse_range_func(pp, flat2asn_range_func);
5978 
5979     ibp = pp->entrylist[pp->curindx];
5980 
5981     CRef<objects::CSeq_id> seq_id =
5982         MakeAccSeqId(ibp->acnum, pp->seqtype, pp->accver, ibp->vernum,
5983                      true, ibp->is_tpa);
5984     if(pp->source == Parser::ESource::USPTO)
5985     {
5986         pat_seq_id = new objects::CSeq_id;
5987         CRef<objects::CPatent_seq_id> pat_id = MakeUsptoPatSeqId(ibp->acnum);
5988         pat_seq_id->SetPatent(*pat_id);
5989     }
5990 
5991     if (!seq_id) {
5992         if (ibp->acnum && !NStr::IsBlank(ibp->acnum)) {
5993             seq_id = Ref(new CSeq_id(CSeq_id::e_Local, ibp->acnum));
5994         }
5995         else if (pp->mode == Parser::EMode::Relaxed) {
5996             seq_id = Ref(new CSeq_id(CSeq_id::e_Local, ibp->locusname));
5997         }
5998     }
5999 
6000     TSeqIdList ids;
6001     ids.push_back(seq_id);
6002 
6003     if(pp->format == Parser::EFormat::GenBank)
6004     {
6005         col_data = ParFlat_COL_DATA;
6006         type = ParFlat_FEATURES;
6007     }
6008     else if(pp->format == Parser::EFormat::XML)
6009     {
6010         col_data = 0;
6011         type = XML_FEATURES;
6012     }
6013     else
6014     {
6015         col_data = ParFlat_COL_DATA_EMBL;
6016         type = ParFlat_FH;
6017     }
6018 
6019     /* Find feature already isolated in a "block"
6020      * The key, location and qualifiers will be isolated to
6021      * a FeatBlk at the first step of ParseFeatureBlock, which
6022      * parses a single feature at a time.
6023      *                                          -Karl
6024      */
6025     if(pp->format == Parser::EFormat::XML)
6026         dab = XMLLoadFeatBlk(entry->offset, ibp->xip);
6027     else
6028         dab = TrackNodeType(entry, type);
6029     for(dbp = dab; dbp != NULL; dbp = dbp->next)
6030     {
6031         if(dbp->type != type)
6032             continue;
6033 
6034         /* Parsing each feature subblock to FeatBlkPtr, fbp
6035          * it also checks semantics of qualifiers and keys
6036          */
6037         if(pp->format == Parser::EFormat::XML)
6038             XMLParseFeatureBlock(pp->debug, (DataBlkPtr) dbp->data, pp->source);
6039         else
6040             ParseFeatureBlock(ibp, pp->debug, (DataBlkPtr) dbp->data, pp->source, pp->format);
6041 
6042         dbp->data = (DataBlkPtr) fta_sort_features((DataBlkPtr) dbp->data, false);
6043         fta_check_pseudogene_qual((DataBlkPtr) dbp->data);
6044         fta_check_old_locus_tags((DataBlkPtr) dbp->data, &ibp->drop);
6045         fta_check_compare_qual((DataBlkPtr) dbp->data, ibp->is_tpa);
6046         tdbp = (DataBlkPtr) dbp->data;
6047         for(i = 0; tdbp != NULL; i++, tdbp = tdbp->next)
6048             fta_remove_dup_quals((FeatBlkPtr) tdbp->data);
6049         fta_remove_dup_feats((DataBlkPtr) dbp->data);
6050         for(tdbp = (DataBlkPtr) dbp->data; tdbp != NULL; tdbp = tdbp->next)
6051             fta_check_rpt_unit_range((FeatBlkPtr) tdbp->data, ibp->bases);
6052         fta_check_multiple_locus_tag((DataBlkPtr) dbp->data, &ibp->drop);
6053         if(ibp->is_tpa || ibp->is_tsa || ibp->is_tls)
6054             fta_check_non_tpa_tsa_tls_locations((DataBlkPtr) dbp->data, ibp);
6055         fta_check_replace_regulatory((DataBlkPtr) dbp->data, &ibp->drop);
6056         dbp->data = fta_sort_features((DataBlkPtr) dbp->data, true);
6057     }
6058 
6059     if(i > 1 && ibp->is_mga)
6060     {
6061         ErrPostEx(SEV_REJECT, ERR_FEATURE_MoreThanOneCAGEFeat,
6062                   "CAGE records are allowed to have only one feature, and it must be the \"source\" one. Entry dropped.");
6063         ibp->drop = 1;
6064     }
6065 
6066     if(ibp->drop == 0)
6067         CollectGapFeats(entry, dab, pp, type);
6068 
6069     TSeqFeatList seq_feats;
6070     if(ibp->drop == 0)
6071         ParseSourceFeat(pp, dab, ids, type, bioseq, seq_feats);
6072 
6073     if (seq_feats.empty())
6074     {
6075         ibp->drop = 1;
6076         for(; dab != NULL; dab = dabnext)
6077         {
6078             dabnext = dab->next;
6079             FreeFeatBlk((DataBlkPtr) dab->data, pp->format);
6080             if(pp->format == Parser::EFormat::XML)
6081                 MemFree(dab);
6082         }
6083         xinstall_gbparse_range_func(NULL, NULL);
6084         return;
6085     }
6086 
6087     if(ibp->submitter_seqid != NULL)
6088         fta_create_wgs_seqid(bioseq, ibp, pp->source);
6089 
6090     objects::CSeq_descr::Tdata& descr_list = bioseq.SetDescr().Set();
6091     for (objects::CSeq_descr::Tdata::iterator descr = descr_list.begin(); descr != descr_list.end();)
6092     {
6093         if (!(*descr)->IsSource())
6094         {
6095             ++descr;
6096             continue;
6097         }
6098 
6099         descr = descr_list.erase(descr);
6100     }
6101 
6102     CRef<objects::CSeqdesc> descr_src(new objects::CSeqdesc);
6103     descr_src->SetSource(seq_feats.front()->SetData().SetBiosrc());
6104 
6105     descr_list.push_back(descr_src);
6106     seq_feats.pop_front();
6107 
6108     fta_get_gcode_from_biosource(descr_src->GetSource(), ibp);
6109 
6110     for(; dab != NULL; dab = dabnext)
6111     {
6112         dabnext = dab->next;
6113         if(dab->type != type)
6114         {
6115             if(pp->format == Parser::EFormat::XML)
6116                 MemFree(dab);
6117             continue;
6118         }
6119 
6120         for(dbp = (DataBlkPtr) dab->data; dbp != NULL; dbp = dbp->next)
6121         {
6122             if(dbp->drop == 1)
6123                 continue;
6124 
6125             fbp = (FeatBlkPtr) dbp->data;
6126             if(StringCmp(fbp->key, "source") == 0 ||
6127                StringCmp(fbp->key, "assembly_gap") == 0 ||
6128                (StringCmp(fbp->key, "gap") == 0 &&
6129                 pp->source != Parser::ESource::DDBJ && pp->source != Parser::ESource::EMBL))
6130                 continue;
6131 
6132             fta_sort_quals(fbp, pp->qamode);
6133             CRef<objects::CSeq_feat> feat = ProcFeatBlk(pp, fbp, ids);
6134             if (feat.Empty())
6135             {
6136                 if(StringCmp(fbp->key, "CDS") == 0)
6137                 {
6138                     ErrPostEx(SEV_ERROR, ERR_FEATURE_LocationParsing,
6139                               "CDS feature has unparsable location. Entry dropped. Location = [%s].",
6140                               fbp->location);
6141                     ibp->drop = 1;
6142                 }
6143                 continue;
6144             }
6145 
6146             if(StringCmp(fbp->key, "mobile_element") == 0 &&
6147                !fta_check_mobile_element(*feat))
6148             {
6149                 ibp->drop = 1;
6150                 continue;
6151             }
6152 
6153             fta_check_artificial_location(*feat, fbp->key);
6154 
6155             if(CheckForeignLoc(feat->GetLocation(),
6156                (pp->source == Parser::ESource::USPTO) ? *pat_seq_id : *seq_id))
6157             {
6158                 ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck,
6159                           "Location pointing outside the entry [%s]",
6160                           fbp->location);
6161 
6162                 if (feat->GetData().IsImp())
6163                 {
6164                     const objects::CImp_feat& imp_feat = feat->GetData().GetImp();
6165                     if (imp_feat.GetKey() == "intron" ||
6166                         imp_feat.GetKey() == "exon")
6167                     {
6168                         /* foreign introns and exons wouldn't be parsed
6169                          */
6170                         feat.Reset();
6171                         continue;
6172                     }
6173                 }
6174             }
6175 
6176             FilterDb_xref(*feat, pp->source);
6177 
6178             i = FTASeqLocCheck(feat->GetLocation(), ibp->acnum);
6179             if(i == 0)
6180             {
6181                 ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck,
6182                           fbp->location);
6183 
6184                 if(pp->debug)
6185                     seq_feats.push_back(feat);
6186                 else
6187                 {
6188                     feat.Reset();
6189                     continue;
6190                 }
6191             }
6192             else
6193             {
6194                 if(i == 1)
6195                 {
6196                     if (feat->IsSetExcept_text() && feat->GetExcept_text() == "trans-splicing")
6197                         ErrPostEx(SEV_INFO,
6198                                   ERR_LOCATION_TransSpliceMixedStrand,
6199                                   "Mixed strands in SeqLoc of /trans_splicing feature: %s",
6200                                   fbp->location);
6201                     else
6202                         ErrPostEx(SEV_WARNING, ERR_LOCATION_MixedStrand,
6203                                   "Mixed strands in SeqLoc: %s", fbp->location);
6204                 }
6205 
6206                 seq_feats.push_back(feat);
6207             }
6208         }
6209         FreeFeatBlk((DataBlkPtr) dab->data, pp->format);
6210         if(pp->format == Parser::EFormat::XML)
6211             MemFree(dab);
6212     }
6213 
6214     if (!fta_perform_operon_checks(pp, seq_feats, ibp))
6215     {
6216         ibp->drop = 1;
6217         seq_feats.clear();
6218         xinstall_gbparse_range_func(NULL, NULL);
6219         return;
6220     }
6221 
6222     bool stop = false;
6223     NON_CONST_ITERATE(TSeqFeatList, feat, seq_feats)
6224     {
6225         if (!(*feat)->GetData().IsImp())
6226             continue;
6227 
6228         const objects::CImp_feat& imp_feat = (*feat)->GetData().GetImp();
6229 
6230         if (imp_feat.IsSetKey() &&
6231             StringStr(imp_feat.GetKey().c_str(), "RNA") != NULL)
6232         {
6233             if (imp_feat.GetKey() == "ncRNA" && !fta_check_ncrna(*(*feat)))
6234             {
6235                 stop = true;
6236                 break;
6237             }
6238 
6239             GetRnaRef(*(*feat), bioseq, pp->source, pp->accver);
6240         }
6241     }
6242 
6243     if (stop)
6244     {
6245         ibp->drop = 1;
6246         seq_feats.clear();
6247         xinstall_gbparse_range_func(NULL, NULL);
6248         return;
6249     }
6250 
6251     SeqFeatPub(pp, entry, seq_feats, ids, col_data, ibp);
6252     if (seq_feats.empty() && ibp->drop != 0)
6253     {
6254         xinstall_gbparse_range_func(NULL, NULL);
6255         return;
6256     }
6257 
6258     /* ImpFeatPub() call will be removed in asn 4.0
6259      */
6260     ImpFeatPub(pp, entry, seq_feats, *seq_id, col_data, ibp);
6261 
6262     xinstall_gbparse_range_func(NULL, NULL);
6263     if (seq_feats.empty())
6264         return;
6265 
6266     CRef<objects::CSeq_annot> annot(new objects::CSeq_annot);
6267     annot->SetData().SetFtable().swap(seq_feats);
6268 
6269     bioseq.SetAnnot().push_back(annot);
6270 }
6271 
6272 /**********************************************************/
GetBiomolFromToks(char * mRNA,char * tRNA,char * rRNA,char * snRNA,char * scRNA,char * uRNA,char * snoRNA)6273 static Uint1 GetBiomolFromToks(char* mRNA, char* tRNA, char* rRNA,
6274                                char* snRNA, char* scRNA, char* uRNA,
6275                                char* snoRNA)
6276 {
6277     char* p = NULL;
6278 
6279     if(mRNA != NULL)
6280         p = mRNA;
6281     if(p == NULL || (tRNA != NULL && tRNA < p))
6282         p = tRNA;
6283     if(p == NULL || (rRNA != NULL && rRNA < p))
6284         p = rRNA;
6285     if(p == NULL || (snRNA != NULL && snRNA < p))
6286         p = snRNA;
6287     if(p == NULL || (scRNA != NULL && scRNA < p))
6288         p = scRNA;
6289     if(p == NULL || (uRNA != NULL && uRNA < p))
6290         p = uRNA;
6291     if(p == NULL || (snoRNA != NULL && snoRNA < p))
6292         p = snoRNA;
6293 
6294     if(p == mRNA)
6295         return(Seq_descr_GIBB_mol_mRNA);
6296     if(p == tRNA)
6297         return(Seq_descr_GIBB_mol_tRNA);
6298     if(p == rRNA)
6299         return(Seq_descr_GIBB_mol_rRNA);
6300     if(p == snRNA || p == uRNA)
6301         return(Seq_descr_GIBB_mol_snRNA);
6302     if(p == snoRNA)
6303         return(Seq_descr_GIBB_mol_snoRNA);
6304     return(Seq_descr_GIBB_mol_scRNA);
6305 }
6306 
6307 /**********************************************************/
GetFlatBiomol(int & biomol,Uint1 tech,char * molstr,ParserPtr pp,DataBlkPtr entry,const objects::COrg_ref * org_ref)6308 void GetFlatBiomol(int& biomol, Uint1 tech, char* molstr, ParserPtr pp,
6309                    DataBlkPtr entry, const objects::COrg_ref* org_ref)
6310 {
6311     Int4        genomic;
6312     char*     offset;
6313     Char        c;
6314     DataBlkPtr  dbp;
6315 
6316     Int2        count;
6317     Int2        i;
6318     EntryBlkPtr ebp;
6319     IndexblkPtr ibp;
6320     const char  *p;
6321 
6322     char*     q;
6323     char*     r;
6324     char*     mRNA = NULL;
6325     char*     tRNA = NULL;
6326     char*     rRNA = NULL;
6327     char*     snRNA = NULL;
6328     char*     scRNA = NULL;
6329     char*     uRNA = NULL;
6330     char*     snoRNA = NULL;
6331     bool        stage;
6332     bool        techok;
6333     bool        same;
6334     bool        is_syn;
6335 
6336     ebp = (EntryBlkPtr) entry->data;
6337 
6338     objects::CBioseq& bioseq = ebp->seq_entry->SetSeq();
6339     ibp = pp->entrylist[pp->curindx];
6340 
6341     if(ibp->is_prot)
6342     {
6343         bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_aa);
6344         biomol = 8;
6345         return;
6346     }
6347 
6348     if(StringCmp(ibp->division, "SYN") == 0 ||
6349        (org_ref != NULL && org_ref->IsSetOrgname() && org_ref->GetOrgname().IsSetDiv() &&
6350        org_ref->GetOrgname().GetDiv() == "SYN"))
6351         is_syn = true;
6352     else
6353         is_syn = false;
6354 
6355     r = NULL;
6356     c = '\0';
6357     if(ibp->moltype != NULL)
6358     {
6359         if(pp->source == Parser::ESource::DDBJ && StringNICmp(molstr, "PRT", 3) == 0)
6360             return;
6361 
6362         biomol = Seq_descr_GIBB_mol_genomic;
6363         bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_dna);
6364 
6365         if(molstr != NULL)
6366         {
6367             q = molstr;
6368             r = molstr;
6369             if(pp->format == Parser::EFormat::EMBL || pp->format == Parser::EFormat::XML)
6370                 while(*r != ';' && *r != '\n' && *r != '\0')
6371                     r++;
6372             else
6373             {
6374                 while(*r != ';' && *r != ' ' && *r != '\t' && *r != '\n' &&
6375                       *r != '\0')
6376                     r++;
6377                 if(r - molstr > 10)
6378                     r = molstr + 10;
6379             }
6380             c = *r;
6381             *r = '\0';
6382             if(q == r)
6383                 q = (char*) "???";
6384         }
6385         else
6386             q = (char*) "???";
6387 
6388         same = true;
6389         if(StringCmp(ibp->moltype, "genomic DNA") == 0)
6390         {
6391             biomol = Seq_descr_GIBB_mol_genomic;
6392             bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_dna);
6393 
6394             if(pp->source == Parser::ESource::EMBL)
6395             {
6396                 if(StringICmp(q, "DNA") != 0 &&
6397                    StringICmp(ibp->moltype, q) != 0)
6398                     same = false;
6399             }
6400             else if(StringICmp(q, "DNA") != 0)
6401                 same = false;
6402         }
6403         else if(StringCmp(ibp->moltype, "genomic RNA") == 0)
6404         {
6405             biomol = Seq_descr_GIBB_mol_genomic;
6406             bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6407 
6408             if (pp->source == Parser::ESource::EMBL)
6409             {
6410                 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6411                     same = false;
6412             }
6413             else if(StringICmp(q, "RNA") != 0)
6414                 same = false;
6415         }
6416         else if(StringCmp(ibp->moltype, "mRNA") == 0)
6417         {
6418             biomol = Seq_descr_GIBB_mol_mRNA;
6419             bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6420 
6421             if(pp->source == Parser::ESource::EMBL)
6422             {
6423                 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6424                     same = false;
6425             }
6426             else if(StringICmp(q, "mRNA") != 0)
6427                 same = false;
6428         }
6429         else if(StringCmp(ibp->moltype, "tRNA") == 0)
6430         {
6431             biomol = Seq_descr_GIBB_mol_tRNA;
6432             bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6433 
6434             if(pp->source == Parser::ESource::EMBL)
6435             {
6436                 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6437                     same = false;
6438             }
6439             else if(StringICmp(q, "tRNA") != 0)
6440                 same = false;
6441         }
6442         else if(StringCmp(ibp->moltype, "rRNA") == 0)
6443         {
6444             biomol = Seq_descr_GIBB_mol_rRNA;
6445             bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6446 
6447             if(pp->source == Parser::ESource::EMBL)
6448             {
6449                 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6450                     same = false;
6451             }
6452             else if(StringICmp(q, "rRNA") != 0)
6453                 same = false;
6454         }
6455         else if(StringCmp(ibp->moltype, "snoRNA") == 0)
6456         {
6457             biomol = Seq_descr_GIBB_mol_snoRNA;
6458             bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6459 
6460             if(pp->source == Parser::ESource::EMBL)
6461             {
6462                 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6463                     same = false;
6464             }
6465             else if(StringICmp(q, "snoRNA") != 0)
6466                 same = false;
6467         }
6468         else if(StringCmp(ibp->moltype, "snRNA") == 0)
6469         {
6470             biomol = Seq_descr_GIBB_mol_snRNA;
6471             bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6472 
6473             if(pp->source == Parser::ESource::EMBL)
6474             {
6475                 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6476                     same = false;
6477             }
6478             else if(StringICmp(q, "snRNA") != 0)
6479                 same = false;
6480         }
6481         else if(StringCmp(ibp->moltype, "scRNA") == 0)
6482         {
6483             biomol = Seq_descr_GIBB_mol_scRNA;
6484             bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6485 
6486             if(pp->source == Parser::ESource::EMBL)
6487             {
6488                 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6489                     same = false;
6490             }
6491             else if(StringICmp(q, "scRNA") != 0)
6492                 same = false;
6493         }
6494         else if(StringCmp(ibp->moltype, "pre-RNA") == 0)
6495         {
6496             biomol = Seq_descr_GIBB_mol_preRNA;
6497             bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6498 
6499             if(pp->source == Parser::ESource::EMBL)
6500             {
6501                 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6502                     same = false;
6503             }
6504             else if(StringICmp(q, "RNA") != 0)
6505                 same = false;
6506         }
6507         else if(StringCmp(ibp->moltype, "pre-mRNA") == 0)
6508         {
6509             biomol = Seq_descr_GIBB_mol_preRNA;
6510             bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6511 
6512             if(pp->source == Parser::ESource::EMBL)
6513             {
6514                 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6515                     same = false;
6516             }
6517             else if(StringICmp(q, "RNA") != 0)
6518                 same = false;
6519         }
6520         else if(StringCmp(ibp->moltype, "other RNA") == 0)
6521         {
6522             if(is_syn)
6523                 biomol = Seq_descr_GIBB_mol_other_genetic;
6524             else
6525                 biomol = Seq_descr_GIBB_mol_other;
6526             bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6527 
6528             if (pp->source == Parser::ESource::EMBL)
6529             {
6530                 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6531                     same = false;
6532             }
6533             else if(StringICmp(q, "RNA") != 0)
6534                 same = false;
6535         }
6536         else if(StringCmp(ibp->moltype, "other DNA") == 0)
6537         {
6538             if(is_syn)
6539                 biomol = Seq_descr_GIBB_mol_other_genetic;
6540             else
6541                 biomol = Seq_descr_GIBB_mol_other;
6542             bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_dna);
6543 
6544             if (pp->source == Parser::ESource::EMBL)
6545             {
6546                 if(StringICmp(q, "DNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6547                     same = false;
6548             }
6549             else if(StringICmp(q, "DNA") != 0)
6550                 same = false;
6551         }
6552         else if(StringCmp(ibp->moltype, "unassigned RNA") == 0)
6553         {
6554             if(is_syn)
6555                 biomol = Seq_descr_GIBB_mol_other_genetic;
6556             else
6557                 biomol = Seq_descr_GIBB_mol_unknown;
6558             bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6559 
6560             if (pp->source == Parser::ESource::EMBL)
6561             {
6562                 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6563                     same = false;
6564             }
6565             else if(StringICmp(q, "RNA") != 0)
6566                 same = false;
6567         }
6568         else if(StringCmp(ibp->moltype, "unassigned DNA") == 0)
6569         {
6570             if(is_syn)
6571                 biomol = Seq_descr_GIBB_mol_other_genetic;
6572             else
6573                 biomol = Seq_descr_GIBB_mol_unknown;
6574             bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_dna);
6575 
6576             if (pp->source == Parser::ESource::EMBL)
6577             {
6578                 if(StringICmp(q, "DNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6579                     same = false;
6580             }
6581             else if(StringICmp(q, "DNA") != 0)
6582                 same = false;
6583         }
6584         else if(StringCmp(ibp->moltype, "viral cRNA") == 0)
6585         {
6586             biomol = Seq_descr_GIBB_mol_cRNA;
6587             bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6588 
6589             if (pp->source == Parser::ESource::EMBL)
6590             {
6591                 if(StringICmp(q, "RNA") != 0 &&
6592                    StringICmp(q, "cRNA") != 0 &&
6593                    StringICmp(ibp->moltype, q) != 0)
6594                     same = false;
6595             }
6596             else if(StringICmp(q, "cRNA") != 0)
6597                 same = false;
6598         }
6599         else if(StringCmp(ibp->moltype, "transcribed RNA") == 0)
6600         {
6601             biomol = Seq_descr_GIBB_mol_trRNA;
6602             bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6603 
6604             if (pp->source == Parser::ESource::EMBL)
6605             {
6606                 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6607                     same = false;
6608             }
6609             else if(StringICmp(q, "RNA") != 0)
6610                 same = false;
6611         }
6612         else
6613         {
6614             ErrPostEx(SEV_REJECT, ERR_SOURCE_InvalidMolType,
6615                       "Invalid /mol_type value \"%s\" provided in source features. Entry dropped.",
6616                       ibp->moltype);
6617             ibp->drop = 1;
6618             if(molstr != NULL)
6619                 *r = c;
6620             return;
6621         }
6622 
6623         if(!same)
6624         {
6625             if(ibp->embl_new_ID)
6626             {
6627                 ErrPostEx(SEV_REJECT, ERR_SOURCE_MolTypesDisagree,
6628                           "Molecule type \"%s\" from the ID line disagrees with \"%s\" from the /mol_type qualifier.",
6629                           q, ibp->moltype);
6630                 ibp->drop = 1;
6631                 if(molstr != NULL)
6632                     *r = c;
6633                 return;
6634             }
6635             ErrPostEx(SEV_ERROR, ERR_SOURCE_MolTypesDisagree,
6636                       "Molecule type \"%s\" from the ID/LOCUS line disagrees with \"%s\" from the /mol_type qualifier.",
6637                       q, ibp->moltype);
6638         }
6639 
6640         if ((tech == objects::CMolInfo::eTech_sts || tech == objects::CMolInfo::eTech_htgs_0 ||
6641             tech == objects::CMolInfo::eTech_htgs_1 || tech == objects::CMolInfo::eTech_htgs_2 ||
6642             tech == objects::CMolInfo::eTech_htgs_3 || tech == objects::CMolInfo::eTech_wgs ||
6643             tech == objects::CMolInfo::eTech_survey) &&
6644            StringCmp(ibp->moltype, "genomic DNA") != 0)
6645             techok = false;
6646         else if ((tech == objects::CMolInfo::eTech_est || tech == objects::CMolInfo::eTech_fli_cdna ||
6647             tech == objects::CMolInfo::eTech_htc) && StringCmp(ibp->moltype, "mRNA") != 0)
6648             techok = false;
6649         else
6650             techok = true;
6651 
6652         if(!techok)
6653         {
6654             if(tech == objects::CMolInfo::eTech_est)
6655                 p = "EST";
6656             else if(tech == objects::CMolInfo::eTech_fli_cdna)
6657                 p = "fli-cDNA";
6658             else if(tech == objects::CMolInfo::eTech_htc)
6659                 p = "HTC";
6660             else if(tech == objects::CMolInfo::eTech_sts)
6661                 p = "STS";
6662             else if(tech == objects::CMolInfo::eTech_wgs)
6663                 p = "WGS";
6664             else if(tech == objects::CMolInfo::eTech_tsa)
6665                 p = "TSA";
6666             else if(tech == objects::CMolInfo::eTech_targeted)
6667                 p = "TLS";
6668             else if(tech == objects::CMolInfo::eTech_survey)
6669                 p = "GSS";
6670             else
6671                 p = "HTG";
6672             ErrPostEx(SEV_ERROR, ERR_SOURCE_MolTypeSeqTypeConflict,
6673                       "Molecule type \"%s\" from the /mol_type qualifier disagrees with this record's sequence type: \"%s\".",
6674                       ibp->moltype, p);
6675         }
6676 
6677         if(molstr != NULL)
6678             *r = c;
6679         return;
6680     }
6681 
6682     if(tech == objects::CMolInfo::eTech_est)
6683     {
6684         biomol = Seq_descr_GIBB_mol_mRNA;
6685         bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6686         return;
6687     }
6688 
6689     if(pp->source == Parser::ESource::DDBJ || pp->source == Parser::ESource::LANL ||
6690        pp->source == Parser::ESource::NCBI)
6691     {
6692         biomol = Seq_descr_GIBB_mol_genomic;
6693         bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_dna);
6694     }
6695     else
6696     {
6697         biomol = Unknown;
6698         bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_na);
6699     }
6700 
6701     if(molstr == NULL)
6702         genomic = -1;
6703     else
6704     {
6705         genomic = CheckNA(molstr);
6706         if(genomic < 0 && pp->source == Parser::ESource::DDBJ)
6707             genomic = CheckNADDBJ(molstr);
6708     }
6709 
6710     if(genomic < 0 || genomic > 20)
6711     {
6712         if(pp->source == Parser::ESource::EMBL && StringNICmp(molstr, "XXX", 3) == 0)
6713             return;
6714         if(pp->source == Parser::ESource::DDBJ && StringNICmp(molstr, "PRT", 3) == 0)
6715             return;
6716         ibp->drop = 1;
6717         q = molstr;
6718         c = '\0';
6719         if(q != NULL)
6720         {
6721             if(pp->format == Parser::EFormat::EMBL)
6722                 while(*q != ';' && *q != '\n' && *q != '\0')
6723                     q++;
6724             else
6725             {
6726                 while(*q != ';' && *q != ' ' && *q != '\t' && *q != '\n' &&
6727                       *q != '\0')
6728                     q++;
6729                 if(q - molstr > 10)
6730                     q = molstr + 10;
6731             }
6732 
6733             c = *q;
6734             *q = '\0';
6735         }
6736         if(pp->source == Parser::ESource::DDBJ)
6737             p = "DDBJ";
6738         else if(pp->source == Parser::ESource::EMBL)
6739             p = "EMBL";
6740         else if(pp->source == Parser::ESource::LANL)
6741             p = "LANL";
6742         else
6743             p = "NCBI";
6744 
6745         ErrPostEx(SEV_FATAL, ERR_FORMAT_InvalidMolType,
6746                   "Molecule type \"%s\" from LOCUS/ID line is not legal value for records from source \"%s\". Sequence rejected.",
6747                   (molstr == NULL) ? "???" : molstr, p);
6748         if(q != NULL)
6749             *q = c;
6750         return;
6751     }
6752 
6753     if(genomic < 2)
6754         bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_na);
6755     else if(genomic > 1 && genomic < 6)
6756         bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_dna);
6757     else
6758         bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6759 
6760     if(genomic != 6)                    /* Not just RNA */
6761     {
6762         if(genomic < 2)                 /* "   ", "NA" or "cDNA" */
6763             biomol = Seq_descr_GIBB_mol_genomic;
6764         else if(genomic == 2)                   /* DNA */
6765             biomol = Seq_descr_GIBB_mol_genomic;
6766         else if(genomic == 3)                   /* genomic DNA */
6767             biomol = Seq_descr_GIBB_mol_genomic;
6768         else if(genomic == 4)                   /* other DNA */
6769         {
6770             if(is_syn)
6771                 biomol = Seq_descr_GIBB_mol_other_genetic;
6772             else
6773                 biomol = Seq_descr_GIBB_mol_other;
6774         }
6775         else if(genomic == 5)                   /* unassigned DNA */
6776         {
6777             if(is_syn)
6778                 biomol = Seq_descr_GIBB_mol_other_genetic;
6779             else
6780                 biomol = Seq_descr_GIBB_mol_unknown;
6781         }
6782         else if(genomic == 7)                   /* mRNA */
6783             biomol = Seq_descr_GIBB_mol_mRNA;
6784         else if(genomic == 8)                   /* rRNA */
6785             biomol = Seq_descr_GIBB_mol_rRNA;
6786         else if(genomic == 9)                   /* tRNA */
6787             biomol = Seq_descr_GIBB_mol_tRNA;
6788         else if(genomic == 10 || genomic == 12) /* uRNA -> snRNA */
6789             biomol = Seq_descr_GIBB_mol_snRNA;
6790         else if(genomic == 11)                  /* scRNA */
6791             biomol = Seq_descr_GIBB_mol_scRNA;
6792         else if(genomic == 13)                  /* snoRNA */
6793             biomol = Seq_descr_GIBB_mol_snoRNA;
6794         else if(genomic == 14)                  /* pre-RNA */
6795             biomol = Seq_descr_GIBB_mol_preRNA;
6796         else if(genomic == 15)                  /* pre-mRNA */
6797             biomol = Seq_descr_GIBB_mol_preRNA;
6798         else if(genomic == 16)                  /* genomic RNA */
6799             biomol = Seq_descr_GIBB_mol_genomic;
6800         else if(genomic == 17)                  /* other RNA */
6801         {
6802             if(is_syn)
6803                 biomol = Seq_descr_GIBB_mol_other_genetic;
6804             else
6805                 biomol = Seq_descr_GIBB_mol_other;
6806          }
6807         else if(genomic == 18)                  /* unassigned RNA */
6808         {
6809             if(is_syn)
6810                 biomol = Seq_descr_GIBB_mol_other_genetic;
6811             else
6812                 biomol = Seq_descr_GIBB_mol_unknown;
6813         }
6814         else if(genomic == 19 || genomic == 20) /* cRNA or viral cRNA */
6815             biomol = Seq_descr_GIBB_mol_cRNA;
6816         return;
6817     }
6818 
6819     /* Here goes most complicated case with just RNA
6820      */
6821     const Char* div = NULL;
6822     if (org_ref != NULL && org_ref->IsSetOrgname() && org_ref->GetOrgname().IsSetDiv())
6823         div = org_ref->GetOrgname().GetDiv().c_str();
6824 
6825     if(pp->source != Parser::ESource::EMBL || pp->format != Parser::EFormat::EMBL)
6826     {
6827         biomol = Seq_descr_GIBB_mol_genomic;
6828         if (div == NULL || StringNCmp(div, "VRL", 3) != 0)
6829         {
6830             ErrPostEx(SEV_ERROR, ERR_LOCUS_NonViralRNAMoltype,
6831                       "Genomic RNA implied by presence of RNA moltype, but sequence is non-viral.");
6832         }
6833         return;
6834     }
6835 
6836     count = 0;
6837     size_t len = 0;
6838     offset = SrchNodeType(entry, ParFlat_DE, &len);
6839     if(offset != NULL)
6840     {
6841         c = offset[len];
6842         offset[len] = '\0';
6843         mRNA = StringStr(offset, "mRNA");
6844         tRNA = StringStr(offset, "tRNA");
6845         rRNA = StringStr(offset, "rRNA");
6846         snRNA = StringStr(offset, "snRNA");
6847         scRNA = StringStr(offset, "scRNA");
6848         uRNA = StringStr(offset, "uRNA");
6849         snoRNA = StringStr(offset, "snoRNA");
6850         if(mRNA != NULL)
6851             count++;
6852         if(tRNA != NULL)
6853             count++;
6854         if(rRNA != NULL)
6855             count++;
6856         if(snRNA != NULL || uRNA != NULL)
6857             count++;
6858         if(scRNA != NULL)
6859             count++;
6860         if(snoRNA != NULL)
6861             count++;
6862         offset[len] = c;
6863     }
6864 
6865     /* Non-viral division
6866      */
6867     if (div == NULL || StringNCmp(div, "VRL", 3) != 0)
6868     {
6869         biomol = Seq_descr_GIBB_mol_mRNA;
6870 
6871         if(count > 1)
6872         {
6873             ErrPostEx(SEV_WARNING, ERR_DEFINITION_DifferingRnaTokens,
6874                       "More than one of mRNA, tRNA, rRNA, snRNA (uRNA), scRNA, snoRNA present in defline.");
6875         }
6876 
6877         if(tRNA != NULL)
6878         {
6879             for(p = tRNA + 4; *p == ' ' || *p == '\t';)
6880                 p++;
6881             if(*p == '\n')
6882             {
6883                 p++;
6884                 if(StringNCmp(p, "DE   ", 5) == 0)
6885                     p += 5;
6886             }
6887             if(StringNICmp(p, "Synthetase", 10) == 0)
6888                 return;
6889         }
6890 
6891         if(count > 0)
6892             biomol = GetBiomolFromToks(mRNA, tRNA, rRNA, snRNA, scRNA, uRNA,
6893                                        snoRNA);
6894         return;
6895     }
6896 
6897     /* Viral division
6898      */
6899     if (org_ref != NULL && org_ref->IsSetOrgname() && org_ref->GetOrgname().IsSetLineage() &&
6900         StringIStr(org_ref->GetOrgname().GetLineage().c_str(), "no DNA stage") != NULL)
6901          stage = true;
6902     else
6903          stage = false;
6904 
6905     dbp = TrackNodeType(entry, ParFlat_FH);
6906     if(dbp == NULL)
6907         return;
6908     dbp = (DataBlkPtr) dbp->data;
6909     for(i = 0; dbp != NULL && i < 2; dbp = dbp->next)
6910     {
6911         if(dbp->offset == NULL)
6912             continue;
6913         offset = dbp->offset + ParFlat_COL_FEATKEY;
6914         if(StringNCmp(offset, "CDS", 3) == 0)
6915             i++;
6916     }
6917     if(i > 1)
6918     {
6919         biomol = Seq_descr_GIBB_mol_genomic;
6920         if(!stage)
6921         {
6922             ErrPostEx(SEV_WARNING, ERR_SOURCE_GenomicViralRnaAssumed,
6923                       "This sequence is assumed to be genomic due to multiple coding region but lack of a DNA stage is not indicated in taxonomic lineage.");
6924         }
6925         return;
6926     }
6927 
6928     if(count == 0)
6929     {
6930         biomol = Seq_descr_GIBB_mol_genomic;
6931         if(!stage)
6932         {
6933             ErrPostEx(SEV_ERROR, ERR_SOURCE_UnclassifiedViralRna,
6934                       "Cannot determine viral molecule type (genomic vs a specific type of RNA) based on definition line, CDS content, or taxonomic lineage. So this sequence has been classified as genomic by default (perhaps in error).");
6935         }
6936         else
6937         {
6938             ErrPostEx(SEV_WARNING, ERR_SOURCE_LineageImpliesGenomicViralRna,
6939                       "This sequence lacks indication of specific RNA type in the definition line, but the taxonomic lineage mentions lack of a DNA stage, so it is classified as genomic.");
6940         }
6941         return;
6942     }
6943 
6944     if(count > 1)
6945     {
6946         ErrPostEx(SEV_WARNING, ERR_DEFINITION_DifferingRnaTokens,
6947                   "More than one of mRNA, tRNA, rRNA, snRNA (uRNA), scRNA, snoRNA present in defline.");
6948     }
6949 
6950     biomol = GetBiomolFromToks(mRNA, tRNA, rRNA, snRNA, scRNA, uRNA, snoRNA);
6951 }
6952 
6953 END_NCBI_SCOPE
6954