1 /* loadfeat.cpp
2 *
3 * ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================
26 *
 * File Name: loadfeat.cpp
28 *
29 * Author: Karl Sirotkin, Hsiu-Chuan Chen
30 *
31 * File Description:
32 * -----------------
33 * Parse features block to subblock.
34 * Process each subblock.
35 * Output each subblock.
36 * Free out subblock.
37 */
38 #include <ncbi_pch.hpp>
39
40 #include "ftacpp.hpp"
41
42 #include <objects/seqfeat/Seq_feat.hpp>
43 #include <objects/seqfeat/Imp_feat.hpp>
44 #include <objmgr/bioseq_handle.hpp>
45 #include <objmgr/scope.hpp>
46 #include <objects/seqloc/Seq_bond.hpp>
47 #include <objects/seqfeat/Org_ref.hpp>
48 #include <objects/general/Dbtag.hpp>
49 #include <objects/general/Object_id.hpp>
50 #include <objects/seqfeat/OrgName.hpp>
51 #include <objects/seqfeat/SubSource.hpp>
52 #include <objects/seq/Seq_descr.hpp>
53 #include <objects/seqfeat/RNA_ref.hpp>
54 #include <objects/seqfeat/RNA_gen.hpp>
55 #include <objects/seqfeat/RNA_qual_set.hpp>
56 #include <objects/seqfeat/RNA_qual.hpp>
57 #include <objects/seqfeat/Trna_ext.hpp>
58 #include <objects/pub/Pub_set.hpp>
59 #include <objects/pub/Pub.hpp>
60 #include <serial/objostr.hpp>
61 #include <objmgr/util/seq_loc_util.hpp>
62 #include <objects/seq/seq_loc_from_string.hpp>
63 #include <objects/seq/Pubdesc.hpp>
64 #include <objects/seqfeat/BioSource.hpp>
65 #include <objects/seqfeat/SeqFeatData.hpp>
66 #include <objects/seq/MolInfo.hpp>
67 #include <objects/seq/Seq_inst.hpp>
68 #include <objects/seq/Seq_ext.hpp>
69 #include <objects/seq/Delta_ext.hpp>
70 #include <objects/seq/Delta_seq.hpp>
71
72 #include "index.h"
73 #include "embl.h"
74 #include "genbank.h"
75
76 #include <objtools/flatfile/flatfile_parser.hpp>
77 #include <objtools/flatfile/flatdefn.h>
78
79 #include "ftaerr.hpp"
80 #include "indx_blk.h"
81 #include "asci_blk.h"
82 #include "utilfeat.h"
83 #include "loadfeat.h"
84 #include "add.h"
85 #include "fta_src.h"
86 #include "buf_data_loader.h"
87 #include "utilfun.h"
88 #include "ref.h"
89 #include "xgbfeat.h"
90 #include "xgbparint.h"
91 #include "fta_xml.h"
92
93 #ifdef THIS_FILE
94 # undef THIS_FILE
95 #endif
96 #define THIS_FILE "loadfeat.cpp"
97
98 BEGIN_NCBI_SCOPE
99 USING_SCOPE(objects);
100
/* Numeric codes for the Seq_descr_GIBB_mol_* molecule types.
 * NOTE(review): uRNA and snRNA both map to 6 - presumably aliases of the
 * same code; values 8 and 10 have no name here. Confirm against the
 * ASN.1 specification.
 */
#define Seq_descr_GIBB_mol_unknown       0
#define Seq_descr_GIBB_mol_genomic       1
#define Seq_descr_GIBB_mol_preRNA        2
#define Seq_descr_GIBB_mol_mRNA          3
#define Seq_descr_GIBB_mol_rRNA          4
#define Seq_descr_GIBB_mol_tRNA          5
#define Seq_descr_GIBB_mol_uRNA          6
#define Seq_descr_GIBB_mol_snRNA         6
#define Seq_descr_GIBB_mol_scRNA         7
#define Seq_descr_GIBB_mol_other_genetic 9
#define Seq_descr_GIBB_mol_cRNA          11
#define Seq_descr_GIBB_mol_snoRNA        12
#define Seq_descr_GIBB_mol_trRNA         13
#define Seq_descr_GIBB_mol_other         255
115
/* Pairs an amino acid name with a one-letter code (see the taa table). */
typedef struct _trna_aa {
    const char *name;                   /* amino acid name */
    Uint1      aa;                      /* one-letter code */
} TrnaAa, *TrnaAaPtr;
120
/* Pairs a text value with an integer code (see GapTypeValues and
 * LinkageEvidenceValues).
 */
typedef struct _str_num {
    const char *str;                    /* text form */
    Int4       num;                     /* associated integer code */
} StrNum, *StrNumPtr;
125
/* Lower-case amino acid names with their one-letter codes. Some amino
 * acids are listed under two names (e.g. "aspartic acid" / "aspartate").
 * The table ends with a {NULL, '\0'} sentinel.
 */
TrnaAa taa[] = {
    {"alanine",        'A'},
    {"arginine",       'R'},
    {"asparagine",     'N'},
    {"aspartic acid",  'D'},
    {"aspartate",      'D'},
    {"cysteine",       'C'},
    {"glutamine",      'Q'},
    {"glutamic acid",  'E'},
    {"glutamate",      'E'},
    {"glycine",        'G'},
    {"histidine",      'H'},
    {"isoleucine",     'I'},
    {"leucine",        'L'},
    {"lysine",         'K'},
    {"methionine",     'M'},
    {"phenylalanine",  'F'},
    {"proline",        'P'},
    {"selenocysteine", 'U'},
    {"serine",         'S'},
    {"threonine",      'T'},
    {"tryptophan",     'W'},
    {"tyrosine",       'Y'},
    {"valine",         'V'},
    {NULL,             '\0'}
};
152
/* One row of the aacodons table: an amino acid and the codons listed
 * for it under a given genetic code.
 */
typedef struct _aa_codons {
    const char *straa;                  /* abbreviation, e.g. "Ala", "TERM" */
    Uint1      intaa;                   /* one-letter code */
    Uint1      gencode;                 /* genetic code number of the row;
                                           0 looks like "any other code" -
                                           TODO confirm against the lookup */
    Int4       vals[8];                 /* codon indices, padded with -1 */
} AaCodons, *AaCodonsPtr;
159
160 AaCodons aacodons[] = {
161 {"Ala", 'A', 0, {52, 53, 54, 55, -1, -1, -1, -1}}, /* GCT, GCC, GCA, GCG */
162 {"Arg", 'R', 2, {28, 29, 30, 31, -1, -1, -1, -1}}, /* CGT, CGC, CGA, CGG */
163 {"Arg", 'R', 5, {28, 29, 30, 31, -1, -1, -1, -1}}, /* CGT, CGC, CGA, CGG */
164 {"Arg", 'R', 9, {28, 29, 30, 31, -1, -1, -1, -1}}, /* CGT, CGC, CGA, CGG */
165 {"Arg", 'R', 13, {28, 29, 30, 31, -1, -1, -1, -1}}, /* CGT, CGC, CGA, CGG */
166 {"Arg", 'R', 14, {28, 29, 30, 31, -1, -1, -1, -1}}, /* CGT, CGC, CGA, CGG */
167 {"Arg", 'R', 0, {28, 29, 30, 31, 46, 47, -1, -1}}, /* CGT, CGC, CGA, CGG, AGA, AGG */
168 {"Asn", 'N', 9, {40, 41, 42, -1, -1, -1, -1, -1}}, /* AAT, AAC, AAA */
169 {"Asn", 'N', 14, {40, 41, 42, -1, -1, -1, -1, -1}}, /* AAT, AAC, AAA */
170 {"Asn", 'N', 0, {40, 41, -1, -1, -1, -1, -1, -1}}, /* AAT, AAC */
171 {"Asp", 'D', 0, {56, 57, -1, -1, -1, -1, -1, -1}}, /* GAT, GAC */
172 {"Asx", 'B', 9, {40, 41, 42, 56, 57, -1, -1, -1}}, /* Asn + Asp */
173 {"Asx", 'B', 14, {40, 41, 42, 56, 57, -1, -1, -1}}, /* Asn + Asp */
174 {"Asx", 'B', 0, {40, 41, 56, 57, -1, -1, -1, -1}}, /* Asn + Asp */
175 {"Cys", 'C', 10, {12, 13, 14, -1, -1, -1, -1, -1}}, /* TGT, TGC, TGA */
176 {"Cys", 'C', 0, {12, 13, -1, -1, -1, -1, -1, -1}}, /* TGT, TGC */
177 {"Gln", 'Q', 6, {10, 11, 26, 27, -1, -1, -1, -1}}, /* TAA, TAG, CAA, CAG */
178 {"Gln", 'Q', 15, {11, 26, 27, -1, -1, -1, -1, -1}}, /* TAG, CAA, CAG */
179 {"Gln", 'Q', 0, {26, 27, -1, -1, -1, -1, -1, -1}}, /* CAA, CAG */
180 {"Glu", 'E', 0, {58, 59, -1, -1, -1, -1, -1, -1}}, /* GAA, GAG */
181 {"Glx", 'Z', 6, {10, 11, 26, 27, 58, 59, -1, -1}}, /* Gln + Glu */
182 {"Glx", 'Z', 0, {11, 26, 27, 58, 59, -1, -1, -1}}, /* Gln + Glu */
183 {"Glx", 'Z', 0, {26, 27, 58, 59, -1, -1, -1, -1}}, /* Gln + Glu */
184 {"Gly", 'G', 13, {46, 47, 60, 61, 62, 63, -1, -1}}, /* AGA, AGG, GGT, GGC, GGA, GGG */
185 {"Gly", 'G', 0, {60, 61, 62, 63, -1, -1, -1, -1}}, /* GGT, GGC, GGA, GGG */
186 {"His", 'H', 0, {24, 25, -1, -1, -1, -1, -1, -1}}, /* CAT, CAC */
187 {"Ile", 'I', 2, {32, 33, -1, -1, -1, -1, -1, -1}}, /* ATT, ATC */
188 {"Ile", 'I', 3, {32, 33, -1, -1, -1, -1, -1, -1}}, /* ATT, ATC */
189 {"Ile", 'I', 5, {32, 33, -1, -1, -1, -1, -1, -1}}, /* ATT, ATC */
190 {"Ile", 'I', 13, {32, 33, -1, -1, -1, -1, -1, -1}}, /* ATT, ATC */
191 {"Ile", 'I', 0, {32, 33, 34, -1, -1, -1, -1, -1}}, /* ATT, ATC, ATA */
192 {"Leu", 'L', 3, { 2, 3, -1, -1, -1, -1, -1, -1}}, /* TTA, TTG */
193 {"Leu", 'L', 12, { 2, 3, 16, 17, 18, -1, -1, -1}}, /* TTA, TTG, CTT, CTC, CTA */
194 {"Leu", 'L', 0, { 2, 3, 16, 17, 18, 19, -1, -1}}, /* TTA, TTG, CTT, CTC, CTA, CTG */
195 {"Lys", 'K', 9, {43, -1, -1, -1, -1, -1, -1, -1}}, /* AAG */
196 {"Lys", 'K', 14, {43, -1, -1, -1, -1, -1, -1, -1}}, /* AAG */
197 {"Lys", 'K', 0, {42, 43, -1, -1, -1, -1, -1, -1}}, /* AAA, AAG */
198 {"Met", 'M', 2, {34, 35, -1, -1, -1, -1, -1, -1}}, /* ATA, ATG */
199 {"Met", 'M', 3, {34, 35, -1, -1, -1, -1, -1, -1}}, /* ATA, ATG */
200 {"Met", 'M', 5, {34, 35, -1, -1, -1, -1, -1, -1}}, /* ATA, ATG */
201 {"Met", 'M', 13, {34, 35, -1, -1, -1, -1, -1, -1}}, /* ATA, ATG */
202 {"Met", 'M', 0, {35, -1, -1, -1, -1, -1, -1, -1}}, /* ATG */
203 {"fMet", 'M', 2, {34, 35, -1, -1, -1, -1, -1, -1}}, /* ATA, ATG */
204 {"fMet", 'M', 3, {34, 35, -1, -1, -1, -1, -1, -1}}, /* ATA, ATG */
205 {"fMet", 'M', 5, {34, 35, -1, -1, -1, -1, -1, -1}}, /* ATA, ATG */
206 {"fMet", 'M', 13, {34, 35, -1, -1, -1, -1, -1, -1}}, /* ATA, ATG */
207 {"fMet", 'M', 0, {35, -1, -1, -1, -1, -1, -1, -1}}, /* ATG */
208 {"Phe", 'F', 0, { 0, 1, -1, -1, -1, -1, -1, -1}}, /* TTT, TTC */
209 {"Pro", 'P', 0, {20, 21, 22, 23, -1, -1, -1, -1}}, /* CCT, CCC, CCA, CCG */
210 {"Sec", 'U', 0, {-1, -1, -1, -1, -1, -1, -1, -1}},
211 {"Ser", 'S', 5, { 4, 5, 6, 7, 44, 45, 46, 47}}, /* TCT, TCC, TCA, TCG, AGT, AGC, AGA, AGG */
212 {"Ser", 'S', 9, { 4, 5, 6, 7, 44, 45, 46, 47}}, /* TCT, TCC, TCA, TCG, AGT, AGC, AGA, AGG */
213 {"Ser", 'S', 12, { 4, 5, 6, 7, 19, 44, 45, -1}}, /* TCT, TCC, TCA, TCG, CTG, AGT, AGC */
214 {"Ser", 'S', 14, { 4, 5, 6, 7, 44, 45, 46, 47}}, /* TCT, TCC, TCA, TCG, AGT, AGC, AGA, AGG */
215 {"Ser", 'S', 0, { 4, 5, 6, 7, 44, 45, -1, -1}}, /* TCT, TCC, TCA, TCG, AGT, AGC */
216 {"Thr", 'T', 3, {16, 17, 18, 19, 36, 37, 38, 39}}, /* CTT, CTC, CTA, CTG, ACT, ACC, ACA, ACG */
217 {"Thr", 'T', 0, {36, 37, 38, 39, -1, -1, -1, -1}}, /* ACT, ACC, ACA, ACG */
218 {"Trp", 'W', 1, {15, -1, -1, -1, -1, -1, -1, -1}}, /* TGG */
219 {"Trp", 'W', 6, {15, -1, -1, -1, -1, -1, -1, -1}}, /* TGG */
220 {"Trp", 'W', 10, {15, -1, -1, -1, -1, -1, -1, -1}}, /* TGG */
221 {"Trp", 'W', 11, {15, -1, -1, -1, -1, -1, -1, -1}}, /* TGG */
222 {"Trp", 'W', 12, {15, -1, -1, -1, -1, -1, -1, -1}}, /* TGG */
223 {"Trp", 'W', 15, {15, -1, -1, -1, -1, -1, -1, -1}}, /* TGG */
224 {"Trp", 'W', 0, {14, 15, -1, -1, -1, -1, -1, -1}}, /* TGA, TGG */
225 {"Tyr", 'Y', 14, { 8, 9, 10, -1, -1, -1, -1, -1}}, /* TAT, TAC, TAA */
226 {"Tyr", 'Y', 0, { 8, 9, -1, -1, -1, -1, -1, -1}}, /* TAT, TAC */
227 {"Val", 'V', 0, {48, 49, 50, 51, -1, -1, -1, -1}}, /* GTT, GTC, GTA, GTG */
228 {"TERM", '*', 1, {10, 11, 14, -1, -1, -1, -1, -1}}, /* TAA, TAG, TGA */
229 {"TERM", '*', 2, {10, 11, 46, 47, -1, -1, -1, -1}}, /* TAA, TAG, AGA, AGG */
230 {"TERM", '*', 6, {14, -1, -1, -1, -1, -1, -1, -1}}, /* TGA */
231 {"TERM", '*', 11, {10, 11, 14, -1, -1, -1, -1, -1}}, /* TAA, TAG, TGA */
232 {"TERM", '*', 12, {10, 11, 14, -1, -1, -1, -1, -1}}, /* TAA, TAG, TGA */
233 {"TERM", '*', 14, {11, -1, -1, -1, -1, -1, -1, -1}}, /* TAG */
234 {"TERM", '*', 15, {10, 14, -1, -1, -1, -1, -1, -1}}, /* TAA, TGA */
235 {"TERM", '*', 0, {10, 11, -1, -1, -1, -1, -1, -1}}, /* TAA, TAG */
236 {"OTHER", 'X', 0, {-1, -1, -1, -1, -1, -1, -1, -1}},
237 {NULL, '\0', 0, {-1, -1, -1, -1, -1, -1, -1, -1}}
238 };
239
/* Upper-case spellings of "transfer RNA" / "tRNA", including several
 * misspelled variants. NULL-terminated.
 * NOTE(review): the code that scans for these tags is outside this view.
 */
static const char *trna_tags[] = {
    "TRANSFERN RNA",
    "TRANSFER RRNA",
    "TRANSFER TRNA",
    "TRANSFER RNA",
    "TRASNFER RNA",
    "TRANSDER RNA",
    "TRANSFERRNA",
    "TRANFER RNA",
    "T RNA",
    "TRNA",
    NULL
};
253
/* NULL-terminated list of phrases.
 * NOTE(review): judging by the name these mark EST-like records; the
 * users of this table are outside this view - confirm.
 */
const char *ParFlat_ESTmod[] = {
    "EST",
    "expressed sequence tag",
    "partial cDNA sequence",
    "transcribed sequence fragment",
    "TSR",
    "putatively transcribed partial sequence",
    "UK putts",
    "Plastid",
    NULL
};
265
/* NULL-terminated list of RNA names (presumably feature keys - the users
 * of this table are outside this view).
 */
static const char *ParFlat_RNA_array[] = {
    "precursor_RNA",
    "mRNA",
    "tRNA",
    "rRNA",
    "snRNA",
    "scRNA",
    "snoRNA",
    "ncRNA",
    "tmRNA",
    "misc_RNA",
    NULL
};
279
/* /db_xref database names whose value may be numeric or textual.
 * DbxrefQualToDbtag stores the value as an integer when it is all digits
 * without a leading zero, and as a string otherwise. NULL-terminated.
 */
static const char *DbxrefTagAny[] = {
    "ASAP",
    "CDD",
    "DBEST",
    "DBSTS",
    "GDB",
    "HMP",
    "MAIZEGDB",
    NULL
};
290
/* Obsolete /db_xref database names. DbxrefQualToDbtag posts a warning for
 * them and replaces the name with its current equivalent. NULL-terminated.
 */
static const char *DbxrefObsolete[] = {
    "BHB",
    "BIOHEALTHBASE",
    "GENEW",
    "IFO",
    "SWISS-PROT",
    "SPTREMBL",
    "TREMBL",
    NULL
};
301
/* /db_xref database names with string values that DbxrefQualToDbtag
 * accepts only when the source is EMBL. NULL-terminated.
 */
static const char *EMBLDbxrefTagStr[] = {
    "BIOMUTA",
    "DEPOD",
    "ENSEMBLGENOMES-GN",
    "ENSEMBLGENOMES-TR",
    "ESTHER",
    "GENEVISIBLE",
    "MOONPROT",
    "PROTEOMES",
    "UNITE",
    "WBPARASITE",
    NULL
};
315
/* /db_xref database names whose value is expected to be a string.
 * DbxrefQualToDbtag warns when such a value consists of digits only.
 * NULL-terminated.
 */
static const char *DbxrefTagStr[] = {
    "ACEVIEW/WORMGENES",
    "APHIDBASE",
    "APIDB",
    "ARAPORT",
    "BEEBASE",
    "BEETLEBASE",
    "BGD",
    "BOLD",
    "CGD",
    "COLLECTF",
    "DBSNP",
    "DICTYBASE",
    "ECOCYC",
    "ECOGENE",
    "ENSEMBL",
    "ENSEMBLGENOMES",
    "ERIC",
    "FANTOM_DB",
    "FLYBASE",
    "GABI",
    "GENEDB",
    "GOA",
    "H-INVDB",
    "HGNC",
    "HOMD",
    "HSSP",
    "I5KNAL",
    "IMGT/GENE-DB",
    "IMGT/HLA",
    "IMGT/LIGM",
    "INTERPRO",
    "IRD",
    "ISD",
    "ISFINDER",
    "ISHAM-ITS",
    "JGIDB",
    "MARPOLBASE",
    "MEDGEN",
    "MGI",
    "MIRBASE",
    "NEXTDB",
    "NIAEST",
    "NMPDR",
    "NRESTDB",
    "OSA1",
    "PATHEMA",
    "PDB",
    "PFAM",
    "PGN",
    "PHYTOZOME",
    "PIR",
    "POMBASE",
    "PSEUDO",
    "PSEUDOCAP",
    "RAP-DB",
    "REMTREMBL",
    "RFAM",
    "RICEGENES",
    "RZPD",
    "SEED",
    "SGD",
    "SGN",
    "SPTREMBL",
    "SRPDB",
    "SUBTILIST",
    "SWISS-PROT",
    "TAIR",
    "TIGRFAM",
    "TREMBL",
    "TUBERCULIST",
    "UNIPROT/SWISS-PROT",
    "UNIPROT/TREMBL",
    "UNIPROTKB/SWISS-PROT",
    "UNIPROTKB/TREMBL",
    "UNITE",
    "VBASE2",
    "VECTORBASE",
    "VGNC",
    "VIPR",
    "VISTA",
    "WORFDB",
    "WORMBASE",
    "XENBASE",
    "ZFIN",
    NULL
};
403
/* /db_xref database names whose value must be numeric and greater than
 * zero. DbxrefQualToDbtag drops the qualifier otherwise. NULL-terminated.
 */
static const char *DbxrefTagInt[] = {
    "ATCC",
    "ATCC(DNA)",
    "ATCC(IN HOST)",
    "BDGP_EST",
    "BDGP_INS",
    "ESTLIB",
    "GENEID",
    "GI",
    "GO",
    "GREENGENES",
    "INTREPIDBIO",
    "JCM",
    "LOCUSID",
    "MIM",
    "MYCOBANK",
    "NBRC",
    "PBMICE",
    "RATMAP",
    "RGD",
    "UNILIB",
    "UNISTS",
    NULL
};
428
/* NULL-terminated list of qualifier names.
 * NOTE(review): presumably the qualifiers that may appear without a
 * value; the users of this table are outside this view - confirm.
 */
static const char *EmptyQuals[] = {
    "artificial_location",              /* Fake entry: listed here so that
                                           an empty value gets caught */
    "chloroplast",
    "chromoplast",
    "cyanelle",
    "environmental_sample",
    "focus",
    "germline",
    "kinetoplast",
    "macronuclear",
    "metagenomic",
    "mitochondrion",
    "mobile_element_type",              /* Fake entry: listed here so that
                                           an empty value gets caught */
    "partial",
    "proviral",
    "pseudo",
    "rearranged",
    "ribosomal_slippage",
    "trans_splicing",
    "transgenic",
    "virion",
    NULL
};
454
/* NULL-terminated list of feature keys.
 * NOTE(review): by its name, the features on which /trans_splicing is
 * allowed; the check itself is outside this view - confirm.
 */
const char *TransSplicingFeats[] = {
    "3'UTR",
    "5'UTR",
    "CDS",
    "gene",
    "mRNA",
    "misc_RNA",
    "precursor_RNA",
    "tRNA",
    NULL
};
466
/* NULL-terminated list of ncRNA class names (presumably the accepted
 * /ncRNA_class values - the validation code is outside this view).
 */
const char *ncRNA_class_values[] = {
    "antisense_RNA",
    "autocatalytically_spliced_intron",
    "hammerhead_ribozyme",
    "lncRNA",
    "RNase_P_RNA",
    "RNase_MRP_RNA",
    "telomerase_RNA",
    "guide_RNA",
    "rasiRNA",
    "ribozyme",
    "scRNA",
    "siRNA",
    "miRNA",
    "piRNA",
    "pre_miRNA",
    "snoRNA",
    "snRNA",
    "SRP_RNA",
    "vault_RNA",
    "Y_RNA",
    "other",
    NULL
};
491
/* NULL-terminated list of satellite type names (presumably the accepted
 * /satellite value prefixes - confirm against the users of this table).
 */
const char *SatelliteValues[] = {
    "satellite",
    "minisatellite",
    "microsatellite",
    NULL
};
498
/* NULL-terminated list of pseudogene type names (presumably the accepted
 * /pseudogene values - confirm against the users of this table).
 */
const char *PseudoGeneValues[] = {
    "allelic",
    "processed",
    "unitary",
    "unknown",
    "unprocessed",
    NULL
};
507
/* NULL-terminated list of regulatory class names (presumably the accepted
 * /regulatory_class values - confirm against the users of this table).
 */
const char *RegulatoryClassValues[] = {
    "attenuator",
    "CAAT_signal",
    "DNase_I_hypersensitive_site",
    "enhancer",
    "enhancer_blocking_element",
    "GC_signal",
    "imprinting_control_region",
    "insulator",
    "locus_control_region",
    "matrix_attachment_region",
    "minus_35_signal",
    "minus_10_signal",
    "response_element",
    "polyA_signal_sequence",
    "promoter",
    "recoding_stimulatory_region",
    "replication_regulatory_region",
    "ribosome_binding_site",
    "riboswitch",
    "silencer",
    "TATA_box",
    "terminator",
    "transcriptional_cis_regulatory_region",
    "other",
    NULL
};
535
/* Maps gap type text to an integer code; the trailing comment on each row
 * names the code. Both "repeat ..." texts map to 7. The table ends with a
 * {NULL, -1} sentinel.
 */
StrNum GapTypeValues[] = {
    {"between scaffolds",        8},    /* contig */
    {"within scaffold",          9},    /* scaffold */
    {"telomere",                 6},    /* telomere */
    {"centromere",               5},    /* centromere */
    {"short arm",                3},    /* short-arm */
    {"heterochromatin",          4},    /* heterochromatin */
    {"repeat within scaffold",   7},    /* repeat */
    {"repeat between scaffolds", 7},    /* repeat */
    {"unknown",                  0},    /* unknown */
    {NULL,                      -1}
};
548
/* Maps linkage evidence text to an integer code; the trailing comment on
 * each row names the code. The table ends with a {NULL, -1} sentinel.
 */
StrNum LinkageEvidenceValues[] = {
    {"paired-ends",        0},          /* paired-end */
    {"align genus",        1},          /* align-genus */
    {"align xgenus",       2},          /* align-xgenus */
    {"align trnscpt",      3},          /* align-trnscpt */
    {"within clone",       4},          /* within-clone */
    {"clone contig",       5},          /* clone-contig */
    {"map",                6},          /* map */
    {"strobe",             7},          /* strobe */
    {"unspecified",        8},          /* unspecified */
    {"pcr",                9},          /* pcr */
    {"proximity ligation", 10},         /* proximity-ligation */
    {NULL,                -1}
};
563
564 /**********************************************************/
FreeFeatBlkQual(FeatBlkPtr fbp)565 static void FreeFeatBlkQual(FeatBlkPtr fbp)
566 {
567 MemFree(fbp->key);
568 MemFree(fbp->location);
569 delete fbp;
570 }
571
572 /**********************************************************/
FreeFeatBlk(DataBlkPtr dbp,Parser::EFormat format)573 static void FreeFeatBlk(DataBlkPtr dbp, Parser::EFormat format)
574 {
575 DataBlkPtr dbpnext;
576 FeatBlkPtr fbp;
577
578 for(; dbp != NULL; dbp = dbpnext)
579 {
580 dbpnext = dbp->next;
581 fbp = (FeatBlkPtr) dbp->data;
582 if(fbp != NULL)
583 {
584 FreeFeatBlkQual(fbp);
585 dbp->data = NULL;
586 }
587 if(format == Parser::EFormat::XML)
588 MemFree(dbp);
589 }
590 }
591
/**********************************************************
 *
 *   static void DelCharBtwData(value):
 *
 *      Removes every blank (' ') from "value" in place:
 *   the remaining characters are shifted left and the
 *   string is terminated again. Only the space character
 *   is removed, other white space is kept.
 *
 **********************************************************/
static void DelCharBtwData(char* value)
{
    char*       dst = value;            /* next position to write */
    const char* src = value;            /* next character to examine */

    while (*src != '\0')
    {
        if (*src != ' ')
        {
            *dst = *src;
            ++dst;
        }
        ++src;
    }
    *dst = '\0';
}
608
609 /**********************************************************
610 *
611 * static Int4 flat2asn_range_func(pp, sip):
612 *
613 * For error handle in gbparint.c routines.
614 * This function has to return the length corresponding
615 * to the SeqId it is passed.
616 *
617 * ks 1/13/94
618 *
619 **********************************************************/
flat2asn_range_func(void * pp_ptr,const objects::CSeq_id & id)620 static Int4 flat2asn_range_func(void* pp_ptr, const objects::CSeq_id& id)
621 {
622 ParserPtr pp = reinterpret_cast<ParserPtr>(pp_ptr);
623
624 int use_indx = pp->curindx;
625 char* acnum;
626
627 Int2 vernum;
628
629 #ifdef BIOSEQ_FIND_METHOD
630
631 bsp = BioseqFind(sip);
632 if(bsp != NULL)
633 return(bsp->length);
634
635 /* could try ID0 server
636 */
637 return(-1);
638
639 #else
640
641 const objects::CTextseq_id* text_id = nullptr;
642 if (id.IsGenbank() || id.IsEmbl() || id.IsDdbj() || id.IsTpg() ||
643 id.IsTpe() || id.IsTpd())
644 text_id = id.GetTextseq_Id();
645
646 if (text_id != nullptr)
647 {
648 Int2 text_id_ver = text_id->IsSetVersion() ? text_id->GetVersion() : INT2_MIN;
649 const std::string& text_id_acc = text_id->GetAccession();
650 for (use_indx = 0; use_indx < pp->indx; use_indx++)
651 {
652 acnum = pp->entrylist[use_indx]->acnum;
653 vernum = pp->entrylist[use_indx]->vernum;
654 if (text_id_acc == acnum &&
655 (pp->accver == false || vernum == text_id_ver))
656 break;
657 }
658
659 if (use_indx >= pp->indx)
660 {
661 /* entry is not present in this file use remote fetch function
662 * use_indx = pp->curindx;
663 */
664 size_t len = (!pp->ffdb) ? -1 : CheckOutsideEntry(pp, text_id_acc.c_str(), text_id_ver);
665 if (len != static_cast<size_t>(-1))
666 return static_cast<Int4>(len);
667
668 if (pp->buf == NULL)
669 {
670 if (pp->farseq)
671 return -1;
672
673 if (pp->accver == false || text_id_ver < 0)
674 {
675 Nlm_ErrSetContext("validatr", __FILE__, __LINE__);
676 Nlm_ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck,
677 "Location points to outside entry %s",
678 text_id_acc.c_str());
679 }
680 else
681 {
682 Nlm_ErrSetContext("validatr", __FILE__, __LINE__);
683 Nlm_ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck,
684 "Location points to outside entry %s.%d",
685 text_id_acc.c_str(), text_id_ver);
686 }
687 return(-1);
688 }
689
690 if (*pp->buf == '\0')
691 return(-1);
692
693 if (pp->source == Parser::ESource::NCBI || pp->source == Parser::ESource::Refseq)
694 ErrPostEx(SEV_WARNING, ERR_LOCATION_NCBIRefersToExternalRecord,
695 "Feature location references an interval on another record : %s",
696 pp->buf);
697 else
698 ErrPostEx(SEV_WARNING, ERR_LOCATION_RefersToExternalRecord,
699 "Feature location references an interval on another record : %s",
700 pp->buf);
701 MemFree(pp->buf);
702 pp->buf = (char*)MemNew(1);
703 *pp->buf = '\0';
704 return(-1);
705 }
706 }
707 return static_cast<Int4>(pp->entrylist[use_indx]->bases);
708
709 #endif
710
711 }
712
713 /**********************************************************/
CheckForeignLoc(const objects::CSeq_loc & loc,const objects::CSeq_id & sid)714 static bool CheckForeignLoc(const objects::CSeq_loc& loc, const objects::CSeq_id& sid)
715 {
716 const objects::CSeq_id& pid = *loc.GetId();
717
718 if (loc.IsMix() || loc.IsEquiv() ||
719 sid.Compare(pid) == objects::CSeq_id::e_YES)
720 return false;
721
722 return true;
723 }
724
725 /**********************************************************/
DbxrefQualToDbtag(const objects::CGb_qual & qual,Parser::ESource source)726 static CRef<objects::CDbtag> DbxrefQualToDbtag(const objects::CGb_qual& qual, Parser::ESource source)
727 {
728 CRef<objects::CDbtag> tag;
729
730 if (!qual.IsSetQual() ||
731 qual.GetQual() != "db_xref")
732 return tag;
733
734 if (!qual.IsSetVal() || qual.GetVal().empty())
735 {
736 ErrPostEx(SEV_WARNING, ERR_QUALIFIER_EmptyQual,
737 "Found empty /db_xref qualifier. Qualifier dropped.");
738 return tag;
739 }
740
741 const std::string& val = qual.GetVal();
742 if (StringICmp(val.c_str(), "taxon") == 0)
743 return tag;
744
745 std::string line = val;
746
747 if (StringNICmp(line.c_str(), "MGD:MGI:", 8) == 0)
748 line = line.substr(4);
749
750 size_t colon = line.find(':');
751 if (colon == std::string::npos)
752 {
753 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefIncorrect,
754 "Badly formatted /db_xref qualifier: \"%s\". Qualifier dropped.",
755 val.c_str());
756 return tag;
757 }
758
759 std::string tail = line.substr(colon + 1);
760 line = line.substr(0, colon);
761
762 if (MatchArrayIString(DbxrefObsolete, line.c_str()) > -1)
763 {
764 ErrPostEx(SEV_WARNING, ERR_FEATURE_ObsoleteDbXref,
765 "/db_xref type \"%s\" is obsolete.", line.c_str());
766
767 std::string buf;
768 if(StringICmp(line.c_str(), "BHB") == 0)
769 buf = "IRD";
770 else if (StringICmp(line.c_str(), "BioHealthBase") == 0)
771 buf = "IRD";
772 else if (StringICmp(line.c_str(), "GENEW") == 0)
773 buf = "HGNC";
774 else if (StringICmp(line.c_str(), "IFO") == 0)
775 buf = "NBRC";
776 else if (StringICmp(line.c_str(), "SWISS-PROT") == 0)
777 buf = "UniProt/Swiss-Prot";
778 else
779 buf = "UniProt/TrEMBL";
780
781 line = buf;
782 }
783
784 if(StringICmp(line.c_str(), "UNIPROT/SWISS-PROT") == 0 ||
785 StringICmp(line.c_str(), "UNIPROT/TREMBL") == 0)
786 {
787 std::string buf("UniProtKB");
788 buf += line.substr(7);
789
790 line = buf;
791 }
792
793 const Char* strid = NULL;
794 Int4 intid = 0;
795
796 const Char* p = tail.c_str();
797 if (MatchArrayIString(DbxrefTagAny, line.c_str()) > -1)
798 {
799 for(strid = p; *p >= '0' && *p <= '9';)
800 p++;
801 if(*p == '\0' && *strid != '0')
802 {
803 intid = atoi(strid);
804 strid = NULL;
805 }
806 }
807 else if(MatchArrayIString(DbxrefTagStr, line.c_str()) > -1 ||
808 (source == Parser::ESource::EMBL &&
809 MatchArrayIString(EMBLDbxrefTagStr, line.c_str()) > -1))
810 {
811 for(strid = p; *p >= '0' && *p <= '9';)
812 p++;
813 if(*p == '\0')
814 {
815 ErrPostEx(SEV_WARNING, ERR_QUALIFIER_DbxrefWrongType,
816 "/db_xref qualifier \"%s\" is supposed to be a string, but its value consists of digits only.",
817 val.c_str());
818 if(*strid != '0')
819 {
820 intid = atoi(strid);
821 strid = NULL;
822 }
823 }
824 }
825 else if(MatchArrayIString(DbxrefTagInt, line.c_str()) > -1)
826 {
827 const Char* q = p;
828 for(; *q == '0';)
829 q++;
830 if(*q == '\0')
831 {
832 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefShouldBeNumeric,
833 "/db_xref qual should have numeric value greater than 0: \"%s\". Qualifier dropped.",
834 val.c_str());
835 return tag;
836 }
837
838 const Char* r = q;
839 for(; *r >= '0' && *r <= '9';)
840 r++;
841 if(*r != '\0')
842 {
843 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefWrongType,
844 "/db_xref qualifier \"%s\" is supposed to be a numeric identifier, but its value includes alphabetic characters. Qualifier dropped.",
845 val.c_str());
846 return tag;
847 }
848 if(*r != '\0' || q != p)
849 strid = p;
850 else if(StringICmp(line.c_str(), "IntrepidBio") == 0 && fta_number_is_huge(q))
851 strid = q;
852 else
853 intid = atoi(q);
854 }
855 else if(StringICmp(line.c_str(), "PID") == 0)
856 {
857 if(*p != 'e' && *p != 'g' && *p != 'd')
858 {
859 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefIncorrect,
860 "Badly formatted /db_xref qual \"PID\": \"%s\". Qualifier dropped.",
861 val.c_str());
862 return tag;
863 }
864
865 const Char* q = p + 1;
866 for(; *q == '0';)
867 q++;
868
869 const Char* r = q;
870 for (r = q; *r >= '0' && *r <= '9';)
871 r++;
872 if(*q == '\0' || *r != '\0')
873 {
874 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefShouldBeNumeric,
875 "/db_xref qual \"PID\" should contain numeric value greater than 0: \"%s\". Qualifier dropped.",
876 val.c_str());
877 return tag;
878 }
879 strid = p;
880 }
881 else
882 {
883 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DbxrefUnknownDBName,
884 "Unknown data base name /db_xref = \"%s\". Qualifier dropped.",
885 val.c_str());
886 return tag;
887 }
888
889
890 tag.Reset(new objects::CDbtag);
891
892 tag->SetDb(line);
893
894 if(strid != NULL)
895 tag->SetTag().SetStr(strid);
896 else
897 tag->SetTag().SetId(intid);
898
899 return tag;
900 }
901
902 /**********************************************************
903 *
904 * Function:
905 * static void FilterDb_xref(pSeqFeat, source)
906 *
907 * Purpose:
908 * Looks through SeqFeat's qualifiers which contain
909 * "db_xref" in qual field, convert such qualifiers
910 * into Dbtags removing the qualifiers from SeqFeat's
911 * list, got Dbtags links in the chain of ValNodes
912 * and puts the chain into the SeqFeat.
913 *
914 * Parameters:
915 * pSeqFeat - pointer to a SeqFeat for processing
916 *
917 * Return:
918 * None.
919 *
920 **********************************************************/
FilterDb_xref(objects::CSeq_feat & feat,Parser::ESource source)921 static void FilterDb_xref(objects::CSeq_feat& feat, Parser::ESource source)
922 {
923 if (!feat.IsSetQual())
924 return;
925
926 objects::CSeq_feat::TDbxref& db_refs = feat.SetDbxref();
927
928 for (objects::CSeq_feat::TQual::iterator qual = feat.SetQual().begin(); qual != feat.SetQual().end(); )
929 {
930 if (!(*qual)->IsSetQual() || (*qual)->GetQual() != "db_xref")
931 {
932 /* Just skip this qualifier, it isn't db_xref
933 */
934 ++qual;
935 continue;
936 }
937
938 /* Current qualifier is db_xref, process it
939 */
940 CRef<objects::CDbtag> dbtag = DbxrefQualToDbtag(*(*qual), source);
941 if (dbtag.NotEmpty())
942 {
943 db_refs.push_back(dbtag);
944 }
945
946 /* Remove converted qualifier from chain of qualifiers
947 */
948 qual = feat.SetQual().erase(qual);
949 }
950
951 if (feat.GetQual().empty())
952 feat.ResetQual();
953
954 if (db_refs.empty())
955 feat.ResetDbxref();
956 }
957
958 /**********************************************************
959 *
960 * bool GetSeqLocation(sfp, location, ids,
961 * hard_err, pp, name):
962 *
963 * Return locmap = TRUE if mapping location rules not
964 * work, then SeqLocPtr->whole = ids[0].
965 * sfp->location is a SeqLocPtr which is defined
966 * as a ValNodePtr.
967 *
968 * 7-26-93
969 *
970 **********************************************************/
GetSeqLocation(objects::CSeq_feat & feat,char * location,TSeqIdList & ids,bool * hard_err,ParserPtr pp,char * name)971 bool GetSeqLocation(objects::CSeq_feat& feat, char* location, TSeqIdList& ids,
972 bool* hard_err, ParserPtr pp, char* name)
973 {
974 bool sitesmap;
975 bool locmap = true;
976 int num_errs;
977
978 *hard_err = false;
979 num_errs = 0;
980
981 CRef<objects::CSeq_loc> loc = xgbparseint_ver(location, locmap, sitesmap,
982 num_errs, ids, pp->accver);
983
984 if (loc.NotEmpty())
985 {
986 TSeqLocList locs;
987 locs.push_back(loc);
988 fta_fix_seq_loc_id(locs, pp, location, name, false);
989
990 feat.SetLocation(*loc);
991 }
992
993 if (num_errs > 0)
994 {
995 feat.ResetLocation();
996 objects::CSeq_loc& cur_loc = feat.SetLocation();
997 cur_loc.SetWhole(*(*ids.begin()));
998 *hard_err = true;
999 }
1000 else if(!feat.GetLocation().IsEmpty())
1001 {
1002 if (feat.GetLocation().IsMix())
1003 {
1004 if (feat.GetLocation().GetMix().Get().size() == 1)
1005 {
1006 CRef<objects::CSeq_loc> cur_loc(new objects::CSeq_loc);
1007
1008 cur_loc->Assign(*feat.GetLocation().GetMix().GetFirstLoc());
1009 if (cur_loc->IsInt())
1010 feat.SetLocation(*cur_loc);
1011 }
1012 }
1013 }
1014
1015 return locmap;
1016 }
1017
1018 /**********************************************************
1019 *
1020 * static char* CheckLocStr(str):
1021 *
1022 * Nlm_gbparseint routine does not parse certain types
1023 * of interval correctly, so this routine will save input
1024 * form in fbp before passing it:
1025 * (bases 100 to 300) ==> 100 to 300;
1026 * (bases 1 to 100; 200 to 300) no change.
1027 *
1028 * 5-20-93
1029 *
1030 **********************************************************/
CheckLocStr(const Char * str)1031 static char* CheckLocStr(const Char* str)
1032 {
1033 const Char* ptr;
1034 const Char* eptr;
1035 char* location;
1036
1037 ptr = StringChr(str, ';');
1038 if(ptr != NULL)
1039 return StringSave(str);
1040
1041 for(ptr = str; *ptr != ' ' && *ptr != '\0';)
1042 ptr++;
1043 while(*ptr == ' ')
1044 ptr++;
1045
1046 eptr = StringChr(str, ')');
1047 if(eptr == NULL)
1048 return(NULL);
1049
1050 while(*eptr == ' ' || *eptr == ')')
1051 --eptr;
1052
1053 location = StringSave(std::string(ptr, eptr + 1).c_str());
1054 return(location);
1055 }
1056
1057 /*****************************************************************************
1058 *
1059 * bool SeqIntCheckCpp(loc) is instead of C-toolkit 'bool SeqIntCheck(sip)'
1060 * checks that a seq interval is valid
1061 *
1062 *****************************************************************************/
SeqIntCheckCpp(const objects::CSeq_loc & loc)1063 static bool SeqIntCheckCpp(const objects::CSeq_loc& loc)
1064 {
1065 Uint4 len = UINT4_MAX;
1066
1067 objects::CBioseq_Handle bio_h = GetScope().GetBioseqHandle(*loc.GetId());
1068 if (bio_h.CanGetInst() && bio_h.CanGetInst_Length())
1069 len = bio_h.GetBioseqLength();
1070
1071 return loc.GetInt().GetFrom() <= loc.GetInt().GetTo() && loc.GetInt().GetTo() < len;
1072 }
1073
1074 /*****************************************************************************
1075 *
1076 * bool SeqPntCheckCpp(loc) is instead of C-toolkit 'Boolean SeqPntCheck(SeqPntPtr spp)'
1077 * checks that a seq point is valid
1078 *
1079 *****************************************************************************/
SeqPntCheckCpp(const objects::CSeq_loc & loc)1080 static bool SeqPntCheckCpp(const objects::CSeq_loc& loc)
1081 {
1082 Uint4 len = UINT4_MAX;
1083
1084 objects::CBioseq_Handle bio_h = GetScope().GetBioseqHandle(*loc.GetId());
1085 if (bio_h.CanGetInst() && bio_h.CanGetInst_Length())
1086 len = bio_h.GetBioseqLength();
1087
1088 return loc.GetPnt().GetPoint() < len;
1089 }
1090
1091 /*****************************************************************************
1092 *
1093 * bool PackSeqPntCheck(loc) is instead of C-toolkit 'Boolean PackSeqPntCheck (pspp)'
1094 *
1095 *****************************************************************************/
PackSeqPntCheckCpp(const objects::CSeq_loc & loc)1096 static bool PackSeqPntCheckCpp(const objects::CSeq_loc& loc)
1097 {
1098 Uint4 len = UINT4_MAX;
1099
1100 objects::CBioseq_Handle bio_h = GetScope().GetBioseqHandle(*loc.GetId());
1101 if (bio_h.CanGetInst() && bio_h.CanGetInst_Length())
1102 len = bio_h.GetBioseqLength();
1103
1104 ITERATE(objects::CSeq_loc::TPoints, point, loc.GetPacked_pnt().GetPoints())
1105 {
1106 if (*point >= len)
1107 return false;
1108 }
1109
1110 return true;
1111 }
1112
1113 /**********************************************************/
1114 /* returns : 2 = Ok, 1 = mixed strands, 0 = error in location
1115 */
FTASeqLocCheck(const objects::CSeq_loc & locs,char * accession)1116 static Uint1 FTASeqLocCheck(const objects::CSeq_loc& locs, char* accession)
1117 {
1118 Uint1 strand = 99;
1119 Uint1 retval = 2;
1120
1121 objects::CSeq_loc_CI ci(locs);
1122
1123 bool good = true;
1124 for (; ci; ++ci)
1125 {
1126 CConstRef<objects::CSeq_loc> cur_loc = ci.GetRangeAsSeq_loc();
1127
1128 const objects::CSeq_id* cur_id = nullptr;
1129
1130 switch (cur_loc->Which())
1131 {
1132 case objects::CSeq_loc::e_Int:
1133 good = SeqIntCheckCpp(*cur_loc);
1134 if (good)
1135 cur_id = cur_loc->GetId();
1136 break;
1137
1138 case objects::CSeq_loc::e_Pnt:
1139 good = SeqPntCheckCpp(*cur_loc);
1140 if (good)
1141 cur_id = cur_loc->GetId();
1142 break;
1143
1144 case objects::CSeq_loc::e_Packed_pnt:
1145 good = PackSeqPntCheckCpp(*cur_loc);
1146 if (good)
1147 cur_id = cur_loc->GetId();
1148 break;
1149
1150 case objects::CSeq_loc::e_Bond:
1151 if (!cur_loc->GetBond().CanGetA())
1152 good = false;
1153
1154 if (good)
1155 cur_id = cur_loc->GetId();
1156 break;
1157
1158 case objects::CSeq_loc::e_Empty:
1159 case objects::CSeq_loc::e_Whole:
1160 cur_id = cur_loc->GetId();
1161 break;
1162
1163 default:
1164 continue;
1165 }
1166
1167 if (!good)
1168 break;
1169
1170 if (accession == nullptr || cur_id == nullptr)
1171 continue;
1172
1173 if (!cur_id->IsGenbank() && !cur_id->IsEmbl() && !cur_id->IsPir() &&
1174 !cur_id->IsSwissprot() && !cur_id->IsOther() && !cur_id->IsDdbj() &&
1175 !cur_id->IsPrf() && !cur_id->IsTpg() && !cur_id->IsTpe() &&
1176 !cur_id->IsTpd() && !cur_id->IsGpipe())
1177 continue;
1178
1179 const objects::CTextseq_id* text_id = cur_id->GetTextseq_Id();
1180
1181 if (text_id == nullptr || !text_id->CanGetAccession())
1182 continue;
1183
1184 if (text_id->GetAccession() == accession)
1185 {
1186 if (strand == 99)
1187 strand = cur_loc->GetStrand();
1188 else if (strand != cur_loc->GetStrand())
1189 retval = 1;
1190 }
1191 }
1192
1193 if (!good)
1194 return 0;
1195
1196 return retval;
1197 }
1198
1199 /**********************************************************/
fta_strip_aa(char * str)1200 static void fta_strip_aa(char* str)
1201 {
1202 if(str == NULL || *str == '\0')
1203 return;
1204
1205 while(str != NULL)
1206 {
1207 str = StringStr(str, "aa");
1208 if(str != NULL)
1209 fta_StringCpy(str, str + 2);
1210 }
1211 }
1212
1213 /**********************************************************
1214 *
1215 * static SeqFeatPtr SeqFeatPub(pp, entry, hsfp, seq_id,
1216 * col_data, ibp):
1217 *
1218 * 5-26-93
1219 *
1220 **********************************************************/
SeqFeatPub(ParserPtr pp,DataBlkPtr entry,TSeqFeatList & feats,TSeqIdList & seqids,Int4 col_data,IndexblkPtr ibp)1221 static void SeqFeatPub(ParserPtr pp, DataBlkPtr entry, TSeqFeatList& feats,
1222 TSeqIdList& seqids, Int4 col_data, IndexblkPtr ibp)
1223 {
1224 DataBlkPtr dbp;
1225 DataBlkPtr subdbp;
1226 char* p;
1227 char* q;
1228 char* location = NULL;
1229
1230 bool err = false;
1231 Uint1 i;
1232
1233 /* REFERENCE, to Seq-feat
1234 */
1235 if(pp->format == Parser::EFormat::XML)
1236 dbp = XMLBuildRefDataBlk(entry->offset, ibp->xip, ParFlat_REF_BTW);
1237 else
1238 dbp = TrackNodeType(entry, ParFlat_REF_BTW);
1239 if(dbp == NULL)
1240 return;
1241
1242
1243 for(; dbp != NULL; dbp = dbp->next)
1244 {
1245 if(dbp->type != ParFlat_REF_BTW)
1246 continue;
1247
1248 CRef<objects::CPubdesc> pubdesc = DescrRefs(pp, dbp, col_data);
1249 if (pubdesc.Empty())
1250 continue;
1251
1252 CRef<objects::CSeq_feat> feat(new objects::CSeq_feat);
1253 feat->SetData().SetPub(*pubdesc);
1254
1255 location = NULL;
1256 if(pp->format == Parser::EFormat::XML)
1257 {
1258 location = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
1259 INSDREFERENCE_POSITION);
1260 if(location == NULL)
1261 {
1262 q = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
1263 INSDREFERENCE_REFERENCE);
1264 if(q != NULL)
1265 {
1266 for(p = q; *p != '\0' && *p != '(';)
1267 p++;
1268 if(*p != '\0')
1269 location = CheckLocStr(p + 1);
1270 MemFree(q);
1271 }
1272 }
1273 else
1274 {
1275 p = StringChr(location, ';');
1276 if(p != NULL)
1277 {
1278 p = (char*) MemNew(StringLen(location) + 7);
1279 StringCpy(p, "join(");
1280 StringCat(p, location);
1281 StringCat(p, ")");
1282 MemFree(location);
1283 location = p;
1284 }
1285 }
1286 }
1287 else if(pp->format == Parser::EFormat::GenBank)
1288 {
1289 for(p = dbp->offset + col_data; *p != '\0' && *p != '(';)
1290 p++;
1291 location = CheckLocStr(std::string(p, dbp->offset + dbp->len - p).c_str());
1292 }
1293 else if(pp->format == Parser::EFormat::EMBL)
1294 {
1295 subdbp = (DataBlkPtr) dbp->data;
1296 for(; subdbp != NULL; subdbp = subdbp->next)
1297 {
1298 if(subdbp->type != ParFlat_RP)
1299 continue;
1300
1301 for(p = subdbp->offset; *p != '\0' && IS_DIGIT(*p) == 0;)
1302 p++;
1303 if(StringChr(p, ',') != NULL)
1304 {
1305 location = (char*) MemNew(StringLen(p) + 7);
1306 sprintf(location, "join(%s)", p);
1307 }
1308 else
1309 location = StringSave(p);
1310 break;
1311 }
1312 }
1313 if(location == NULL || *location == '\0')
1314 {
1315 ErrPostEx(SEV_REJECT, ERR_REFERENCE_UnparsableLocation,
1316 "NULL or empty reference location. Entry dropped.");
1317 err = true;
1318 if(location != NULL)
1319 MemFree(location);
1320 break;
1321 }
1322
1323 if(ibp->is_prot)
1324 fta_strip_aa(location);
1325
1326 if(pp->buf != NULL)
1327 MemFree(pp->buf);
1328 pp->buf = NULL;
1329
1330 GetSeqLocation(*feat, location, seqids, &err, pp, (char*) "pub");
1331
1332 if(err)
1333 {
1334 ErrPostEx(SEV_REJECT, ERR_REFERENCE_UnparsableLocation,
1335 "Unparsable reference location. Entry dropped.");
1336 MemFree(location);
1337 break;
1338 }
1339
1340 i = FTASeqLocCheck(feat->GetLocation(), ibp->acnum);
1341
1342 if(i == 0)
1343 {
1344 ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck, location);
1345 if(pp->debug)
1346 {
1347 feats.push_back(feat);
1348 }
1349 }
1350 else
1351 {
1352 if(i == 1)
1353 {
1354 ErrPostEx(SEV_WARNING, ERR_LOCATION_MixedStrand,
1355 "Mixed strands in SeqLoc: %s", location);
1356 }
1357 feats.push_back(feat);
1358 }
1359 if(location != NULL)
1360 MemFree(location);
1361 }
1362
1363 if(!err)
1364 return;
1365
1366 ibp->drop = 1;
1367 feats.clear();
1368 }
1369
1370 /**********************************************************
1371 *
1372 * static SeqFeatPtr ImpFeatPub(pp, entry, hsfp, seq_id,
1373 * col_data, ibp):
1374 *
1375 * 5-26-93
1376 *
1377 **********************************************************/
ImpFeatPub(ParserPtr pp,DataBlkPtr entry,TSeqFeatList & feats,objects::CSeq_id & seq_id,Int4 col_data,IndexblkPtr ibp)1378 static void ImpFeatPub(ParserPtr pp, DataBlkPtr entry, TSeqFeatList& feats,
1379 objects::CSeq_id& seq_id, Int4 col_data, IndexblkPtr ibp)
1380 {
1381 DataBlkPtr dbp;
1382
1383 bool first;
1384
1385 /* REFERENCE, Imp-feat
1386 */
1387 if(pp->format == Parser::EFormat::XML)
1388 dbp = XMLBuildRefDataBlk(entry->offset, ibp->xip, ParFlat_REF_SITES);
1389 else
1390 dbp = TrackNodeType(entry, ParFlat_REF_SITES);
1391 if(dbp == NULL)
1392 return;
1393
1394 CRef<objects::CSeq_feat> feat;
1395 for (first = true; dbp != NULL; dbp = dbp->next)
1396 {
1397 if(dbp->type != ParFlat_REF_SITES)
1398 continue;
1399
1400 CRef<objects::CPubdesc> pubdesc = DescrRefs(pp, dbp, col_data);
1401 if (pubdesc.Empty() || !pubdesc->IsSetPub())
1402 continue;
1403
1404 if(first)
1405 {
1406 feat.Reset(new objects::CSeq_feat);
1407
1408 objects::CImp_feat& imp_feat = feat->SetData().SetImp();
1409 imp_feat.SetKey("Site-ref");
1410 imp_feat.SetLoc("sites");
1411
1412 feat->SetLocation(*fta_get_seqloc_int_whole(seq_id, ibp->bases));
1413 first = false;
1414 }
1415
1416 CRef<objects::CPub> pub(new objects::CPub);
1417 pub->SetEquiv(pubdesc->SetPub());
1418
1419 feat->SetCit().SetPub().push_back(pub);
1420
1421 if (pubdesc->IsSetComment())
1422 feat->SetComment(pubdesc->GetComment());
1423 else
1424 feat->ResetComment();
1425 }
1426
1427 if (!first && feat.NotEmpty())
1428 feats.push_back(feat);
1429 }
1430
1431 /**********************************************************/
/* Do-nothing handler: accepts two const Char* arguments and ignores both.
 * GetTrnaAnticodon() installs it with xinstall_gbparse_error_handler()
 * around its call to xgbparseint_ver() - presumably to keep that parser
 * from reporting its own diagnostics there; confirm against the gbparse API.
 */
static void fta_fake_gbparse_err_handler(const Char*, const Char*)
{
}
1435
1436 /**********************************************************/
location_to_string_or_unknown(const objects::CSeq_loc & loc)1437 static Char* location_to_string_or_unknown(const objects::CSeq_loc& loc)
1438 {
1439 Char* ret = location_to_string(loc);
1440 if (ret == NULL)
1441 ret = StringSave("unknown location");
1442
1443 return ret;
1444 }
1445
1446 /**********************************************************/
GetTrnaAnticodon(const objects::CSeq_feat & feat,char * qval,const TSeqIdList & seqids,bool accver)1447 static CRef<objects::CSeq_loc> GetTrnaAnticodon(const objects::CSeq_feat& feat, char* qval, const TSeqIdList& seqids,
1448 bool accver)
1449 {
1450 char* loc_str;
1451 char* p;
1452 char* q;
1453 bool fake1;
1454 bool fake2;
1455 Int4 range;
1456 Int4 pars;
1457 Char ch;
1458 int fake3;
1459
1460 CRef<objects::CSeq_loc> ret;
1461
1462 if (qval == NULL)
1463 return ret;
1464
1465 p = StringStr(qval, "pos:");
1466 if (p == NULL)
1467 return ret;
1468
1469 for(q = p + 4; *q == ' ';)
1470 q++;
1471
1472 for(pars = 0, p = q; *p != '\0'; p++)
1473 {
1474 if(*p == ',' && pars == 0)
1475 break;
1476 if(*p == '(')
1477 pars++;
1478 else if(*p == ')')
1479 {
1480 pars--;
1481 if(pars == 0)
1482 {
1483 p++;
1484 break;
1485 }
1486 }
1487 }
1488
1489 ch = *p;
1490 *p = '\0';
1491 loc_str = StringSave(q);
1492 *p = ch;
1493
1494 xinstall_gbparse_error_handler(fta_fake_gbparse_err_handler);
1495 ret = xgbparseint_ver(loc_str, fake1, fake2, fake3, seqids, accver);
1496 xinstall_gbparse_error_handler(NULL);
1497
1498 if (ret.Empty())
1499 {
1500 p = location_to_string_or_unknown(feat.GetLocation());
1501
1502 ErrPostEx(SEV_ERROR, ERR_FEATURE_InvalidAnticodonPos,
1503 "Invalid position element for an /anticodon qualifier : \"%s\" : qualifier dropped : feature location \"%s\".",
1504 loc_str, (p == NULL) ? "unknown" : p);
1505
1506 if (p != NULL)
1507 MemFree(p);
1508 MemFree(loc_str);
1509
1510 return ret;
1511 }
1512
1513 range = objects::sequence::GetLength(*ret, &GetScope());
1514 if (range != 3)
1515 {
1516 p = location_to_string_or_unknown(feat.GetLocation());
1517
1518 if (range == 4)
1519 ErrPostEx(SEV_WARNING, ERR_FEATURE_FourBaseAntiCodon,
1520 "tRNA feature at \"%s\" has anticodon with location spanning four bases: \"%s\". Cannot generate corresponding codon value from the DNA sequence.",
1521 (p == NULL) ? "unknown" : p, loc_str);
1522 else
1523 ErrPostEx(SEV_ERROR, ERR_FEATURE_StrangeAntiCodonSize,
1524 "tRNA feature at \"%s\" has anticodon of an unusual size: \"%s\". Cannot generate corresponding codon value from the DNA sequence.",
1525 (p == NULL) ? "unknown" : p, loc_str);
1526
1527 if (p != NULL)
1528 MemFree(p);
1529 }
1530
1531 // Comparing two locations ignoring their IDs
1532 // Anticodon should be inside the original location (may be the same)
1533 CRange<TSeqPos> anticodon_range = ret->GetTotalRange();
1534 CRange<TSeqPos> xrange = feat.GetLocation().GetTotalRange().IntersectionWith(anticodon_range);
1535
1536 if (xrange != anticodon_range)
1537 {
1538 p = location_to_string_or_unknown(feat.GetLocation());
1539
1540 ErrPostEx(SEV_ERROR, ERR_FEATURE_BadAnticodonLoc,
1541 "Anticodon location \"%s\" does not fall within tRNA feature at \"%s\".",
1542 loc_str, (p == NULL) ? "unknown" : p);
1543
1544 if(p != NULL)
1545 MemFree(p);
1546 MemFree(loc_str);
1547
1548 ret.Reset();
1549 return ret;
1550 }
1551
1552 MemFree(loc_str);
1553 return ret;
1554 }
1555
1556 /**********************************************************/
fta_parse_rrna_feat(objects::CSeq_feat & feat,objects::CRNA_ref & rna_ref)1557 static void fta_parse_rrna_feat(objects::CSeq_feat& feat, objects::CRNA_ref& rna_ref)
1558 {
1559 char* qval;
1560 char* p;
1561 char* q;
1562 Char ch;
1563
1564 qval = GetTheQualValue(feat.SetQual(), "product");
1565 if (feat.GetQual().empty())
1566 feat.ResetQual();
1567
1568 std::string qval_str;
1569 if (qval)
1570 {
1571 qval_str = qval;
1572 MemFree(qval);
1573 qval = NULL;
1574 }
1575
1576 size_t len = 0;
1577 if (qval_str.empty() && feat.IsSetComment() && rna_ref.GetType() == objects::CRNA_ref::eType_rRNA)
1578 {
1579 std::string comment = feat.GetComment();
1580 len = comment.size();
1581
1582 if(len > 15 && len < 20)
1583 {
1584 if(StringNICmp(comment.c_str() + len - 15, "S ribosomal RNA", 15) == 0)
1585 {
1586 qval_str = comment;
1587 feat.ResetComment();
1588 }
1589 }
1590 else if(len > 6 && len < 20)
1591 {
1592 if (StringNICmp(comment.c_str() + len - 6, "S rRNA", 6) == 0)
1593 {
1594 qval_str = comment;
1595 feat.ResetComment();
1596 }
1597 }
1598 }
1599
1600 if (qval_str.empty())
1601 return;
1602
1603 qval = StringSave(qval_str.c_str());
1604 for(p = qval; p != NULL; p += 13)
1605 {
1606 p = StringIStr(p, "ribosomal rrna");
1607 if(p == NULL)
1608 break;
1609 fta_StringCpy(p + 10, p + 11);
1610 }
1611
1612 for(p = qval; p != NULL; p = qval + len)
1613 {
1614 p = StringIStr(p, "ribosomalrna");
1615 if(p == NULL)
1616 break;
1617 q = (char*) MemNew(StringLen(qval) + 2);
1618 p[9] = '\0';
1619 StringCpy(q, qval);
1620 StringCat(q, " RNA");
1621 StringCat(q, p + 12);
1622 len = p - qval + 13;
1623 MemFree(qval);
1624 qval = q;
1625 }
1626
1627 if(qval != NULL)
1628 {
1629 p = StringIStr(qval, " rrna");
1630 if(p != NULL)
1631 {
1632 q = (char*) MemNew(StringLen(qval) + 10);
1633 *p = '\0';
1634 StringCpy(q, qval);
1635 StringCat(q, " ribosomal RNA");
1636 StringCat(q, p + 5);
1637 MemFree(qval);
1638 qval = q;
1639 }
1640 }
1641
1642 for(p = qval, q = p; q != NULL; q = p + 13)
1643 {
1644 p = StringIStr(q, "ribosomal DNA");
1645 if(p == NULL)
1646 {
1647 p = StringIStr(q, "ribosomal RNA");
1648 if(p == NULL)
1649 break;
1650 }
1651 p[10] = 'R';
1652 p[11] = 'N';
1653 p[12] = 'A';
1654 }
1655
1656 p = StringIStr(qval, "s ribosomal RNA");
1657 if(p != NULL && p > qval && p[15] == '\0')
1658 {
1659 p--;
1660 if(*p >= '0' && *p <= '9')
1661 *++p = 'S';
1662 }
1663
1664 for(p = qval;;)
1665 {
1666 p = StringIStr(p, "ribosomal");
1667 if(p == NULL)
1668 break;
1669 if(p == qval || (p[9] != ' ' && p[9] != '\0'))
1670 {
1671 p += 9;
1672 continue;
1673 }
1674 if(StringNCmp(p + 9, " RNA", 4) == 0)
1675 {
1676 p += 13;
1677 continue;
1678 }
1679 len = p - qval + 14;
1680 q = (char*) MemNew(StringLen(qval) + 5);
1681 p += 9;
1682 ch = *p;
1683 *p = '\0';
1684 StringCpy(q, qval);
1685 StringCat(q, " RNA");
1686 *p = ch;
1687 StringCat(q, p);
1688 MemFree(qval);
1689 qval = q;
1690 p = qval + len;
1691 }
1692
1693 for(p = qval;;)
1694 {
1695 p = StringIStr(p, " ribosomal RNA");
1696 if(p == NULL)
1697 break;
1698 p += 14;
1699 if(StringNICmp(p, " ribosomal RNA", 14) == 0)
1700 fta_StringCpy(p, p + 14);
1701 }
1702
1703 DeleteQual(feat.SetQual(), "product");
1704 if (feat.GetQual().empty())
1705 feat.ResetQual();
1706
1707 if(StringLen(qval) > 511)
1708 {
1709 qval[510] = '>';
1710 qval[511] = '\0';
1711 p = StringSave(qval);
1712 MemFree(qval);
1713 qval = p;
1714 }
1715
1716 rna_ref.SetExt().SetName(qval);
1717 MemFree(qval);
1718 }
1719
1720 /**********************************************************/
fta_get_aa_from_symbol(Char ch)1721 static Uint1 fta_get_aa_from_symbol(Char ch)
1722 {
1723 AaCodonsPtr acp;
1724
1725 for(acp = aacodons; acp->straa != NULL; acp++)
1726 if(acp->intaa == ch)
1727 break;
1728 if(acp->straa != NULL)
1729 return(acp->intaa);
1730
1731 return(0);
1732 }
1733
1734 /**********************************************************/
fta_get_aa_from_string(char * str)1735 static Uint1 fta_get_aa_from_string(char* str)
1736 {
1737 AaCodonsPtr acp;
1738 TrnaAaPtr tap;
1739
1740 for(tap = taa; tap->name != NULL; tap++)
1741 if(StringICmp(str, tap->name) == 0)
1742 break;
1743 if(tap->name != NULL)
1744 return(tap->aa);
1745
1746 for(acp = aacodons; acp->straa != NULL; acp++)
1747 if(StringICmp(acp->straa, str) == 0)
1748 break;
1749 if(acp->straa != NULL)
1750 return(acp->intaa);
1751
1752 return(0);
1753 }
1754
1755 /**********************************************************/
get_aa_from_trna(const objects::CTrna_ext & trna)1756 static int get_aa_from_trna(const objects::CTrna_ext& trna)
1757 {
1758 int ret = 0;
1759 if (trna.IsSetAa() && trna.GetAa().IsNcbieaa())
1760 ret = trna.GetAa().GetNcbieaa();
1761
1762 return ret;
1763 }
1764
1765 /**********************************************************/
fta_get_trna_from_product(objects::CSeq_feat & feat,const Char * product,unsigned char * remove)1766 static CRef<objects::CTrna_ext> fta_get_trna_from_product(objects::CSeq_feat& feat, const Char* product,
1767 unsigned char* remove)
1768 {
1769 const char **b;
1770
1771 char* p;
1772 char* q;
1773 char* start;
1774 char* end;
1775 char* first;
1776 char* second;
1777 char* third;
1778 char* fourth;
1779 bool fmet;
1780 char* prod;
1781
1782 if (remove != NULL)
1783 *remove = 0;
1784
1785 CRef<objects::CTrna_ext> ret(new objects::CTrna_ext);
1786
1787 if(product == NULL || StringLen(product) < 7)
1788 return ret;
1789
1790 bool digits = false;
1791 prod = StringSave(product);
1792 for(p = prod, q = prod; *p != '\0'; p++)
1793 {
1794 if(*p >= 'a' && *p <= 'z')
1795 *p &= ~040;
1796 else if((*p < 'A' || *p > 'Z') && *p != '(' && *p != ')')
1797 {
1798 if(*p >= '0' && *p <= '9')
1799 digits = true;
1800 *p = ' ';
1801 }
1802 }
1803 ShrinkSpaces(prod);
1804
1805 for(b = trna_tags; *b != NULL; b++)
1806 {
1807 start = StringStr(prod, *b);
1808 if(start != NULL)
1809 break;
1810 }
1811 if(*b == NULL)
1812 {
1813 MemFree(prod);
1814 return ret;
1815 }
1816
1817 end = start + StringLen(*b);
1818 for(p = end; *p != '\0'; p++)
1819 if(*p == '(' || *p == ')')
1820 *p = ' ';
1821 ShrinkSpaces(prod);
1822
1823 if(start == prod && *end == '\0')
1824 {
1825 if(remove != NULL && !digits)
1826 *remove = 1;
1827 MemFree(prod);
1828 return ret;
1829 }
1830
1831 first = NULL;
1832 second = NULL;
1833 third = NULL;
1834 fourth = NULL;
1835 for(p = end; *p == ' ' || *p == ')' || *p == '(';)
1836 p++;
1837 q = p;
1838 if(StringNCmp(p, "F MET", 5) == 0)
1839 p += 5;
1840 else if(StringNCmp(p, "F MT", 4) == 0)
1841 p += 4;
1842 while(*p >= 'A' && *p <= 'Z')
1843 p++;
1844 if(p > q)
1845 {
1846 if(*p != '\0')
1847 *p++ = '\0';
1848 second = q;
1849 }
1850 while(*p == ' ' || *p == ')' || *p == '(')
1851 p++;
1852 for(q = p; *p >= 'A' && *p <= 'Z';)
1853 p++;
1854 if(p > q)
1855 {
1856 if(*p != '\0')
1857 *p++ = '\0';
1858 if(q[1] == '\0')
1859 {
1860 while(*p == ' ' || *p == ')' || *p == '(')
1861 p++;
1862 for(q = p; *p >= 'A' && *p <= 'Z';)
1863 p++;
1864 if(p > q)
1865 {
1866 if(*p != '\0')
1867 *p++ = '\0';
1868 third = q;
1869 }
1870 }
1871 else
1872 third = q;
1873
1874 while(*p == ' ' || *p == '(' || *p == ')')
1875 p++;
1876 if(*p != '\0')
1877 fourth = p;
1878 }
1879 if(start > prod)
1880 {
1881 for(p = start - 1; *p == ' ' || *p == ')' || *p == '('; p--)
1882 if(p == prod)
1883 break;
1884
1885 if(p > prod && p[1] == ')')
1886 {
1887 for(p--; *p != '('; p--)
1888 if(p == prod)
1889 break;
1890 if(p > prod)
1891 {
1892 for(p--; *p == ' ' || *p == '(' || *p == '('; p--)
1893 if(p == prod)
1894 break;
1895 }
1896 }
1897 if(p > prod)
1898 {
1899 for(q = p++; *q >= 'A' && *q <= 'Z'; q--)
1900 if(q == prod)
1901 break;
1902 if(*q < 'A' || *q > 'Z')
1903 q++;
1904 if(p > q)
1905 {
1906 *p = '\0';
1907 first = q;
1908 }
1909 }
1910 }
1911
1912 fmet = false;
1913 if(second != NULL)
1914 {
1915 if(StringCmp(second, "F MET") == 0 ||
1916 StringCmp(second, "FMET") == 0 ||
1917 StringCmp(second, "F MT") == 0)
1918 {
1919 StringCpy(second, "FMET");
1920 fmet = true;
1921 }
1922
1923 ret->SetAa().SetNcbieaa(fta_get_aa_from_string(second));
1924 if (get_aa_from_trna(*ret) != 0)
1925 second = NULL;
1926 }
1927
1928 if (get_aa_from_trna(*ret) == 0 && first != NULL)
1929 {
1930 ret->SetAa().SetNcbieaa(fta_get_aa_from_string(first));
1931 if (get_aa_from_trna(*ret) != 0 && first == prod)
1932 first = NULL;
1933 }
1934
1935 if(first == NULL && second == NULL && third == NULL && fourth == NULL &&
1936 remove != NULL && !digits)
1937 *remove = 1;
1938 MemFree(prod);
1939
1940 if (!fmet)
1941 return ret;
1942
1943 if (!feat.IsSetComment())
1944 feat.SetComment("fMet");
1945 else if (StringIStr(feat.GetComment().c_str(), "fmet") == NULL)
1946 {
1947 std::string& comment = feat.SetComment();
1948 comment += "; fMet";
1949 }
1950
1951 return ret;
1952 }
1953
1954 /**********************************************************/
fta_get_trna_from_comment(const Char * comment,unsigned char * remove)1955 static CRef<objects::CTrna_ext> fta_get_trna_from_comment(const Char* comment, unsigned char* remove)
1956 {
1957 char* comm;
1958 char* p;
1959 char* q;
1960
1961 CRef<objects::CTrna_ext> ret(new objects::CTrna_ext);
1962
1963 *remove = 0;
1964 if(comment == NULL)
1965 return ret;
1966
1967 comm = StringSave(comment);
1968 for(p = comm, q = comm; *p != '\0'; p++)
1969 {
1970 if(*p >= 'a' && *p <= 'z')
1971 *p &= ~040;
1972 else if(*p < 'A' || *p > 'Z')
1973 *p = ' ';
1974 }
1975 ShrinkSpaces(comm);
1976
1977 if(StringNCmp(comm, "CODON RECOGNIZED ", 17) == 0)
1978 {
1979 p = comm + 17;
1980 q = StringChr(p, ' ');
1981 if(q != NULL && StringCmp(q + 1, "PUTATIVE") == 0)
1982 *q = '\0';
1983 if(StringChr(p, ' ') == NULL && StringLen(p) == 3)
1984 {
1985 MemFree(comm);
1986 *remove = (q == NULL) ? 1 : 2;
1987 return ret;
1988 }
1989 }
1990
1991 if(StringNCmp(comm, "PUTATIVE ", 9) == 0 && comm[10] == ' ' &&
1992 comm[14] == ' ' && StringNCmp(&comm[15], "TRNA", 4) == 0)
1993 {
1994 ret->SetAa().SetNcbieaa(fta_get_aa_from_symbol(comm[9]));
1995 if (get_aa_from_trna(*ret) != 0)
1996 {
1997 MemFree(comm);
1998 return ret;
1999 }
2000 }
2001
2002 for(q = comm, p = q; p != NULL;)
2003 {
2004 p = StringChr(p, ' ');
2005 if(p != NULL)
2006 *p++ = '\0';
2007
2008 ret->SetAa().SetNcbieaa(fta_get_aa_from_string(q));
2009 if (get_aa_from_trna(*ret) != 0)
2010 break;
2011 q = p;
2012 }
2013
2014 MemFree(comm);
2015 return ret;
2016 }
2017
2018 /**********************************************************/
get_first_codon_from_trna(const objects::CTrna_ext & trna)2019 static int get_first_codon_from_trna(const objects::CTrna_ext& trna)
2020 {
2021 int ret = 255;
2022 if (trna.IsSetCodon() && !trna.GetCodon().empty())
2023 ret = *trna.GetCodon().begin();
2024
2025 return ret;
2026 }
2027
2028 /**********************************************************/
GetRnaRef(objects::CSeq_feat & feat,objects::CBioseq & bioseq,Parser::ESource source,bool accver)2029 static void GetRnaRef(objects::CSeq_feat& feat, objects::CBioseq& bioseq,
2030 Parser::ESource source, bool accver)
2031 {
2032 char* qval;
2033 char* p;
2034
2035 Uint1 remove;
2036
2037 Int2 type;
2038
2039 if (!feat.GetData().IsImp())
2040 return;
2041
2042 const objects::CImp_feat& imp_feat = feat.GetData().GetImp();
2043
2044 CRef<objects::CRNA_ref> rna_ref(new objects::CRNA_ref);
2045
2046 type = MatchArrayString(ParFlat_RNA_array, imp_feat.GetKey().c_str());
2047 if (type < 0)
2048 type = 255;
2049 else
2050 ++type;
2051
2052 rna_ref->SetType(static_cast<objects::CRNA_ref::EType>(type));
2053
2054 feat.SetData().SetRna(*rna_ref);
2055
2056 if (type == objects::CRNA_ref::eType_rRNA)
2057 {
2058 fta_parse_rrna_feat(feat, *rna_ref);
2059 return;
2060 }
2061
2062 CRef<objects::CRNA_gen> rna_gen;
2063 CRef<objects::CRNA_qual_set> rna_quals;
2064
2065 if (type == objects::CRNA_ref::eType_ncRNA)
2066 {
2067 p = GetTheQualValue(feat.SetQual(), "ncRNA_class");
2068 if(p != NULL)
2069 {
2070 rna_gen.Reset(new objects::CRNA_gen);
2071 rna_gen->SetClass(p);
2072 }
2073 }
2074 else if (type == objects::CRNA_ref::eType_tmRNA)
2075 {
2076 p = GetTheQualValue(feat.SetQual(), "tag_peptide");
2077 if (p != NULL)
2078 {
2079 CRef<objects::CRNA_qual> rna_qual(new objects::CRNA_qual);
2080 rna_qual->SetQual("tag_peptide");
2081 rna_qual->SetVal(p);
2082
2083 rna_quals.Reset(new objects::CRNA_qual_set);
2084 rna_quals->Set().push_back(rna_qual);
2085
2086 rna_gen.Reset(new objects::CRNA_gen);
2087 rna_gen->SetQuals(*rna_quals);
2088 }
2089 }
2090
2091 if (type != objects::CRNA_ref::eType_premsg && type != objects::CRNA_ref::eType_tRNA) /* mRNA, snRNA, scRNA or other */
2092 {
2093 qval = GetTheQualValue(feat.SetQual(), "product");
2094 if(qval != NULL)
2095 {
2096 p = GetTheQualValue(feat.SetQual(), "product");
2097 if(p != NULL && p[0] != 0)
2098 {
2099 if (!feat.IsSetComment())
2100 feat.SetComment(p);
2101 else
2102 {
2103 std::string& comment = feat.SetComment();
2104 comment += "; ";
2105 comment += p;
2106 }
2107 }
2108 }
2109
2110 if (qval == NULL && type == objects::CRNA_ref::eType_mRNA &&
2111 source != Parser::ESource::EMBL && source != Parser::ESource::DDBJ)
2112 qval = GetTheQualValue(feat.SetQual(), "standard_name");
2113
2114 if (qval == NULL && feat.IsSetComment() && type == objects::CRNA_ref::eType_mRNA)
2115 {
2116 const Char* c_p = feat.GetComment().c_str();
2117 const Char* c_q = NULL;
2118 for ( ; ; c_p += 5, c_q = c_p)
2119 {
2120 c_p = StringIStr(c_p, " mRNA");
2121 if (c_p == NULL)
2122 break;
2123 }
2124
2125 const Char* c_r = NULL;
2126 for (c_p = feat.GetComment().c_str(); ; c_p += 4, c_r = c_p)
2127 {
2128 c_p = StringIStr(c_p, " RNA");
2129 if (c_p == NULL)
2130 break;
2131 }
2132
2133 if (c_q != NULL && c_r != NULL)
2134 {
2135 c_p = (c_q > c_r) ? c_q : c_r;
2136 }
2137 else if (c_q != NULL)
2138 c_p = c_q;
2139 else
2140 c_p = c_r;
2141
2142 if (c_p != NULL)
2143 {
2144 while (*c_p == ' ' || *c_p == '\t' || *c_p == ',' || *c_p == ';')
2145 ++c_p;
2146
2147 if (*c_p == '\0')
2148 {
2149 qval = StringSave(feat.GetComment().c_str());
2150 feat.ResetComment();
2151 }
2152 }
2153 }
2154
2155 if (qval != NULL)
2156 {
2157 if(StringLen(qval) > 511)
2158 {
2159 qval[510] = '>';
2160 qval[511] = '\0';
2161 p = StringSave(qval);
2162 MemFree(qval);
2163 qval = p;
2164 }
2165
2166 if (type > objects::CRNA_ref::eType_snoRNA && type <= objects::CRNA_ref::eType_miscRNA)
2167 {
2168 if (rna_gen.Empty())
2169 rna_gen.Reset(new objects::CRNA_gen);
2170
2171 rna_gen->SetProduct(qval);
2172 }
2173 else
2174 {
2175 rna_ref->SetExt().SetName(qval);
2176 }
2177 }
2178 }
2179
2180 if (feat.GetQual().empty())
2181 feat.ResetQual();
2182
2183 if (rna_gen.NotEmpty())
2184 {
2185 rna_ref->SetExt().SetGen(*rna_gen);
2186 }
2187
2188 if (type != objects::CRNA_ref::eType_tRNA) /* if tRNA and codon value exist */
2189 return;
2190
2191 qval = GetTheQualValue(feat.SetQual(), "anticodon");
2192 CRef<objects::CTrna_ext> trnaa;
2193 if (qval != NULL)
2194 {
2195 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_na);
2196
2197 CRef<objects::CSeq_loc> anticodon = GetTrnaAnticodon(feat, qval, bioseq.GetId(), accver);
2198 if (anticodon.NotEmpty())
2199 {
2200 trnaa.Reset(new objects::CTrna_ext);
2201
2202 /* value has format: (pos:base_range, aa:amino_acid)
2203 */
2204 trnaa->SetAa().SetNcbieaa(GetQualValueAa(qval, true));
2205 trnaa->SetAnticodon(*anticodon);
2206 rna_ref->SetExt().SetTRNA(*trnaa);
2207 }
2208
2209 MemFree(qval);
2210 }
2211
2212 qval = CpTheQualValue(feat.SetQual(), "product");
2213
2214 CRef<objects::CTrna_ext> trnap;
2215 if (qval != NULL)
2216 {
2217 trnap = fta_get_trna_from_product(feat, qval, NULL);
2218 MemFree(qval);
2219 }
2220
2221 if (feat.IsSetComment() && feat.GetComment().empty())
2222 {
2223 feat.ResetComment();
2224 }
2225
2226 remove = 0;
2227 CRef<objects::CTrna_ext> trnac;
2228 if (feat.IsSetComment())
2229 {
2230 trnac = fta_get_trna_from_product(feat, feat.GetComment().c_str(), &remove);
2231
2232 if (get_aa_from_trna(*trnac) == 0)
2233 {
2234 trnac = fta_get_trna_from_comment(feat.GetComment().c_str(), &remove);
2235 }
2236
2237 if (get_aa_from_trna(*trnac) == 0 && get_first_codon_from_trna(*trnac) == 255)
2238 {
2239 trnac.Reset();
2240 }
2241 }
2242
2243 if (trnaa.Empty())
2244 {
2245 if (trnap.Empty())
2246 {
2247 if (trnac.NotEmpty() && get_aa_from_trna(*trnac) != 0)
2248 {
2249 rna_ref->SetExt().SetTRNA(*trnac);
2250 if(remove != 0)
2251 {
2252 feat.ResetComment();
2253 }
2254 }
2255 }
2256 else
2257 {
2258 rna_ref->SetExt().SetTRNA(*trnap);
2259
2260 if (get_aa_from_trna(*trnap) == 0)
2261 {
2262 if (trnac.NotEmpty() && get_aa_from_trna(*trnac) != 0)
2263 rna_ref->SetExt().SetTRNA(*trnac);
2264 }
2265 else if (trnac.NotEmpty())
2266 {
2267 if (get_aa_from_trna(*trnac) == 0 && get_first_codon_from_trna(*trnac) != 255 &&
2268 get_first_codon_from_trna(*trnap) == 255 && remove != 0)
2269 {
2270 trnap->SetCodon().assign(trnac->GetCodon().begin(), trnac->GetCodon().end());
2271
2272 feat.ResetComment();
2273 if(remove == 2)
2274 feat.SetComment("putative");
2275 }
2276
2277 if (get_aa_from_trna(*trnac) == get_aa_from_trna(*trnap) && remove != 0)
2278 {
2279 feat.ResetComment();
2280 }
2281 }
2282 }
2283 }
2284 else
2285 {
2286 if(trnap.NotEmpty())
2287 {
2288 trnap.Reset();
2289 }
2290
2291 if (trnac.NotEmpty() && get_aa_from_trna(*trnac) != 0)
2292 {
2293 if (get_aa_from_trna(*trnac) == get_aa_from_trna(*trnaa) || get_aa_from_trna(*trnaa) == 88)
2294 {
2295 trnac->SetAnticodon(trnaa->SetAnticodon());
2296 trnaa->ResetAnticodon();
2297
2298 if (get_first_codon_from_trna(*trnac) == 255)
2299 {
2300 trnac->SetCodon().assign(trnaa->GetCodon().begin(), trnaa->GetCodon().end());
2301 }
2302
2303 rna_ref->SetExt().SetTRNA(*trnac);
2304 if(remove != 0)
2305 {
2306 feat.ResetComment();
2307 }
2308 }
2309 }
2310 }
2311
2312 if (feat.GetQual().empty())
2313 feat.ResetQual();
2314
2315 if (rna_ref->IsSetExt() && rna_ref->GetExt().IsTRNA())
2316 {
2317 const objects::CTrna_ext& trna = rna_ref->GetExt().GetTRNA();
2318 if (get_aa_from_trna(trna) == 0 && !trna.IsSetAnticodon())
2319 {
2320 rna_ref->ResetExt();
2321 }
2322 }
2323 }
2324
2325 /**********************************************************
2326 *
2327 * static void GetImpFeat(sfp, fbp, locmap):
2328 *
2329 * 'replace' in loc will be changed later
2330 * in SeqEntryToAsn3Ex.
2331 *
2332 * 01/07/97
2333 *
2334 **********************************************************/
GetImpFeat(objects::CSeq_feat & feat,FeatBlkPtr fbp,bool locmap)2335 static void GetImpFeat(objects::CSeq_feat& feat, FeatBlkPtr fbp, bool locmap)
2336 {
2337 CRef<objects::CImp_feat> imp_feat(new objects::CImp_feat);
2338 imp_feat->SetKey(fbp->key);
2339
2340 if (locmap)
2341 imp_feat->SetLoc(fbp->location);
2342
2343 feat.SetData().SetImp(*imp_feat);
2344 }
2345
2346 /**********************************************************/
fta_sort_biosource(objects::CBioSource & bio)2347 void fta_sort_biosource(objects::CBioSource& bio)
2348 {
2349 if(bio.CanGetOrg() && !bio.GetOrg().GetDb().empty())
2350 {
2351 NON_CONST_ITERATE(objects::COrg_ref::TDb, db, bio.SetOrg().SetDb())
2352 {
2353 if (!(*db)->CanGetDb())
2354 continue;
2355
2356 objects::COrg_ref::TDb::iterator tdb = db;
2357 for (++tdb; tdb != bio.SetOrg().SetDb().end(); ++tdb)
2358 {
2359 if (!(*tdb)->IsSetDb())
2360 continue;
2361
2362 if ((*db)->GetDb() < (*tdb)->GetDb())
2363 continue;
2364
2365 if ((*db)->GetDb() == (*tdb)->GetDb())
2366 {
2367 const objects::CObject_id& db_id = (*db)->GetTag();
2368 const objects::CObject_id& tdb_id = (*tdb)->GetTag();
2369
2370 if (!db_id.IsStr() && tdb_id.IsStr())
2371 continue;
2372
2373 if (db_id.IsStr() && tdb_id.IsStr() &&
2374 db_id.GetStr() <= tdb_id.GetStr())
2375 continue;
2376
2377 if (!db_id.IsStr() && !tdb_id.IsStr() &&
2378 db_id.GetId() <= tdb_id.GetId())
2379 continue;
2380 }
2381
2382 db->Swap(*tdb);
2383 }
2384 }
2385
2386 if (bio.GetOrg().IsSetOrgname() && bio.GetOrg().GetOrgname().IsSetMod())
2387 {
2388 NON_CONST_ITERATE(objects::COrgName::TMod, mod, bio.SetOrg().SetOrgname().SetMod())
2389 {
2390 objects::COrgName::TMod::iterator tmod = mod;
2391 for (++tmod; tmod != bio.SetOrg().SetOrgname().SetMod().end(); ++tmod)
2392 {
2393 if ((*mod)->GetSubtype() < (*tmod)->GetSubtype())
2394 continue;
2395
2396 if ((*mod)->GetSubtype() == (*tmod)->GetSubtype() &&
2397 (*mod)->GetSubname() <= (*tmod)->GetSubname())
2398 continue;
2399
2400 mod->Swap(*tmod);
2401 }
2402 }
2403 }
2404 }
2405
2406 if (!bio.IsSetSubtype())
2407 return;
2408
2409 NON_CONST_ITERATE(objects::CBioSource::TSubtype, sub, bio.SetSubtype())
2410 {
2411 objects::CBioSource::TSubtype::iterator tsub = sub;
2412 for (++tsub; tsub != bio.SetSubtype().end(); ++tsub)
2413 {
2414 if ((*sub)->GetSubtype() < (*tsub)->GetSubtype())
2415 continue;
2416
2417 if ((*sub)->GetSubtype() == (*tsub)->GetSubtype() &&
2418 (*sub)->GetName() <= (*tsub)->GetName())
2419 continue;
2420
2421 sub->Swap(*tsub);
2422 }
2423 }
2424 }
2425
2426 /**********************************************************/
ConvertQualifierValue(CRef<objects::CGb_qual> & qual)2427 static void ConvertQualifierValue(CRef<objects::CGb_qual>& qual)
2428 {
2429 std::string val = qual->GetVal();
2430 bool has_comma = val.find(',') != std::string::npos;
2431
2432 if (has_comma)
2433 {
2434 std::replace(val.begin(), val.end(), ',', ';');
2435 qual->SetVal(val);
2436 }
2437
2438 if (has_comma)
2439 ErrPostEx(SEV_WARNING, ERR_QUALIFIER_MultRptUnitComma,
2440 "Converting commas to semi-colons due to format conventions for multiple /rpt_unit qualifiers.");
2441 }
2442
2443 /**********************************************************/
fta_parse_rpt_units(FeatBlkPtr fbp)2444 static void fta_parse_rpt_units(FeatBlkPtr fbp)
2445 {
2446 char* p;
2447
2448 if(fbp == NULL || fbp->quals.empty())
2449 return;
2450
2451 TQualVector::iterator first = fbp->quals.end();
2452 size_t len = 0, count = 0;
2453
2454 for (TQualVector::iterator qual = fbp->quals.begin(); qual != fbp->quals.end();)
2455 {
2456 if ((*qual)->GetQual() != "rpt_unit")
2457 {
2458 ++qual;
2459 continue;
2460 }
2461
2462 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_ObsoleteRptUnit,
2463 "Obsolete /rpt_unit qualifier found on feature \"%s\" at location \"%s\".",
2464 (fbp->key == NULL) ? "Unknown" : fbp->key,
2465 (fbp->location == NULL) ? "unknown" : fbp->location);
2466
2467 if ((*qual)->GetVal().empty())
2468 {
2469 qual = fbp->quals.erase(qual);
2470 continue;
2471 }
2472
2473 count++;
2474 len += (*qual)->GetVal().size();
2475 if (first == fbp->quals.end())
2476 first = qual;
2477
2478 if (count == 1)
2479 {
2480 ++qual;
2481 continue;
2482 }
2483
2484 if(count == 2)
2485 ConvertQualifierValue(*first);
2486
2487 ConvertQualifierValue(*qual);
2488 ++qual;
2489 }
2490
2491 if(count == 0)
2492 return;
2493
2494 if(count == 1)
2495 {
2496 const std::string& val = (*first)->GetVal();
2497 if(*val.begin() == '(' && *val.rbegin() == ')')
2498 {
2499 ConvertQualifierValue(*first);
2500 }
2501 return;
2502 }
2503
2504 p = (char*) MemNew(len + count + 2);
2505 StringCpy(p, "(");
2506 StringCat(p, (*first)->GetVal().c_str());
2507
2508 for (TQualVector::iterator qual = first; qual != fbp->quals.end();)
2509 {
2510 if ((*qual)->GetQual() != "rpt_unit")
2511 {
2512 ++qual;
2513 continue;
2514 }
2515
2516 StringCat(p, ",");
2517 StringCat(p, (*qual)->GetVal().c_str());
2518 qual = fbp->quals.erase(qual);
2519 }
2520 StringCat(p, ")");
2521 (*first)->SetVal(p);
2522 }
2523
2524 /**********************************************************/
fta_check_evidence(objects::CSeq_feat & feat,FeatBlkPtr fbp)2525 static bool fta_check_evidence(objects::CSeq_feat& feat, FeatBlkPtr fbp)
2526 {
2527 Int4 evi_exp;
2528 Int4 evi_not;
2529 Int4 exp_good;
2530 Int4 exp_bad;
2531 Int4 inf_good;
2532 Int4 inf_bad;
2533 Char ch;
2534
2535 if (fbp == NULL || fbp->quals.empty())
2536 return true;
2537
2538 evi_exp = 0;
2539 evi_not = 0;
2540 exp_good = 0;
2541 exp_bad = 0;
2542 inf_good = 0;
2543 inf_bad = 0;
2544
2545 for (TQualVector::iterator qual = fbp->quals.begin(); qual != fbp->quals.end();)
2546 {
2547 const std::string& qual_str = (*qual)->IsSetQual() ? (*qual)->GetQual() : "";
2548 const std::string& val_str = (*qual)->IsSetVal() ? (*qual)->GetVal() : "";
2549 if (qual_str == "experiment")
2550 {
2551 if (val_str == "experimental evidence, no additional details recorded")
2552 {
2553 exp_good++;
2554 qual = fbp->quals.erase(qual);
2555 }
2556 else
2557 {
2558 exp_bad++;
2559 ++qual;
2560 }
2561 continue;
2562 }
2563
2564 if (qual_str == "inference")
2565 {
2566 if (val_str == "non-experimental evidence, no additional details recorded")
2567 {
2568 inf_good++;
2569 qual = fbp->quals.erase(qual);
2570 }
2571 else
2572 {
2573 inf_bad++;
2574 ++qual;
2575 }
2576 continue;
2577 }
2578
2579 if (qual_str != "evidence")
2580 {
2581 ++qual;
2582 continue;
2583 }
2584
2585 if (StringICmp(val_str.c_str(), "not_experimental") == 0)
2586 evi_not++;
2587 else if (StringICmp(val_str.c_str(), "experimental") == 0)
2588 evi_exp++;
2589 else
2590 {
2591 if(fbp->location != NULL && StringLen(fbp->location) > 50)
2592 {
2593 ch = fbp->location[50];
2594 fbp->location[50] = '\0';
2595 }
2596 else
2597 ch = '\0';
2598 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidEvidence,
2599 "Illegal value \"%s\" for /evidence qualifier on the \"%s\" feature at \"%s\". Qualifier dropped.",
2600 (val_str.empty()) ? "Unknown" : val_str.c_str(),
2601 (fbp->key == NULL) ? "Unknown" : fbp->key,
2602 (fbp->location == NULL) ? "unknown location" : fbp->location);
2603 if(ch != '\0')
2604 fbp->location[50] = ch;
2605 }
2606
2607 qual = fbp->quals.erase(qual);
2608 }
2609
2610 if(evi_exp + evi_not > 0 && exp_good + exp_bad + inf_good + inf_bad > 0)
2611 {
2612 if(fbp->location != NULL && StringLen(fbp->location) > 50)
2613 {
2614 ch = fbp->location[50];
2615 fbp->location[50] = '\0';
2616 }
2617 else
2618 ch = '\0';
2619 ErrPostEx(SEV_REJECT, ERR_QUALIFIER_Conflict,
2620 "Old /evidence and new /experiment or /inference qualifiers both exist on the \"%s\" feature at \"%s\". This is currently unsupported.",
2621 (fbp->key == NULL) ? "Unknown" : fbp->key,
2622 (fbp->location == NULL) ? "unknown location" : fbp->location);
2623 if(ch != '\0')
2624 fbp->location[50] = ch;
2625 return false;
2626 }
2627
2628 if(evi_exp + exp_good > 0 && evi_not + inf_good > 0)
2629 {
2630 if(fbp->location != NULL && StringLen(fbp->location) > 50)
2631 {
2632 ch = fbp->location[50];
2633 fbp->location[50] = '\0';
2634 }
2635 else
2636 ch = '\0';
2637 ErrPostEx(SEV_REJECT, ERR_QUALIFIER_Conflict,
2638 "The special \"no additional details recorded\" values for both /experiment and /inference exist on the \"%s\" feature at \"%s\". This is currently unsupported.",
2639 (fbp->key == NULL) ? "Unknown" : fbp->key,
2640 (fbp->location == NULL) ? "unknown location" : fbp->location);
2641 if(ch != '\0')
2642 fbp->location[50] = ch;
2643 return false;
2644 }
2645
2646 if((exp_good > 0 && exp_bad > 0) || (inf_good > 0 && inf_bad > 0))
2647 {
2648 if(fbp->location != NULL && StringLen(fbp->location) > 50)
2649 {
2650 ch = fbp->location[50];
2651 fbp->location[50] = '\0';
2652 }
2653 else
2654 ch = '\0';
2655 ErrPostEx(SEV_REJECT, ERR_QUALIFIER_Conflict,
2656 "The special \"no additional details recorded\" value for /experiment or /inference exists in conjunction with other /experiment or /inference qualifiers on the \"%s\" feature at \"%s\". This is currently unsupported.",
2657 (fbp->key == NULL) ? "Unknown" : fbp->key,
2658 (fbp->location == NULL) ? "unknown location" : fbp->location);
2659 if(ch != '\0')
2660 fbp->location[50] = ch;
2661 return false;
2662 }
2663
2664 if(exp_good + evi_exp > 0)
2665 feat.SetExp_ev(objects::CSeq_feat::eExp_ev_experimental);
2666 else if (inf_good + evi_not > 0)
2667 feat.SetExp_ev(objects::CSeq_feat::eExp_ev_not_experimental);
2668 return true;
2669 }
2670
2671 /**********************************************************
2672 *
2673 * static CRef<objects::CSeq_feat> ProcFeatBlk(pp, fbp, seqids):
2674 *
2675 * Process each feature sub-block.
2676 * location, SeqLocPtr by calling Karl's routine,
2677 * Nml_gbparseint which return locmap = TRUE if mapping
2678 * location rules not work, then SeqLocPtr->whole = seqids[0].
2679 * sitesmap = TRUE if found "(sites" string, num_errs > 0
2680 * if any errors occurred.
2681 * If there is a illegal location, then assign
2682 * qualifier to be a Imp-feat.
2683 *
2684 **********************************************************/
ProcFeatBlk(ParserPtr pp,FeatBlkPtr fbp,TSeqIdList & seqids)2685 static CRef<objects::CSeq_feat> ProcFeatBlk(ParserPtr pp, FeatBlkPtr fbp, TSeqIdList& seqids)
2686 {
2687 const char **b;
2688
2689 char* loc = NULL;
2690
2691 bool locmap = false;
2692 bool err = false;
2693
2694 CRef<objects::CSeq_feat> feat;
2695
2696 if (fbp->location != NULL)
2697 {
2698 loc = fbp->location;
2699 DelCharBtwData(loc);
2700 if(pp->buf != NULL)
2701 MemFree(pp->buf);
2702 pp->buf = (char*) MemNew(StringLen(fbp->key) + StringLen(loc) + 4);
2703 StringCpy(pp->buf, fbp->key);
2704 StringCat(pp->buf, " : ");
2705 StringCat(pp->buf, loc);
2706
2707 feat.Reset(new objects::CSeq_feat);
2708 locmap = GetSeqLocation(*feat, loc, seqids, &err, pp, fbp->key);
2709
2710 if(pp->buf != NULL)
2711 MemFree(pp->buf);
2712 pp->buf = NULL;
2713 }
2714 if(err)
2715 {
2716 if(pp->debug == false)
2717 {
2718 ErrPostEx(SEV_ERROR, ERR_FEATURE_Dropped,
2719 "%s|%s| range check detects problems", fbp->key, loc);
2720 feat.Reset();
2721 return feat;
2722 }
2723 ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck,
2724 "%s|%s| range check detects problems", fbp->key, loc);
2725 }
2726
2727 if (!fbp->quals.empty()) {
2728 if (DeleteQual(fbp->quals, "partial"))
2729 feat->SetPartial(true);
2730 }
2731
2732 if (StringStr(loc, "order") != NULL)
2733 feat->SetPartial(true);
2734
2735 if (!fbp->quals.empty())
2736 {
2737 if (DeleteQual(fbp->quals, "pseudo"))
2738 feat->SetPseudo(true);
2739 }
2740
2741 if (!fbp->quals.empty())
2742 DeleteQual(fbp->quals, "gsdb_id");
2743
2744 if (!fbp->quals.empty())
2745 fta_parse_rpt_units(fbp);
2746
2747 if (!fbp->quals.empty())
2748 {
2749 for(b = TransSplicingFeats; *b != NULL; b++)
2750 if(StringCmp(fbp->key, *b) == 0)
2751 break;
2752 if (*b != NULL && DeleteQual(fbp->quals, "trans_splicing"))
2753 {
2754 feat->SetExcept(true);
2755 if (!feat->IsSetExcept_text())
2756 feat->SetExcept_text("trans-splicing");
2757 else
2758 {
2759 std::string& exc_text = feat->SetExcept_text();
2760 exc_text += ", trans-splicing";
2761 }
2762 }
2763 }
2764
2765 if(!fta_check_evidence(*feat, fbp))
2766 {
2767 pp->entrylist[pp->curindx]->drop = 1;
2768 return feat;
2769 }
2770
2771 if ((!feat->IsSetPartial() || !feat->GetPartial()) && StringCmp(fbp->key, "gap") != 0) {
2772 if (SeqLocHaveFuzz(feat->GetLocation()))
2773 feat->SetPartial(true);
2774 }
2775
2776 if (!fbp->quals.empty())
2777 {
2778 const Char* comment = GetTheQualValue(fbp->quals, "note");
2779
2780 if (comment && comment[0])
2781 feat->SetComment(comment);
2782 }
2783
2784 /* assume all imp for now
2785 */
2786 if (StringStr(fbp->key, "source") == NULL)
2787 GetImpFeat(*feat, fbp, locmap);
2788
2789 ITERATE(TQualVector, cur, fbp->quals)
2790 {
2791 const std::string& qual_str = (*cur)->GetQual();
2792 if (qual_str == "pseudogene")
2793 feat->SetPseudo(true);
2794
2795 // Do nothing for 'translation' qualifier in case of its value is empty
2796 if (qual_str == "translation" && (!(*cur)->IsSetVal() || (*cur)->GetVal().empty()))
2797 continue;
2798
2799 if (!qual_str.empty())
2800 feat->SetQual().push_back(*cur);
2801 }
2802
2803 return feat;
2804 }
2805
2806 /**********************************************************/
fta_get_gcode_from_biosource(const objects::CBioSource & bio_src,IndexblkPtr ibp)2807 static void fta_get_gcode_from_biosource(const objects::CBioSource& bio_src, IndexblkPtr ibp)
2808 {
2809 if (!bio_src.IsSetOrg() || !bio_src.GetOrg().IsSetOrgname())
2810 return;
2811
2812 ibp->gc_genomic = bio_src.GetOrg().GetOrgname().IsSetGcode() ? bio_src.GetOrg().GetOrgname().GetGcode() : 0;
2813 ibp->gc_mito = bio_src.GetOrg().GetOrgname().IsSetMgcode() ? bio_src.GetOrg().GetOrgname().GetMgcode() : 0;
2814 }
2815
2816 /**********************************************************/
fta_sort_quals(FeatBlkPtr fbp,bool qamode)2817 static void fta_sort_quals(FeatBlkPtr fbp, bool qamode)
2818 {
2819 if(fbp == NULL)
2820 return;
2821
2822 NON_CONST_ITERATE(TQualVector, q, fbp->quals)
2823 {
2824 if((*q)->GetQual() == "gene" ||
2825 (!qamode && (*q)->GetQual() == "product"))
2826 continue;
2827
2828 TQualVector::iterator tq = q;
2829 for (++tq; tq != fbp->quals.end(); ++tq)
2830 {
2831 const std::string& q_qual = (*q)->GetQual();
2832 const std::string& tq_qual = (*tq)->GetQual();
2833
2834 if (!tq_qual.empty())
2835 {
2836 if (q_qual == "gene")
2837 continue;
2838
2839 Int4 i = StringICmp(q_qual.c_str(), tq_qual.c_str());
2840 if(i < 0)
2841 continue;
2842 if(i == 0)
2843 {
2844 /* Do not sort /gene qualifiers
2845 */
2846 const std::string q_val = (*q)->GetVal();
2847 const std::string tq_val = (*tq)->GetVal();
2848
2849 if (q_val.empty())
2850 continue;
2851
2852 if(!tq_val.empty())
2853 {
2854 if(q_val[0] >= '0' && q_val[0] <= '9' &&
2855 tq_val[0] >= '0' && tq_val[0] <= '9')
2856 {
2857 if(atoi(q_val.c_str()) <= atoi(tq_val.c_str()))
2858 continue;
2859 }
2860 else if(q_val <= tq_val)
2861 continue;
2862 }
2863 }
2864 }
2865
2866 q->Swap(*tq);
2867 }
2868 }
2869 }
2870
2871 /**********************************************************/
fta_qual_a_in_b(const TQualVector & qual1,const TQualVector & qual2)2872 static bool fta_qual_a_in_b(const TQualVector& qual1, const TQualVector& qual2)
2873 {
2874 bool found = false;
2875
2876 ITERATE(TQualVector, gbqp1, qual1)
2877 {
2878 found = false;
2879 ITERATE(TQualVector, gbqp2, qual2)
2880 {
2881 const Char* qual_a = (*gbqp1)->IsSetQual() ? (*gbqp1)->GetQual().c_str() : NULL;
2882 const Char* qual_b = (*gbqp2)->IsSetQual() ? (*gbqp2)->GetQual().c_str() : NULL;
2883
2884 const Char* val_a = (*gbqp1)->IsSetVal() ? (*gbqp1)->GetVal().c_str() : NULL;
2885 const Char* val_b = (*gbqp2)->IsSetVal() ? (*gbqp2)->GetVal().c_str() : NULL;
2886
2887 if (fta_strings_same(qual_a, qual_b) && fta_strings_same(val_a, val_b))
2888 {
2889 found = true;
2890 break;
2891 }
2892 }
2893 if (!found)
2894 break;
2895 }
2896
2897 if (!found)
2898 return false;
2899
2900 return true;
2901 }
2902
2903 /**********************************************************/
fta_feats_same(FeatBlkPtr fbp1,FeatBlkPtr fbp2)2904 static bool fta_feats_same(FeatBlkPtr fbp1, FeatBlkPtr fbp2)
2905 {
2906 if(fbp1 == NULL && fbp2 == NULL)
2907 return true;
2908 if(fbp1 == NULL || fbp2 == NULL ||
2909 fta_strings_same(fbp1->key, fbp2->key) == false ||
2910 fta_strings_same(fbp1->location, fbp2->location) == false)
2911 return false;
2912
2913 if (fta_qual_a_in_b(fbp1->quals, fbp2->quals) && fta_qual_a_in_b(fbp2->quals, fbp1->quals))
2914 return true;
2915
2916 return false;
2917 }
2918
/**********************************************************
 *
 *   static bool fta_check_rpt_unit_span(val, length):
 *
 *      Returns true if "val" has the exact form
 *   "<from>..<to>" (decimal digits only, nothing
 *   before or after) and 1 <= from <= to <= length.
 *   NULL, empty or malformed values give false.
 *
 *      The numbers are accumulated with an explicit
 *   overflow check instead of atoi(), whose behavior
 *   is undefined for values that do not fit in an int:
 *   an over-long digit run is now rejected instead of
 *   possibly wrapping around into the valid range.
 *   The comparison with "length" is done in size_t, so
 *   it is not narrowed either.
 *
 **********************************************************/
static bool fta_check_rpt_unit_span(const char* val, size_t length)
{
    const size_t size_max = (size_t) -1;
    size_t       bound[2] = { 0, 0 };       /* [0] = from, [1] = to */
    const char*  p        = val;
    int          which;

    if (val == NULL || *val == '\0')
        return false;

    for (which = 0; which < 2; which++)
    {
        const char* start = p;

        for (; *p >= '0' && *p <= '9'; p++)
        {
            const size_t digit = (size_t) (*p - '0');

            /* bound * 10 + digit would not fit: cannot be <= length */
            if (bound[which] > (size_max - digit) / 10)
                return false;
            bound[which] = bound[which] * 10 + digit;
        }

        /* at least one digit is required for each number */
        if (p == start)
            return false;

        if (which == 0)
        {
            if (p[0] != '.' || p[1] != '.')
                return false;
            p += 2;
        }
    }

    /* nothing may follow the second number */
    if (*p != '\0')
        return false;

    return bound[0] != 0 && bound[0] <= bound[1] && bound[1] <= length;
}
2947
2948 /**********************************************************/
fta_check_rpt_unit_range(FeatBlkPtr fbp,size_t length)2949 static void fta_check_rpt_unit_range(FeatBlkPtr fbp, size_t length)
2950 {
2951 Char ch;
2952
2953 if (fbp == NULL || fbp->quals.empty())
2954 return;
2955
2956 for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end();)
2957 {
2958 if (!(*cur)->IsSetQual() || !(*cur)->IsSetVal())
2959 {
2960 ++cur;
2961 continue;
2962 }
2963
2964 const std::string& qual_str = (*cur)->GetQual();
2965 const std::string& val_str = (*cur)->GetVal();
2966
2967 if (qual_str != "rpt_unit_range" || fta_check_rpt_unit_span(val_str.c_str(), length))
2968 {
2969 ++cur;
2970 continue;
2971 }
2972
2973 if(fbp->location != NULL && StringLen(fbp->location) > 20)
2974 {
2975 ch = fbp->location[20];
2976 fbp->location[20] = '\0';
2977 }
2978 else
2979 ch = '\0';
2980 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidRptUnitRange,
2981 "/rpt_unit_range qualifier \"%s\" on feature \"%s\" at location \"%s%s\" is not a valid basepair range. Qualifier dropped.",
2982 val_str.empty() ? "(EMPTY)" : val_str.c_str(),
2983 (fbp->key == NULL) ? "Unknown" : fbp->key,
2984 (fbp->location == NULL) ? "unknown" : fbp->location,
2985 (ch == '\0') ? "" : "...");
2986 if(ch != '\0')
2987 fbp->location[20] = ch;
2988
2989 cur = fbp->quals.erase(cur);
2990 }
2991 }
2992
2993 /**********************************************************/
fta_remove_dup_feats(DataBlkPtr dbp)2994 static void fta_remove_dup_feats(DataBlkPtr dbp)
2995 {
2996 DataBlkPtr tdbp;
2997 DataBlkPtr tdbpprev;
2998 DataBlkPtr tdbpnext;
2999 FeatBlkPtr fbp1;
3000 FeatBlkPtr fbp2;
3001 Char ch;
3002
3003 if(dbp == NULL || dbp->next == NULL)
3004 return;
3005
3006 for(; dbp != NULL; dbp = dbp->next)
3007 {
3008 if(dbp->data == NULL)
3009 continue;
3010
3011 fbp1 = (FeatBlkPtr) dbp->data;
3012 tdbpprev = dbp;
3013 for(tdbp = dbp->next; tdbp != NULL; tdbp = tdbpnext)
3014 {
3015 tdbpnext = tdbp->next;
3016 if(tdbp->data == NULL)
3017 {
3018 tdbpprev->next = tdbpnext;
3019 MemFree(tdbp);
3020 continue;
3021 }
3022
3023 fbp2 = (FeatBlkPtr) tdbp->data;
3024
3025 if(fbp1->location != NULL && fbp2->location != NULL &&
3026 StringCmp(fbp1->location, fbp2->location) < 0)
3027 break;
3028
3029 if(!fta_feats_same(fbp1, fbp2))
3030 {
3031 tdbpprev = tdbp;
3032 continue;
3033 }
3034
3035 if(fbp2->location != NULL && StringLen(fbp2->location) > 20)
3036 {
3037 ch = fbp2->location[20];
3038 fbp2->location[20] = '\0';
3039 }
3040 else
3041 ch = '\0';
3042 ErrPostEx(SEV_WARNING, ERR_FEATURE_DuplicateRemoved,
3043 "Duplicated feature \"%s\" at location \"%s%s\" removed.",
3044 (fbp2->key == NULL) ? "???" : fbp2->key,
3045 (fbp2->location == NULL) ? "???" : fbp2->location,
3046 (ch == '\0') ? "" : "...");
3047
3048 FreeFeatBlkQual(fbp2);
3049 tdbpprev->next = tdbpnext;
3050 MemFree(tdbp);
3051 }
3052 }
3053 }
3054
3055 /**********************************************************/
3056 class PredIsGivenQual
3057 {
3058 public:
PredIsGivenQual(const std::string & qual)3059 PredIsGivenQual(const std::string& qual) : qual_(qual) {}
3060
operator ()(const CRef<objects::CGb_qual> & qual)3061 bool operator()(const CRef<objects::CGb_qual>& qual)
3062 {
3063 return qual->GetQual() == qual_;
3064 }
3065
3066 private:
3067 std::string qual_;
3068 };
3069
fta_check_multiple_locus_tag(DataBlkPtr dbp,unsigned char * drop)3070 static void fta_check_multiple_locus_tag(DataBlkPtr dbp, unsigned char* drop)
3071 {
3072 FeatBlkPtr fbp;
3073 Char ch;
3074
3075 for(; dbp != NULL; dbp = dbp->next)
3076 {
3077 fbp = (FeatBlkPtr) dbp->data;
3078 if(fbp == NULL)
3079 continue;
3080
3081 size_t i = std::count_if(fbp->quals.begin(), fbp->quals.end(), PredIsGivenQual("locus_tag"));
3082 if(i < 2)
3083 continue;
3084
3085 if(fbp->location != NULL && StringLen(fbp->location) > 50)
3086 {
3087 ch = fbp->location[50];
3088 fbp->location[50] = '\0';
3089 }
3090 else
3091 ch = '\0';
3092 ErrPostEx(SEV_REJECT, ERR_FEATURE_MultipleLocusTags,
3093 "Multiple /locus_tag values for \"%s\" feature at \"%s\".",
3094 (fbp->key == NULL) ? "Unknown" : fbp->key,
3095 (fbp->location == NULL) ? "unknown location" : fbp->location);
3096 if(ch != '\0')
3097 fbp->location[50] = ch;
3098 *drop = 1;
3099 break;
3100 }
3101 }
3102
3103 /**********************************************************/
fta_check_old_locus_tags(DataBlkPtr dbp,unsigned char * drop)3104 static void fta_check_old_locus_tags(DataBlkPtr dbp, unsigned char* drop)
3105 {
3106 Int4 i;
3107
3108 PredIsGivenQual isOldLocusTag("old_locus_tag"),
3109 isLocusTag("locus_tag");
3110
3111 for(; dbp != NULL; dbp = dbp->next)
3112 {
3113 FeatBlkPtr fbp = (FeatBlkPtr)dbp->data;
3114 if(fbp == NULL)
3115 continue;
3116 size_t olt = std::count_if(fbp->quals.begin(), fbp->quals.end(), isOldLocusTag);
3117 size_t lt = std::count_if(fbp->quals.begin(), fbp->quals.end(), isLocusTag);
3118
3119 if(olt == 0)
3120 continue;
3121
3122 if(lt == 0)
3123 {
3124 ErrPostEx(SEV_REJECT, ERR_FEATURE_OldLocusTagWithoutNew,
3125 "Feature \"%s\" at \"%s\" has an /old_locus_tag qualifier but lacks a /locus_tag qualifier. Entry dropped.",
3126 (fbp->key == NULL) ? "Unknown" : fbp->key,
3127 (fbp->location == NULL) ? "unknown location" : fbp->location);
3128 *drop = 1;
3129 }
3130 else
3131 {
3132 i = 0;
3133 ITERATE(TQualVector, gbqp1, fbp->quals)
3134 {
3135 if (!(*gbqp1)->IsSetQual() || !(*gbqp1)->IsSetVal() || !isLocusTag(*gbqp1))
3136 continue;
3137
3138 i++;
3139
3140 const std::string& gbqp1_val = (*gbqp1)->GetVal();
3141 if (gbqp1_val.empty())
3142 continue;
3143
3144 ITERATE(TQualVector, gbqp2, fbp->quals)
3145 {
3146 if (!(*gbqp2)->IsSetQual() || !(*gbqp2)->IsSetVal())
3147 continue;
3148
3149 const std::string& gbqp2_val = (*gbqp2)->GetVal();
3150
3151 if (!isOldLocusTag(*gbqp2) || !NStr::EqualNocase(gbqp1_val, gbqp2_val))
3152 continue;
3153
3154 ErrPostEx(SEV_REJECT, ERR_FEATURE_MatchingOldNewLocusTag,
3155 "Feature \"%s\" at \"%s\" has an /old_locus_tag qualifier with a value that is identical to that of a /locus_tag qualifier: \"%s\". Entry dropped.",
3156 (fbp->key == NULL) ? "Unknown" : fbp->key,
3157 (fbp->location == NULL) ? "unknown location" : fbp->location,
3158 gbqp1_val.c_str());
3159 *drop = 1;
3160 }
3161 }
3162 }
3163
3164 if(olt == 1)
3165 continue;
3166
3167 ITERATE(TQualVector, gbqp1, fbp->quals)
3168 {
3169 const std::string& gbqp1_val = (*gbqp1)->GetVal();
3170 if (isOldLocusTag(*gbqp1) || gbqp1_val.empty())
3171 continue;
3172
3173 TQualVector::const_iterator gbqp2 = gbqp1;
3174 for (++gbqp2; gbqp2 != fbp->quals.end(); ++gbqp2)
3175 {
3176 const std::string& gbqp2_val = (*gbqp2)->GetVal();
3177 if (isOldLocusTag(*gbqp2) || gbqp2_val.empty())
3178 continue;
3179
3180 if (StringICmp(gbqp1_val.c_str(), gbqp2_val.c_str()) == 0)
3181 {
3182 ErrPostEx(SEV_ERROR, ERR_FEATURE_RedundantOldLocusTag,
3183 "Feature \"%s\" at \"%s\" has redundant /old_locus_tag qualifiers. Dropping all but the first.",
3184 (fbp->key == NULL) ? "Unknown" : fbp->key,
3185 (fbp->location == NULL) ? "unknown location" : fbp->location);
3186 break;
3187 }
3188 }
3189
3190 if (gbqp2 != fbp->quals.end())
3191 break;
3192 }
3193 }
3194 }
3195
3196 /**********************************************************/
fta_check_pseudogene_qual(DataBlkPtr dbp)3197 static void fta_check_pseudogene_qual(DataBlkPtr dbp)
3198 {
3199 FeatBlkPtr fbp;
3200 bool got_pseudogene;
3201 bool got_pseudo;
3202
3203 for(; dbp != NULL; dbp = dbp->next)
3204 {
3205 fbp = (FeatBlkPtr) dbp->data;
3206 if(fbp == NULL)
3207 continue;
3208
3209 got_pseudo = false;
3210 got_pseudogene = false;
3211
3212 for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end(); )
3213 {
3214 const std::string& qual_str = (*cur)->GetQual();
3215 const std::string& val_str = (*cur)->IsSetVal() ? (*cur)->GetVal() : "";
3216
3217 if (qual_str != "pseudogene")
3218 {
3219 if(!got_pseudo && qual_str == "pseudo")
3220 got_pseudo = true;
3221 ++cur;
3222 continue;
3223 }
3224
3225 if(got_pseudogene)
3226 {
3227 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_MultiplePseudoGeneQuals,
3228 "Dropping a /pseudogene qualifier because multiple /pseudogene qualifiers are present : <%s> : Feature key <%s> : Feature location <%s>.",
3229 val_str.empty() ? "[empty]" : val_str.c_str(),
3230 fbp->key, fbp->location);
3231
3232 cur = fbp->quals.erase(cur);
3233 continue;
3234 }
3235
3236 got_pseudogene = true;
3237
3238 if (val_str.empty())
3239 {
3240 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidPseudoGeneValue,
3241 "Dropping a /pseudogene qualifier because its value is empty : Feature key <%s> : Feature location <%s>.",
3242 fbp->key, fbp->location);
3243
3244 cur = fbp->quals.erase(cur);
3245 continue;
3246 }
3247
3248 if(MatchArrayString(PseudoGeneValues, val_str.c_str()) >= 0)
3249 {
3250 ++cur;
3251 continue;
3252 }
3253
3254 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidPseudoGeneValue,
3255 "Dropping a /pseudogene qualifier because its value is invalid : <%s> : Feature key <%s> : Feature location <%s>.",
3256 val_str.c_str(), fbp->key, fbp->location);
3257
3258 cur = fbp->quals.erase(cur);
3259 }
3260
3261 if(!got_pseudogene || !got_pseudo)
3262 continue;
3263
3264 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_OldPseudoWithPseudoGene,
3265 "A legacy /pseudo qualifier and a /pseudogene qualifier are present on the same feature : Dropping /pseudo : Feature key <%s> : Feature location <%s>.",
3266 fbp->key, fbp->location);
3267 DeleteQual(fbp->quals, "pseudo");
3268 }
3269 }
3270
3271 /**********************************************************/
fta_check_compare_qual(DataBlkPtr dbp,bool is_tpa)3272 static void fta_check_compare_qual(DataBlkPtr dbp, bool is_tpa)
3273 {
3274 FeatBlkPtr fbp;
3275 char* p;
3276 char* q;
3277 bool badcom;
3278 Char ch;
3279 Int4 com_count;
3280 Int4 cit_count;
3281
3282 for(; dbp != NULL; dbp = dbp->next)
3283 {
3284 fbp = (FeatBlkPtr) dbp->data;
3285 if(fbp == NULL)
3286 continue;
3287
3288 com_count = 0;
3289 cit_count = 0;
3290
3291 for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end();)
3292 {
3293 const std::string& qual_str = (*cur)->GetQual();
3294 const std::string& val_str = (*cur)->IsSetVal() ? (*cur)->GetVal() : "";
3295
3296 if (qual_str == "compare")
3297 {
3298 badcom = true;
3299 if (!val_str.empty())
3300 {
3301 q = StringChr(val_str.c_str(), '.');
3302 if(q != NULL && q[1] != '\0')
3303 {
3304 for(p = q + 1; *p >= '0' && *p <= '9';)
3305 p++;
3306 if(*p == '\0')
3307 {
3308 *q = '\0';
3309 if (GetNucAccOwner(val_str.c_str(), is_tpa) > 0)
3310 badcom = false;
3311 *q = '.';
3312 }
3313 }
3314 }
3315
3316 if(badcom)
3317 {
3318 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_IllegalCompareQualifier,
3319 "/compare qualifier value is not a legal Accession.Version : feature \"%s\" at \"%s\" : value \"%s\" : qualifier has been dropped.",
3320 fbp->key, fbp->location,
3321 val_str.empty() ? "[empty]" : val_str.c_str());
3322
3323 cur = fbp->quals.erase(cur);
3324 continue;
3325 }
3326 com_count++;
3327 }
3328 else if (qual_str == "citation")
3329 cit_count++;
3330
3331 ++cur;
3332 }
3333
3334 if(com_count > 0 || cit_count > 0 ||
3335 (StringCmp(fbp->key, "old_sequence") != 0 &&
3336 StringCmp(fbp->key, "conflict") != 0))
3337 continue;
3338
3339 ch = '\0';
3340 if(StringLen(fbp->location) > 30)
3341 {
3342 ch = fbp->location[30];
3343 fbp->location[30] = '\0';
3344 }
3345 ErrPostEx(SEV_ERROR, ERR_FEATURE_RequiredQualifierMissing,
3346 "Feature \"%s\" at \"%s\" lacks required /citation and/or /compare qualifier : feature has been dropped.",
3347 fbp->key, fbp->location);
3348 if(ch != '\0')
3349 fbp->location[30] = ch;
3350 dbp->drop = 1;
3351 }
3352 }
3353
3354 /**********************************************************/
fta_check_non_tpa_tsa_tls_locations(DataBlkPtr dbp,IndexblkPtr ibp)3355 static void fta_check_non_tpa_tsa_tls_locations(DataBlkPtr dbp,
3356 IndexblkPtr ibp)
3357 {
3358 FeatBlkPtr fbp;
3359 char* location;
3360 char* p;
3361 char* q;
3362 char* r;
3363 Uint1 i;
3364
3365 location = NULL;
3366 for(; dbp != NULL; dbp = dbp->next)
3367 {
3368 fbp = (FeatBlkPtr) dbp->data;
3369 if(fbp == NULL || fbp->location == NULL)
3370 continue;
3371 location = StringSave(fbp->location);
3372 for(p = location, q = p; *p != '\0'; p++)
3373 if(*p != ' ' && *p != '\t' && *p != '\n')
3374 *q++ = *p;
3375 *q = '\0';
3376 if(q == location)
3377 {
3378 MemFree(location);
3379 location = NULL;
3380 continue;
3381 }
3382
3383 for(p = location + 1; *p != '\0'; p++)
3384 {
3385 if(*p != ':')
3386 continue;
3387 for(r = NULL, q = p - 1;; q--)
3388 {
3389 if(q == location)
3390 {
3391 if(*q != '_' && (*q < '0' || *q > '9') &&
3392 (*q < 'a' || *q > 'z') && (*q < 'A' || *q > 'Z'))
3393 q++;
3394 break;
3395 }
3396 if(*q == '.')
3397 {
3398 if(r == NULL)
3399 {
3400 r = q;
3401 continue;
3402 }
3403 q++;
3404 break;
3405 }
3406 if(*q != '_' && (*q < '0' || *q > '9') &&
3407 (*q < 'a' || *q > 'z') && (*q < 'A' || *q > 'Z'))
3408 {
3409 q++;
3410 break;
3411 }
3412 }
3413 if(q == p)
3414 continue;
3415 if(r != NULL)
3416 *r = '\0';
3417 else
3418 *p = '\0';
3419 i = GetNucAccOwner(q, ibp->is_tpa);
3420 if(r != NULL)
3421 *r = '.';
3422 else
3423 *p = ':';
3424
3425
3426 if (i == objects::CSeq_id::e_Genbank && (q[0] == 'e' || q[0] == 'E') &&
3427 (q[1] == 'z' || q[1] == 'Z') && ibp->is_tpa == false)
3428 continue;
3429 if (ibp->is_tpa && (i == objects::CSeq_id::e_Tpg || i == objects::CSeq_id::e_Tpd ||
3430 i == objects::CSeq_id::e_Tpe))
3431 continue;
3432 break;
3433 }
3434 if(*p != '\0')
3435 break;
3436 if(location != NULL)
3437 {
3438 MemFree(location);
3439 location = NULL;
3440 }
3441 }
3442 if(dbp == NULL)
3443 return;
3444
3445 ibp->drop = 1;
3446 if(location != NULL && StringLen(location) > 45)
3447 {
3448 location[40] = '\0';
3449 StringCat(location, "...");
3450 }
3451 if(ibp->is_tsa)
3452 ErrPostEx(SEV_REJECT, ERR_LOCATION_AccessionNotTSA,
3453 "Feature \"%s\" at \"%s\" on a TSA record cannot point to a non-TSA record.",
3454 fbp->key, (location == NULL) ? "empty_location" : location);
3455 else if(ibp->is_tls)
3456 ErrPostEx(SEV_REJECT, ERR_LOCATION_AccessionNotTLS,
3457 "Feature \"%s\" at \"%s\" on a TLS record cannot point to a non-TLS record.",
3458 fbp->key, (location == NULL) ? "empty_location" : location);
3459 else
3460 ErrPostEx(SEV_REJECT, ERR_LOCATION_AccessionNotTPA,
3461 "Feature \"%s\" at \"%s\" on a TPA record cannot point to a non-TPA record.",
3462 fbp->key, (location == NULL) ? "empty_location" : location);
3463 if(location != NULL)
3464 MemFree(location);
3465 }
3466
3467 /**********************************************************/
fta_perform_operon_checks(ParserPtr pp,TSeqFeatList & feats,IndexblkPtr ibp)3468 static bool fta_perform_operon_checks(ParserPtr pp, TSeqFeatList& feats, IndexblkPtr ibp)
3469 {
3470 FTAOperonPtr fophead;
3471 FTAOperonPtr fop;
3472 FTAOperonPtr tfop;
3473
3474 char* p;
3475 bool got;
3476 Int4 count;
3477
3478 if(pp == NULL)
3479 return true;
3480
3481 if (feats.empty())
3482 {
3483 if(ibp->segnum == ibp->segtotal && pp->operon != NULL)
3484 return(pp->operon->ret);
3485 return true;
3486 }
3487
3488 if(ibp->segnum < 2 && pp->operon != NULL)
3489 {
3490 fta_operon_free(pp->operon);
3491 pp->operon = NULL;
3492 }
3493
3494 if(pp->operon == NULL)
3495 {
3496 fop = new FTAOperon;
3497 fop->ret = true;
3498 pp->operon = fop;
3499 }
3500 else
3501 for(fop = pp->operon; fop->next != NULL;)
3502 fop = fop->next;
3503
3504 fophead = NULL;
3505 ITERATE(TSeqFeatList, feat, feats)
3506 {
3507 if (!(*feat)->GetData().IsImp())
3508 continue;
3509
3510 const objects::CImp_feat& imp_feat = (*feat)->GetData().GetImp();
3511
3512 count = 0;
3513 ITERATE(objects::CSeq_feat::TQual, qual, (*feat)->GetQual())
3514 {
3515 if (!(*qual)->IsSetQual() || (*qual)->GetQual() != "operon" ||
3516 !(*qual)->IsSetVal() || (*qual)->GetVal().empty())
3517 continue;
3518
3519 tfop = new FTAOperon;
3520 tfop->location = &(*feat)->GetLocation();
3521 tfop->operon = (*qual)->GetVal().c_str();
3522 tfop->featname = imp_feat.IsSetKey() ? imp_feat.GetKey().c_str() : "Unknown";
3523
3524 tfop->operon_feat = false;
3525 tfop->ret = true;
3526 tfop->strloc = NULL;
3527 tfop->next = NULL;
3528 if(StringCmp(tfop->featname, "operon") == 0)
3529 tfop->operon_feat = true;
3530
3531 if(fophead == NULL)
3532 fophead = tfop;
3533
3534 fop->next = tfop;
3535 fop = fop->next;
3536
3537 count++;
3538
3539 if(fop->operon_feat == false || fop == fophead)
3540 continue;
3541
3542 for(tfop = fophead; tfop->next != NULL; tfop = tfop->next)
3543 {
3544 if(tfop->operon_feat == false ||
3545 StringCmp(tfop->operon, fop->operon) != 0)
3546 continue;
3547
3548 if(tfop->strloc == NULL)
3549 tfop->strloc = location_to_string_or_unknown(*tfop->location);
3550
3551 if(fop->strloc == NULL)
3552 fop->strloc = location_to_string_or_unknown(*fop->location);
3553
3554 ErrPostEx(SEV_REJECT, ERR_FEATURE_OperonQualsNotUnique,
3555 "The operon features at \"%s\" and \"%s\" utilize the same /operon qualifier : \"%s\".",
3556 tfop->strloc, fop->strloc, fop->operon);
3557 pp->operon->ret = false;
3558 }
3559 }
3560
3561 if(count > 1)
3562 {
3563 if(fop->strloc == NULL)
3564 fop->strloc = location_to_string_or_unknown(*fop->location);
3565
3566 ErrPostEx(SEV_REJECT, ERR_FEATURE_MultipleOperonQuals,
3567 "Feature \"%s\" at \"%s\" has more than one operon qualifier.",
3568 fop->featname, fop->strloc);
3569 pp->operon->ret = false;
3570 }
3571
3572 if (count == 0 && imp_feat.IsSetKey() && imp_feat.GetKey() == "operon")
3573 {
3574 p = location_to_string_or_unknown((*feat)->GetLocation());
3575
3576 ErrPostEx(SEV_REJECT, ERR_FEATURE_MissingOperonQual,
3577 "The operon feature at \"%s\" lacks an /operon qualifier.",
3578 p);
3579
3580 MemFree(p);
3581 pp->operon->ret = false;
3582 }
3583 }
3584
3585 if(ibp->segnum != 0 && ibp->segnum != ibp->segtotal)
3586 return true;
3587
3588 if(pp->operon->next == NULL || pp->operon->next->next == NULL)
3589 return(pp->operon->ret);
3590
3591 for(fop = pp->operon->next; fop != NULL; fop = fop->next)
3592 {
3593 if(fop->operon_feat)
3594 continue;
3595
3596 got = false;
3597 for(tfop = pp->operon->next; tfop != NULL; tfop = tfop->next)
3598 {
3599 if(tfop->operon_feat == false ||
3600 StringCmp(fop->operon, tfop->operon) != 0)
3601 continue;
3602
3603 got = true;
3604 objects::sequence::ECompare cmp_res = objects::sequence::Compare(*fop->location, *tfop->location, nullptr, objects::sequence::fCompareOverlapping);
3605 if (cmp_res == objects::sequence::eContained || cmp_res == objects::sequence::eSame)
3606 continue;
3607
3608 if(fop->strloc == NULL)
3609 fop->strloc = location_to_string_or_unknown(*fop->location);
3610
3611 if(tfop->strloc == NULL)
3612 tfop->strloc = location_to_string_or_unknown(*tfop->location);
3613
3614 ErrPostEx(SEV_REJECT, ERR_FEATURE_OperonLocationMisMatch,
3615 "Feature \"%s\" at \"%s\" with /operon qualifier \"%s\" does not fall within the span of the operon feature at \"%s\".",
3616 fop->featname, fop->strloc, fop->operon, tfop->strloc);
3617 pp->operon->ret = false;
3618 }
3619
3620 if(!got)
3621 {
3622 if(fop->strloc == NULL)
3623 fop->strloc = location_to_string_or_unknown(*fop->location);
3624
3625 ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidOperonQual,
3626 "/operon qualifier \"%s\" on feature \"%s\" at \"%s\" has a value that does not match any of the /operon qualifiers on operon features.",
3627 fop->operon, fop->featname, fop->strloc);
3628 pp->operon->ret = false;
3629 }
3630 }
3631
3632 got = pp->operon->ret;
3633 fta_operon_free(pp->operon);
3634 pp->operon = NULL;
3635 return(got);
3636 }
3637
3638 /**********************************************************/
fta_remove_dup_quals(FeatBlkPtr fbp)3639 static void fta_remove_dup_quals(FeatBlkPtr fbp)
3640 {
3641 Char ch;
3642
3643 if(fbp == NULL || fbp->quals.empty())
3644 return;
3645
3646 NON_CONST_ITERATE(TQualVector, cur, fbp->quals)
3647 {
3648 const char* cur_qual = (*cur)->IsSetQual() ? (*cur)->GetQual().c_str() : NULL;
3649 const char* cur_val = (*cur)->IsSetVal() ? (*cur)->GetVal().c_str() : NULL;
3650
3651 TQualVector::iterator next = cur;
3652 for (++next; next != fbp->quals.end();)
3653 {
3654 const char* next_qual = (*next)->IsSetQual() ? (*next)->GetQual().c_str() : NULL;
3655 const char* next_val = (*next)->IsSetVal() ? (*next)->GetVal().c_str() : NULL;
3656
3657 if (!fta_strings_same(cur_qual, next_qual) || !fta_strings_same(cur_val, next_val))
3658 {
3659 ++next;
3660 continue;
3661 }
3662
3663 if(fbp->location != NULL && StringLen(fbp->location) > 20)
3664 {
3665 ch = fbp->location[20];
3666 fbp->location[20] = '\0';
3667 }
3668 else
3669 ch = '\0';
3670
3671 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_DuplicateRemoved,
3672 "Duplicated qualifier \"%s\" in feature \"%s\" at location \"%s%s\" removed.",
3673 (cur_qual == NULL) ? "???" : cur_qual,
3674 (fbp->key == NULL) ? "???" : fbp->key,
3675 (fbp->location == NULL) ? "???" : fbp->location,
3676 (ch == '\0') ? "" : "...");
3677
3678 if(ch != '\0')
3679 fbp->location[20] = ch;
3680
3681 next = fbp->quals.erase(next);
3682 }
3683 }
3684 }
3685
3686 /**********************************************************/
CollectGapFeats(DataBlkPtr entry,DataBlkPtr dbp,ParserPtr pp,Int2 type)3687 static void CollectGapFeats(DataBlkPtr entry, DataBlkPtr dbp,
3688 ParserPtr pp, Int2 type)
3689 {
3690 IndexblkPtr ibp;
3691 GapFeatsPtr gfp = NULL;
3692 GapFeatsPtr tgfp;
3693 DataBlkPtr tdbp;
3694 FeatBlkPtr fbp;
3695
3696 objects::CLinkage_evidence::TLinkage_evidence asn_linkage_evidence;
3697 std::list<std::string> linkage_evidence_names;
3698
3699 StrNumPtr snp;
3700 char* p;
3701 char* q;
3702 const char* gap_type;
3703 bool finished_gap;
3704 ErrSev sev;
3705 Int4 estimated_length;
3706 Int4 is_htg;
3707 Int4 from;
3708 Int4 to;
3709 Int4 prev_gap; /* 0 - initial, 1 - "gap",
3710 2 - "assembly_gap" */
3711 Int4 curr_gap; /* 0 - initial, 1 - "gap",
3712 2 - "assembly_gap" */
3713 Int4 asn_gap_type;
3714
3715 ibp = pp->entrylist[pp->curindx];
3716
3717 if(ibp->keywords.empty())
3718 {
3719 if(pp->format == Parser::EFormat::GenBank)
3720 GetSequenceOfKeywords(entry, ParFlat_KEYWORDS,
3721 ParFlat_COL_DATA, ibp->keywords);
3722 else if(pp->format == Parser::EFormat::EMBL)
3723 GetSequenceOfKeywords(entry, ParFlat_KW, ParFlat_COL_DATA_EMBL,
3724 ibp->keywords);
3725 else if(pp->format == Parser::EFormat::XML)
3726 XMLGetKeywords(entry->offset, ibp->xip, ibp->keywords);
3727 }
3728
3729 is_htg = -1;
3730 ITERATE(TKeywordList, key, ibp->keywords)
3731 {
3732 if(is_htg >= 0 && is_htg <= 2)
3733 break;
3734 if(*key == "HTG")
3735 is_htg = 3;
3736 else if(*key == "HTGS_PHASE0")
3737 is_htg = 0;
3738 else if(*key == "HTGS_PHASE1")
3739 is_htg = 1;
3740 else if(*key == "HTGS_PHASE2")
3741 is_htg = 2;
3742 else if(*key == "HTGS_PHASE3")
3743 is_htg = 3;
3744 }
3745
3746 prev_gap = 0;
3747 curr_gap = 0;
3748 finished_gap = false;
3749 for(ibp->gaps = NULL; dbp != NULL; dbp = dbp->next)
3750 {
3751 if(ibp->drop != 0)
3752 break;
3753 if(dbp->type != type)
3754 continue;
3755
3756 linkage_evidence_names.clear();
3757 asn_linkage_evidence.clear();
3758
3759 for(tdbp = (DataBlkPtr) dbp->data; tdbp != NULL; tdbp = tdbp->next)
3760 {
3761 if(ibp->drop != 0)
3762 break;
3763 fbp = (FeatBlkPtr) tdbp->data;
3764 if(fbp == NULL || fbp->key == NULL)
3765 continue;
3766 if(StringCmp(fbp->key, "gap") == 0)
3767 {
3768 prev_gap = curr_gap;
3769 curr_gap = 1;
3770 }
3771 else if(StringCmp(fbp->key, "assembly_gap") == 0)
3772 {
3773 prev_gap = curr_gap;
3774 curr_gap = 2;
3775 }
3776 else
3777 continue;
3778
3779 from = 0;
3780 to = 0;
3781 estimated_length = 0;
3782 gap_type = NULL;
3783 linkage_evidence_names.clear();
3784 asn_gap_type = -1;
3785 asn_linkage_evidence.clear();
3786 estimated_length = -1;
3787
3788 ITERATE(TQualVector, cur, fbp->quals)
3789 {
3790 if (!(*cur)->IsSetQual() || !(*cur)->IsSetVal())
3791 continue;
3792
3793 const std::string& cur_qual = (*cur)->GetQual();
3794 const std::string& cur_val = (*cur)->GetVal();
3795
3796 if (cur_qual.empty() || cur_val.empty())
3797 continue;
3798
3799 if (cur_qual == "estimated_length")
3800 {
3801 if (cur_val == "unknown")
3802 estimated_length = -100;
3803 else
3804 {
3805 const char* cp = cur_val.c_str();
3806 for (; *cp >= '0' && *cp <= '9';)
3807 ++cp;
3808 if(*cp == '\0')
3809 estimated_length = atoi(cur_val.c_str());
3810 }
3811 }
3812 else if (cur_qual == "gap_type")
3813 gap_type = cur_val.c_str();
3814 else if (cur_qual == "linkage_evidence")
3815 {
3816 linkage_evidence_names.push_back(cur_val);
3817 }
3818 }
3819
3820 if(fbp->location != NULL)
3821 {
3822 p = fbp->location;
3823 if(*p == '<')
3824 p++;
3825 for(q = p; *p >= '0' && *p <= '9';)
3826 p++;
3827 if(*p == '\0')
3828 {
3829 from = atoi(q);
3830 to = from;
3831 }
3832 else if(*p == '.')
3833 {
3834 *p = '\0';
3835 from = atoi(q);
3836 *p++ = '.';
3837 if(*fbp->location == '<' && from != 1)
3838 from = 0;
3839 else if(*p == '.')
3840 {
3841 if(*++p == '>')
3842 p++;
3843 for(q = p; *p >= '0' && *p <= '9';)
3844 p++;
3845 if(*p == '\0')
3846 to = atoi(q);
3847 if(*(q - 1) == '>' && to != (int) ibp->bases)
3848 to = 0;
3849 }
3850 }
3851 }
3852
3853 if(from == 0 || to == 0 || from > to)
3854 {
3855 if(curr_gap == 1)
3856 ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidGapLocation,
3857 "Invalid gap feature location : \"%s\" : all gap features must have a simple X..Y location on the plus strand.",
3858 (fbp->location == NULL) ? "unknown" : fbp->location);
3859 else
3860 ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidAssemblyGapLocation,
3861 "Invalid assembly_gap location : \"%s\".",
3862 (fbp->location == NULL) ? "unknown" : fbp->location);
3863 ibp->drop = 1;
3864 break;
3865 }
3866
3867 if(curr_gap == 2) /* "assembly_gap" feature */
3868 {
3869 if(gap_type != NULL && is_htg > -1 &&
3870 StringCmp(gap_type, "within scaffold") != 0 &&
3871 StringCmp(gap_type, "repeat within scaffold") != 0)
3872 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_UnexpectedGapTypeForHTG,
3873 "assembly_gap has /gap_type of \"%s\", but clone-based HTG records are only expected to have \"within scaffold\" or \"repeat within scaffold\" gaps. assembly_gap feature located at \"%d..%d\".",
3874 gap_type, from, to);
3875
3876 if(is_htg == 0 || is_htg == 1)
3877 {
3878 ITERATE(std::list<std::string>, evidence, linkage_evidence_names)
3879 {
3880 if (*evidence != LinkageEvidenceValues[objects::CLinkage_evidence_Base::eType_unspecified].str)
3881 {
3882 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_LinkageShouldBeUnspecified,
3883 "assembly gap has /linkage_evidence of \"%s\", but unoriented and unordered Phase0/Phase1 HTG records are expected to have \"unspecified\" evidence. assembly_gap feature located at \"%d..%d\".",
3884 evidence->c_str(), from, to);
3885 }
3886 }
3887 }
3888 else if(is_htg == 2 || is_htg == 3)
3889 {
3890 ITERATE(std::list<std::string>, evidence, linkage_evidence_names)
3891 {
3892 if (*evidence != LinkageEvidenceValues[objects::CLinkage_evidence_Base::eType_unspecified].str)
3893 continue;
3894
3895 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_LinkageShouldNotBeUnspecified,
3896 "assembly gap has /linkage_evidence of \"unspecified\", but ordered and oriented HTG records are expected to have some level of linkage for their gaps. assembly_gap feature located at \"%d..%d\".",
3897 from, to);
3898 }
3899 }
3900
3901 if(is_htg == 3 && !finished_gap)
3902 {
3903 ErrPostEx(SEV_ERROR, ERR_FEATURE_FinishedHTGHasAssemblyGap,
3904 "Finished Phase-3 HTG records are not expected to have any gaps. First assembly_gap feature encountered at \"%d..%d\".",
3905 from, to);
3906 finished_gap = true;
3907 }
3908
3909 if(gap_type == NULL)
3910 {
3911 ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MissingGapType,
3912 "assembly_gap feature at \"%d..%d\" lacks the required /gap_type qualifier.",
3913 from, to);
3914 ibp->drop = 1;
3915 break;
3916 }
3917
3918 for(snp = GapTypeValues; snp->str != NULL; snp++)
3919 if(StringCmp(snp->str, gap_type) == 0)
3920 break;
3921 if(snp->str == NULL)
3922 {
3923 ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidGapType,
3924 "assembly_gap feature at \"%d..%d\" has an invalid gap type : \"%s\".",
3925 from, to, gap_type);
3926 ibp->drop = 1;
3927 break;
3928 }
3929 asn_gap_type = snp->num;
3930
3931 if(linkage_evidence_names.empty() &&
3932 (StringCmp(gap_type, "within scaffold") == 0 ||
3933 StringCmp(gap_type, "repeat within scaffold") == 0))
3934 {
3935 ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MissingLinkageEvidence,
3936 "assembly_gap feature at \"%d..%d\" with gap type \"%s\" lacks a /linkage_evidence qualifier.",
3937 from, to, gap_type);
3938 ibp->drop = 1;
3939 break;
3940 }
3941 if (!linkage_evidence_names.empty())
3942 {
3943 if (StringCmp(gap_type, "unknown") != 0 &&
3944 StringCmp(gap_type, "within scaffold") != 0 &&
3945 StringCmp(gap_type, "repeat within scaffold") != 0)
3946 {
3947 ErrPostEx(SEV_REJECT,
3948 ERR_QUALIFIER_InvalidGapTypeForLinkageEvidence,
3949 "The /linkage_evidence qualifier is not legal for the assembly_gap feature at \"%d..%d\" with /gap_type \"%s\".",
3950 from, to, gap_type);
3951 ibp->drop = 1;
3952 break;
3953 }
3954
3955 ITERATE(std::list<std::string>, evidence, linkage_evidence_names)
3956 {
3957 for(snp = LinkageEvidenceValues; snp->str != NULL; snp++)
3958 if (*evidence == snp->str)
3959 break;
3960 if(snp->str == NULL)
3961 {
3962 ErrPostEx(SEV_REJECT,
3963 ERR_QUALIFIER_InvalidLinkageEvidence,
3964 "assembly_gap feature at \"%d..%d\" has an invalid linkage evidence : \"%s\".",
3965 from, to, evidence->c_str());
3966 ibp->drop = 1;
3967 break;
3968 }
3969
3970 CRef<objects::CLinkage_evidence> new_evidence(new objects::CLinkage_evidence);
3971 new_evidence->SetType(snp->num);
3972 asn_linkage_evidence.push_back(new_evidence);
3973 }
3974 }
3975 }
3976
3977 if(prev_gap + curr_gap == 3)
3978 {
3979 if(curr_gap == 1)
3980 ErrPostEx(SEV_REJECT, ERR_FEATURE_AssemblyGapAndLegacyGap,
3981 "Legacy gap feature at \"%d..%d\" co-exists with a new AGP 2.0 assembly_gap feature at \"%d..%d\".",
3982 from, to, gfp->from, gfp->to);
3983 else
3984 ErrPostEx(SEV_REJECT, ERR_FEATURE_AssemblyGapAndLegacyGap,
3985 "Legacy gap feature at \"%d..%d\" co-exists with a new AGP 2.0 assembly_gap feature at \"%d..%d\".",
3986 gfp->from, gfp->to, from, to);
3987 ibp->drop = 1;
3988 break;
3989 }
3990
3991 if(estimated_length == -1) /* missing qual */
3992 {
3993 ErrPostEx(SEV_REJECT, ERR_FEATURE_RequiredQualifierMissing,
3994 "The gap feature at \"%d..%d\" lacks the required /estimated_length qualifier.",
3995 from, to);
3996 ibp->drop = 1;
3997 }
3998 else if(estimated_length == 0)
3999 {
4000 ErrPostEx(SEV_REJECT, ERR_FEATURE_IllegalEstimatedLength,
4001 "Gap feature at \"%d..%d\" has an illegal /estimated_length qualifier : \"%s\" : should be \"unknown\" or an integer.",
4002 // from, to, gbqp->val); // at this point gbqp is definitely = NULL
4003 from, to, "");
4004 ibp->drop = 1;
4005 }
4006 else if(estimated_length == -100)
4007 {
4008 if(is_htg >= 0 && to - from != 99)
4009 {
4010 ErrPostEx(SEV_ERROR, ERR_FEATURE_UnknownGapNot100,
4011 "Gap feature at \"%d..%d\" has /estimated_length \"unknown\" but the gap size is not 100 bases.",
4012 from, to);
4013 }
4014 }
4015 else if(estimated_length != to - from + 1)
4016 {
4017 if(pp->source == Parser::ESource::EMBL || pp->source == Parser::ESource::DDBJ)
4018 sev = SEV_ERROR;
4019 else
4020 {
4021 sev = SEV_REJECT;
4022 ibp->drop = 1;
4023 }
4024
4025 ErrPostEx(sev, ERR_FEATURE_GapSizeEstLengthMissMatch,
4026 "Gap feature at \"%d..%d\" has a size that does not match the /estimated_length : %d.",
4027 from, to, estimated_length);
4028 }
4029
4030 for(gfp = ibp->gaps; gfp != NULL; gfp = gfp->next)
4031 {
4032 if((gfp->from >= from && gfp->from <= to) ||
4033 (gfp->to >= from && gfp->to <= to) ||
4034 (gfp->from <= from && gfp->to >= to))
4035 {
4036 ErrPostEx(SEV_REJECT, ERR_FEATURE_OverlappingGaps,
4037 "Gap features at \"%d..%d\" and \"%d..%d\" overlap.",
4038 from, to, gfp->from, gfp->to);
4039 ibp->drop = 1;
4040 }
4041 else if(to + 1 == gfp->from || from - 1 == gfp->to)
4042 {
4043 if(pp->source == Parser::ESource::EMBL)
4044 sev = SEV_ERROR;
4045 else
4046 {
4047 sev = SEV_REJECT;
4048 ibp->drop = 1;
4049 }
4050
4051 ErrPostEx(sev, ERR_FEATURE_ContiguousGaps,
4052 "Gap features at \"%d..%d\" and \"%d..%d\" are contiguous, and should probably be represented by a single gap that spans both.",
4053 from, to, gfp->from, gfp->to);
4054 }
4055 }
4056 if(ibp->drop != 0)
4057 break;
4058
4059 gfp = new GapFeats;
4060 gfp->from = from;
4061 gfp->to = to;
4062 gfp->estimated_length = estimated_length;
4063 if(curr_gap == 2) /* /assembly_gap feature */
4064 gfp->assembly_gap = true;
4065 if(gap_type != NULL)
4066 {
4067 gfp->gap_type = StringSave(gap_type);
4068 gfp->asn_gap_type = asn_gap_type;
4069 }
4070 if(!asn_linkage_evidence.empty())
4071 {
4072 gfp->asn_linkage_evidence.swap(asn_linkage_evidence);
4073 asn_linkage_evidence.clear();
4074 }
4075 gfp->next = NULL;
4076
4077 if(ibp->gaps == NULL)
4078 {
4079 ibp->gaps = gfp;
4080 continue;
4081 }
4082
4083 if(ibp->gaps->from > from)
4084 {
4085 gfp->next = ibp->gaps;
4086 ibp->gaps = gfp;
4087 continue;
4088 }
4089
4090 if(ibp->gaps->next == NULL)
4091 {
4092 ibp->gaps->next = gfp;
4093 continue;
4094 }
4095
4096 for(tgfp = ibp->gaps; tgfp != NULL; tgfp = tgfp->next)
4097 {
4098 if(tgfp->next != NULL && tgfp->next->from < from)
4099 continue;
4100 gfp->next = tgfp->next;
4101 tgfp->next = gfp;
4102 break;
4103 }
4104 }
4105 if(ibp->drop != 0)
4106 {
4107 linkage_evidence_names.clear();
4108 asn_linkage_evidence.clear();
4109 }
4110 }
4111
4112 if(ibp->gaps == NULL)
4113 return;
4114
4115 if(ibp->drop != 0)
4116 {
4117 GapFeatsFree(ibp->gaps);
4118 ibp->gaps = NULL;
4119 }
4120 }
4121
4122 /**********************************************************/
XMLGetQuals(char * entry,XmlIndexPtr xip,TQualVector & quals)4123 static void XMLGetQuals(char* entry, XmlIndexPtr xip, TQualVector& quals)
4124 {
4125 XmlIndexPtr xipqual;
4126
4127 if(entry == NULL || xip == NULL)
4128 return;
4129
4130 for(; xip != NULL; xip = xip->next)
4131 {
4132 if(xip->subtags == NULL)
4133 continue;
4134
4135 CRef<objects::CGb_qual> qual(new objects::CGb_qual);
4136 for(xipqual = xip->subtags; xipqual != NULL; xipqual = xipqual->next)
4137 {
4138 if(xipqual->tag == INSDQUALIFIER_NAME)
4139 qual->SetQual(XMLGetTagValue(entry, xipqual));
4140 else if(xipqual->tag == INSDQUALIFIER_VALUE)
4141 qual->SetVal(XMLGetTagValue(entry, xipqual));
4142 }
4143
4144 if (qual->GetQual() == "replace" && !qual->IsSetVal())
4145 {
4146 qual->SetVal("");
4147 }
4148
4149 if (qual->IsSetQual() && !qual->GetQual().empty())
4150 quals.push_back(qual);
4151 }
4152 }
4153
4154 /**********************************************************/
XMLLoadFeatBlk(char * entry,XmlIndexPtr xip)4155 static DataBlkPtr XMLLoadFeatBlk(char* entry, XmlIndexPtr xip)
4156 {
4157 XmlIndexPtr xipfeat;
4158 DataBlkPtr headdbp;
4159 DataBlkPtr dbp;
4160 DataBlkPtr ret;
4161 FeatBlkPtr fbp;
4162
4163 if(entry == NULL || xip == NULL)
4164 return(NULL);
4165
4166 for(; xip != NULL; xip = xip->next)
4167 if(xip->tag == INSDSEQ_FEATURE_TABLE)
4168 break;
4169
4170 if(xip == NULL || xip->subtags == NULL)
4171 return(NULL);
4172
4173 headdbp = NULL;
4174 for(xip = xip->subtags; xip != NULL; xip = xip->next)
4175 {
4176 if(xip->subtags == NULL)
4177 continue;
4178 fbp = new FeatBlk;
4179 for(xipfeat = xip->subtags; xipfeat != NULL; xipfeat = xipfeat->next)
4180 {
4181 if(xipfeat->tag == INSDFEATURE_KEY)
4182 fbp->key = XMLGetTagValue(entry, xipfeat);
4183 else if(xipfeat->tag == INSDFEATURE_LOCATION)
4184 fbp->location = XMLGetTagValue(entry, xipfeat);
4185 else if(xipfeat->tag == INSDFEATURE_QUALS)
4186 XMLGetQuals(entry, xipfeat->subtags, fbp->quals);
4187 }
4188 if(headdbp == NULL)
4189 {
4190 headdbp = (DataBlkPtr) MemNew(sizeof(DataBlk));
4191 dbp = headdbp;
4192 }
4193 else
4194 {
4195 dbp->next = (DataBlkPtr) MemNew(sizeof(DataBlk));
4196 dbp = dbp->next;
4197 }
4198 dbp->data = fbp;
4199 }
4200 ret = (DataBlkPtr) MemNew(sizeof(DataBlk));
4201 ret->type = XML_FEATURES;
4202 ret->data = headdbp;
4203 ret->next = NULL;
4204 return(ret);
4205 }
4206
4207 /**********************************************************
4208 *
4209 * static FeatBlkPtr MergeNoteQual(fbp):
4210 *
4211 * Only one note on every key feature block,
4212 * not complete.
4213 *
4214 * 5-28-93
4215 *
4216 **********************************************************/
MergeNoteQual(FeatBlkPtr fbp)4217 static FeatBlkPtr MergeNoteQual(FeatBlkPtr fbp)
4218 {
4219 char* note;
4220 char* p;
4221 char* q;
4222
4223 size_t size = 0;
4224
4225 NON_CONST_ITERATE(TQualVector, cur, fbp->quals)
4226 {
4227 if (!(*cur)->IsSetQual() || !(*cur)->IsSetVal())
4228 continue;
4229
4230 const std::string& cur_qual = (*cur)->GetQual();
4231 const std::string& cur_val = (*cur)->GetVal();
4232
4233 if (cur_qual != "note" || cur_val.empty())
4234 continue;
4235
4236 size += 2;
4237 std::vector<Char> buf(cur_val.size() + 1);
4238
4239 const char* cp = cur_val.c_str();
4240 for(q = &buf[0]; *cp != '\0'; ++cp)
4241 {
4242 *q++ = *cp;
4243 if (*cp == ';' && (cp[1] == ' ' || cp[1] == ';'))
4244 {
4245 for(++cp; *cp == ' ' || *cp == ';';)
4246 ++cp;
4247 if(*cp != '\0')
4248 *q++ = ' ';
4249 --cp;
4250 }
4251 }
4252
4253 *q = '\0';
4254 (*cur)->SetVal(&buf[0]);
4255
4256 size += (*cur)->GetVal().size();
4257 for (cp = (*cur)->GetVal().c_str(); *cp != '\0'; ++cp)
4258 if(*cp == '~')
4259 ++size;
4260 }
4261
4262 if(size == 0)
4263 return(fbp);
4264
4265 note = (char*) MemNew(size);
4266 p = note;
4267
4268 for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end();)
4269 {
4270 if (!(*cur)->IsSetQual() || !(*cur)->IsSetVal())
4271 {
4272 ++cur;
4273 continue;
4274 }
4275
4276 const std::string& cur_qual = (*cur)->GetQual();
4277 const std::string& cur_val = (*cur)->GetVal();
4278
4279 if (cur_qual != "note")
4280 {
4281 ++cur;
4282 continue;
4283 }
4284
4285 if (!cur_val.empty())
4286 {
4287 /* sometime we get note qual w/o value
4288 */
4289 if(p > note)
4290 {
4291 *p++ = ';';
4292 *p++ = '~';
4293 }
4294
4295 for (const char* cq = cur_val.c_str(); *cq != '\0'; *p++ = *cq++)
4296 if(*cq == '~')
4297 *p++ = '~';
4298 }
4299
4300 cur = fbp->quals.erase(cur);
4301 }
4302 *p = '\0';
4303
4304 CRef<objects::CGb_qual> qual_new(new objects::CGb_qual);
4305 qual_new->SetQual("note");
4306 qual_new->SetVal(note);
4307
4308 fbp->quals.push_back(qual_new);
4309
4310 return(fbp);
4311 }
4312
4313 /**********************************************************/
CheckLegalQual(const Char * val,Char ch,std::string * qual)4314 static bool CheckLegalQual(const Char* val, Char ch, std::string* qual)
4315 {
4316 std::string qual_name;
4317 for (; *val && *val != ch && (isalpha(*val) || *val == '_'); ++val)
4318 qual_name += *val;
4319
4320 objects::CSeqFeatData::EQualifier type = objects::CSeqFeatData::GetQualifierType(qual_name);
4321 if (type == objects::CSeqFeatData::eQual_bad)
4322 return false;
4323
4324 if (qual != nullptr)
4325 *qual = qual_name;
4326
4327 return true;
4328 }
4329
4330 /**********************************************************/
fta_set_merge_marks(char * val,size_t quallen,size_t vallen)4331 static void fta_set_merge_marks(char* val, size_t quallen, size_t vallen)
4332 {
4333 char* start;
4334 char* p;
4335 char* q;
4336 bool first;
4337
4338 if(val == NULL || *val == '\0')
4339 return;
4340
4341 p = StringChr(val, '\n');
4342 if(p == NULL)
4343 return;
4344
4345 for(first = true, start = val; p != NULL;)
4346 {
4347 if((p - 1) >= start && *(p - 1) == '-' &&
4348 (p - 2) >= start && *(p - 2) != ' ')
4349 {
4350 *p = '\t';
4351 start = ++p;
4352 p = StringChr(p, '\n');
4353 continue;
4354 }
4355 if((p - 3) >= start && StringNCmp(p - 3, "(EC", 3) == 0 &&
4356 p[1] >= '0' && p[1] <= '9')
4357 {
4358 start = ++p;
4359 p = StringChr(p, '\n');
4360 continue;
4361 }
4362 if(p[1] == '(' || ((p - 1) >= start && *(p - 1) == ','))
4363 {
4364 start = ++p;
4365 p = StringChr(p, '\n');
4366 continue;
4367 }
4368 *p = '\0';
4369 q = StringChr(start, ' ');
4370 size_t len = StringLen(start);
4371 if(first)
4372 {
4373 first = false;
4374 len += quallen;
4375 }
4376 *p = (q == NULL && len == vallen) ? '\t' : '\n';
4377 start = ++p;
4378 p = StringChr(p, '\n');
4379 }
4380 }
4381
4382 /**********************************************************/
/*
 * fta_convert_to_lower_case:
 *
 *   Turns every character 'A'..'Z' of "str" into its lower-case
 *   counterpart, in place.  All other characters are left as they are.
 *   A NULL or empty string is ignored.
 */
static void fta_convert_to_lower_case(char* str)
{
    if (str == NULL || *str == '\0')
        return;

    char* cur = str;
    while (*cur != '\0') {
        /* setting bit 0x20 maps an upper-case letter to lower case */
        if (*cur >= 'A' && *cur <= 'Z')
            *cur = (char) (*cur | 0x20);
        cur++;
    }
}
4394
4395 /**********************************************************/
/*
 * fta_process_con_slice:
 *
 *   Inserts a blank after every ',' in the NUL-terminated text held by
 *   "val_buf" unless the comma is already followed by a blank or ends the
 *   text.  If nothing has to be inserted the buffer is left untouched;
 *   otherwise it is replaced by a larger, zero-filled buffer holding the
 *   new text.
 *
 *   val_buf must hold a NUL-terminated string.
 */
static void fta_process_con_slice(std::vector<char>& val_buf)
{
    /* Count the commas that need a blank after them. */
    size_t missing = 0;
    for (const char* s = &val_buf[0]; *s != '\0'; s++) {
        if (*s == ',' && s[1] != ' ' && s[1] != '\0')
            missing++;
    }

    if (missing == 0)
        return;

    std::vector<char> spaced(val_buf.size() + missing + 1);

    size_t out = 0;
    for (const char* s = &val_buf[0]; *s != '\0'; s++) {
        spaced[out++] = *s;
        if (*s == ',' && s[1] != ' ' && s[1] != '\0')
            spaced[out++] = ' ';
    }
    spaced[out] = '\0';

    val_buf.swap(spaced);
}
4419
4420
4421 /**********************************************************
4422 *
4423 * static void ParseQualifiers(fbp, bptr, eptr,
4424 * format):
4425 *
4426 * Parsing qualifier and put into link list fbp->qual.
4427 * Some qualifiers may not have value.
4428 * genbank qualifier format: /qualifier=value
4429 * embl qualifier format: /qualifier= value
4430 *
4431 * 10-12-93
4432 *
4433 **********************************************************/
ParseQualifiers(FeatBlkPtr fbp,char * bptr,char * eptr,Parser::EFormat format)4434 static void ParseQualifiers(FeatBlkPtr fbp, char* bptr, char* eptr,
4435 Parser::EFormat format)
4436 {
4437 const char **b;
4438
4439 char* ptr;
4440 char* str;
4441 char* qstr;
4442 char* p;
4443 char* q;
4444 char* r;
4445 Char ch;
4446 Int4 vallen;
4447 Int4 count;
4448 Int2 got;
4449 Int2 quotes;
4450 Int2 reject;
4451
4452 vallen = (format == Parser::EFormat::EMBL) ? 59 : 58;
4453
4454 qstr = (char*) MemNew(eptr - bptr + 2);
4455 ch = *eptr;
4456 *eptr = '\0';
4457
4458 for(p = bptr; *p == ' ' || *p == '\n';)
4459 p++;
4460 for(q = qstr; *p != '\0';)
4461 {
4462 if(*p != ' ' && *p != '\n')
4463 {
4464 *q++ = *p++;
4465 continue;
4466 }
4467
4468 for(got = 0, r = p; *r == ' ' || *r == '\n'; r++)
4469 if(*r == '\n')
4470 got = 1;
4471 if(got == 1)
4472 {
4473 *q++ = '\n';
4474 p = r;
4475 }
4476 else
4477 while(*p == ' ')
4478 *q++ = *p++;
4479 }
4480 if(q == qstr || *(q - 1) != '\n')
4481 *q++ = '\n';
4482 *q = '\0';
4483 *eptr = ch;
4484
4485 for(str = qstr + 1; *str != '\0';)
4486 {
4487 reject = 0;
4488
4489 CRef<objects::CGb_qual> qual_new(new objects::CGb_qual);
4490 for(ptr = str; *str != '/' && *str != '=' && *str != '\0' && *str != '\n';)
4491 str++;
4492
4493 std::string qual_str(ptr, str);
4494 size_t quallen = qual_str.size() + 1;
4495
4496 NStr::ReplaceInPlace(qual_str, "\n", " ");
4497 NStr::TruncateSpacesInPlace(qual_str, NStr::eTrunc_End);
4498
4499 if (qual_str == "specific_host")
4500 qual_str = "host";
4501 qual_new->SetQual(qual_str);
4502
4503 quotes = 0;
4504 if(*str == '=') /* get gbq->val */
4505 {
4506 quallen++;
4507 while(*str == '=' || *str == ' ' || *str == '\n')
4508 str++;
4509
4510 if(*str == '\"') /* found open double quote */
4511 {
4512 quallen++;
4513 quotes = 1;
4514 str++;
4515 ptr = str;
4516
4517 /* search first close double quote
4518 */
4519 if (qual_str == "note")
4520 {
4521 for(;;)
4522 {
4523 str = StringChr(str, '\n');
4524 if(str[1] == '\0')
4525 {
4526 if(*(str - 1) == '\"')
4527 {
4528 quotes++;
4529 str--;
4530 }
4531 break;
4532 }
4533 if (str[1] != '/' || !CheckLegalQual(str + 2, '\n', nullptr))
4534 {
4535 str++;
4536 continue;
4537 }
4538 if(*(str - 1) == '\"')
4539 {
4540 quotes++;
4541 str--;
4542 }
4543 break;
4544 }
4545 }
4546 else
4547 {
4548 while(*str != '\"' && *str != '\0')
4549 str++;
4550 }
4551 }
4552 else
4553 {
4554 for(ptr = str; *str != '\0'; str++)
4555 if(*str == '\n' && str[1] == '/')
4556 {
4557 str++;
4558 break;
4559 }
4560 }
4561
4562 std::vector<Char> val_buf(ptr, str);
4563 val_buf.push_back(0);
4564
4565 if (!val_buf.empty())
4566 {
4567 fta_set_merge_marks(&val_buf[0], quallen, vallen);
4568
4569 std::replace(val_buf.begin(), val_buf.end(), '\n', ' ');
4570 val_buf.erase(std::remove(val_buf.begin(), val_buf.end(), '\t'), val_buf.end());
4571
4572 std::string aux(&val_buf[0]);
4573 NStr::TruncateSpacesInPlace(aux, NStr::eTrunc_End);
4574 val_buf.assign(aux.begin(), aux.end());
4575 val_buf.push_back(0);
4576
4577 if(qual_str == "translation" ||
4578 qual_str == "replace")
4579 {
4580 /* delete blanks in the middle of the data
4581 */
4582 val_buf.erase(std::remove(val_buf.begin(), val_buf.end(), ' '), val_buf.end());
4583 }
4584 else if(qual_str == "rpt_unit")
4585 {
4586 fta_convert_to_lower_case(&val_buf[0]);
4587 }
4588 else if (qual_str == "cons_splice")
4589 {
4590 fta_process_con_slice(val_buf);
4591 }
4592 else if (qual_str == "note")
4593 {
4594 if(quotes == 1)
4595 {
4596 if (val_buf.size() > 30)
4597 {
4598 ch = val_buf[30];
4599 val_buf[30] = '\0';
4600 }
4601 else
4602 ch = '\0';
4603 ErrPostEx(SEV_WARNING,
4604 ERR_QUALIFIER_MissingTerminalDoubleQuote,
4605 "/note qualifier is not terminated with double quote : [%s%s].",
4606 &val_buf[0], (ch == '\0') ? "" : " ...");
4607 if(ch != '\0')
4608 val_buf[30] = ch;
4609 }
4610 for (quotes = 0, p = &val_buf[0]; *p != '\0'; p++)
4611 {
4612 if(*p != '\"')
4613 continue;
4614
4615 if(p[1] != '\"')
4616 {
4617 quotes = 1;
4618 break;
4619 }
4620 quotes = !quotes;
4621 p++;
4622 }
4623 if(quotes != 0)
4624 {
4625 if (val_buf.size() > 30)
4626 {
4627 ch = val_buf[30];
4628 val_buf[30] = '\0';
4629 }
4630 else
4631 ch = '\0';
4632 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_UnbalancedQuotes,
4633 "/note qualifier value contains unbalanced double-quotes, and has been discarded : [%s%s].",
4634 &val_buf[0], (ch == '\0') ? "" : " ...");
4635 if(ch != '\0')
4636 val_buf[30] = ch;
4637 reject = 1;
4638 }
4639
4640 if(fbp != NULL && fbp->key != NULL &&
4641 StringCmp(fbp->key, "misc_feature") != 0)
4642 {
4643 std::string qual;
4644 for (count = 0, p = &val_buf[0]; ; p++)
4645 {
4646 p = StringChr(p, '/');
4647 if(p == NULL)
4648 break;
4649
4650 std::string cur_qual;
4651 if (CheckLegalQual(p + 1, ' ', &cur_qual))
4652 {
4653 if (qual.empty())
4654 qual = cur_qual;
4655 else
4656 count++;
4657 }
4658 }
4659
4660 if (!qual.empty())
4661 {
4662 FtaDeletePrefix(PREFIX_FEATURE);
4663 if(count == 0)
4664 ErrPostEx(SEV_WARNING, ERR_QUALIFIER_EmbeddedQual,
4665 "/note contains /%s : FEAT=%s[%s] : %s.",
4666 qual.c_str(), fbp->key, fbp->location, &val_buf[0]);
4667 else
4668 ErrPostEx(SEV_WARNING, ERR_QUALIFIER_EmbeddedQual,
4669 "/note contains /%s and %d other embedded qualifiers : FEAT=%s[%s] : %s.",
4670 qual.c_str(), count, fbp->key, fbp->location, &val_buf[0]);
4671 FtaInstallPrefix(PREFIX_FEATURE, fbp->key,
4672 fbp->location);
4673 }
4674 }
4675 }
4676
4677 qual_new->SetVal(&val_buf[0]);
4678 }
4679
4680 while(*str == ' ' || *str == '\"' || *str == '\n')
4681 str++;
4682
4683 /* check any truncated data
4684 */
4685 if(*str != '\0' && *str != '/')
4686 {
4687 for(ptr = str; *str != '/' && *str != '\0';)
4688 str++;
4689
4690 std::string aux(ptr, str);
4691 if(str - ptr > 50)
4692 aux.resize(50);
4693 NStr::ReplaceInPlace(aux, "\n", " ");
4694
4695 ErrPostEx(SEV_WARNING, ERR_FEATURE_DiscardData, "%s", aux.c_str());
4696 }
4697 } /* if, = */
4698
4699 while(*str == ' ' || *str == '/' || *str == '\"' || *str == '\n')
4700 str++;
4701
4702 if(reject != 0)
4703 continue;
4704
4705 if (qual_new->IsSetVal())
4706 {
4707 const std::string& val_str = qual_new->GetVal();
4708 const char* cp = val_str.c_str();
4709 for(; *cp == '\"' || *cp == ' ' || *cp == '\t';)
4710 ++cp;
4711 if(*cp == '\0')
4712 {
4713 if(qual_str == "replace")
4714 qual_new->SetVal("");
4715 else
4716 qual_new->ResetVal();
4717 }
4718 }
4719
4720 for(b = EmptyQuals; *b != NULL; b++)
4721 if (qual_str == *b)
4722 break;
4723
4724 if(*b == NULL)
4725 {
4726 if (!qual_new->IsSetVal())
4727 {
4728 if (qual_str == "old_locus_tag")
4729 ErrPostEx(SEV_ERROR, ERR_FEATURE_EmptyOldLocusTag,
4730 "Feature \"%s\" at \"%s\" has an /old_locus_tag qualifier with no value. Qualifier has been dropped.",
4731 (fbp->key == NULL) ? "Unknown" : fbp->key,
4732 (fbp->location == NULL) ? "Empty" : fbp->location);
4733 else
4734 ErrPostEx(SEV_WARNING, ERR_QUALIFIER_EmptyQual,
4735 "Qualifier /%s ignored because it lacks a data value. Feature \"%s\", location \"%s\".",
4736 qual_str.c_str(),
4737 (fbp->key == NULL) ? "Unknown" : fbp->key,
4738 (fbp->location == NULL) ? "Empty" : fbp->location);
4739 continue;
4740 }
4741 }
4742 else if (qual_new->IsSetVal())
4743 {
4744 if (qual_str != "artificial_location" &&
4745 qual_str != "mobile_element_type")
4746 {
4747 ErrPostEx(SEV_WARNING, ERR_QUALIFIER_ShouldNotHaveValue,
4748 "Qualifier /%s should not have data value. Qualifier value has been ignored. Feature \"%s\", location \"%s\".",
4749 qual_str.c_str(), (fbp->key == NULL) ? "Unknown" : fbp->key,
4750 (fbp->location == NULL) ? "Empty" : fbp->location);
4751 qual_new->ResetVal();
4752 }
4753 }
4754
4755 if (qual_new->IsSetVal() && qual_str == "note")
4756 {
4757 std::string val = qual_new->GetVal();
4758 std::replace(val.begin(), val.end(), '\"', '\'');
4759 qual_new->SetVal(val);
4760 }
4761
4762 if (qual_new->IsSetQual() && !qual_new->GetQual().empty())
4763 fbp->quals.push_back(qual_new);
4764 }
4765
4766 MemFree(qstr);
4767 }
4768
4769 /**********************************************************/
fta_check_satellite(char * str,unsigned char * drop)4770 static void fta_check_satellite(char* str, unsigned char* drop)
4771 {
4772 char* p;
4773 Int2 i;
4774
4775 if(str == NULL || *str == '\0')
4776 return;
4777
4778 p = StringChr(str, ':');
4779 if(p != NULL)
4780 *p = '\0';
4781
4782 i = MatchArrayString(SatelliteValues, str);
4783 if(p != NULL)
4784 *p = ':';
4785 if(i < 0)
4786 {
4787 ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidSatelliteType,
4788 "/satellite qualifier \"%s\" does not begin with a valid satellite type.",
4789 str);
4790 *drop = 1;
4791 }
4792 else if(p != NULL && p[1] == '\0')
4793 {
4794 ErrPostEx(SEV_REJECT, ERR_FEATURE_NoSatelliteClassOrIdentifier,
4795 "/satellite qualifier \"%s\" does not include a class or identifier after the satellite type.",
4796 str);
4797 *drop = 1;
4798 }
4799 }
4800
4801 /**********************************************************
4802 *
4803 * int ParseFeatureBlock(ibp, deb, dbp, source, format):
4804 *
4805 * Parsing each feature sub-block, dbp, to
4806 * FeatBlkPtr, fbp.
4807 * Put warning message if bad qualifier's value or
4808 * unknown feature key found.
4809 * fdbp->drop = 1, if found unknown feature key, or
 *      do not go through 2nd time of qualifiers semantic
 *      check (i.e. drop bad qualifier if the value is in illegal
 *      format in the 1st time)
4813 *
4814 * 11-22-93
4815 *
4816 * The location begins at column 22, and qualifier
4817 * begin on subsequent lines at column 22, they may
4818 * extend from column 22-80.
4819 * Qualifiers take the form of a slash, "/", followed
4820 * by the qualifier name and, if applicable, an equal
4821 * sign, "=", and a value (i.e. some qualifiers only
4822 * have name w/o value, s.t. /pseudo).
4823 *
4824 * 5-4-93
4825 *
4826 **********************************************************/
ParseFeatureBlock(IndexblkPtr ibp,bool deb,DataBlkPtr dbp,Parser::ESource source,Parser::EFormat format)4827 int ParseFeatureBlock(IndexblkPtr ibp, bool deb, DataBlkPtr dbp,
4828 Parser::ESource source, Parser::EFormat format)
4829 {
4830 char* bptr;
4831 char* eptr;
4832 char* ptr1;
4833 char* ptr2;
4834 char* p;
4835 char* q;
4836 Char loc[100];
4837 Char ch;
4838
4839 FeatBlkPtr fbp;
4840 Int4 num;
4841 size_t i;
4842 int retval = GB_FEAT_ERR_NONE;
4843 int ret;
4844
4845 if(ibp->is_mga)
4846 sprintf(loc, "1..%ld", ibp->bases);
4847 for(num = 0; dbp != NULL; dbp = dbp->next, num++)
4848 {
4849 fbp = new FeatBlk;
4850 fbp->num = num;
4851 dbp->data = fbp;
4852
4853 bptr = dbp->offset;
4854 eptr = bptr + dbp->len;
4855
4856 for(p = bptr; *p != '\n';)
4857 p++;
4858 *p = '\0';
4859 FtaInstallPrefix(PREFIX_FEATURE, (char *) "Parsing FT line: ", bptr);
4860 *p = '\n';
4861 ptr1 = bptr + ParFlat_COL_FEATKEY;
4862 if(*ptr1 == ' ')
4863 {
4864 ErrPostStr(SEV_WARNING, ERR_FEATURE_FeatureKeyReplaced,
4865 "Empty featkey");
4866 }
4867 for(ptr1 = bptr; *ptr1 == ' ';)
4868 ptr1++;
4869
4870 for(ptr2 = ptr1; *ptr2 != ' ' && *ptr2 != '\n';)
4871 ptr2++;
4872
4873 if(StringNCmp(ptr1, "- ", 2) == 0)
4874 {
4875 ErrPostStr(SEV_WARNING, ERR_FEATURE_FeatureKeyReplaced,
4876 "Featkey '-' is replaced by 'misc_feature'");
4877 fbp->key = StringSave("misc_feature");
4878 }
4879 else
4880 fbp->key = StringSave(std::string(ptr1, ptr2).c_str());
4881
4882 for(ptr1 = ptr2; *ptr1 == ' ';)
4883 ptr1++;
4884 if(*ptr1 == '\n')
4885 {
4886 if(ibp->is_mga == false)
4887 {
4888 ErrPostEx(SEV_WARNING, ERR_FEATURE_LocationParsing,
4889 "Location missing");
4890 dbp->drop = 1;
4891 retval = GB_FEAT_ERR_DROP;
4892 continue;
4893 }
4894 }
4895 else
4896 {
4897 i = ptr1 - bptr;
4898 if(i < ParFlat_COL_FEATDAT)
4899 ErrPostEx(SEV_WARNING, ERR_FEATURE_LocationParsing,
4900 "Location data is shifted to the left");
4901 else if(i > ParFlat_COL_FEATDAT)
4902 ErrPostEx(SEV_WARNING, ERR_FEATURE_LocationParsing,
4903 "Location data is shifted to the right");
4904 }
4905
4906 for(ptr2 = ptr1; *ptr2 != '/' && ptr2 < eptr;)
4907 ptr2++;
4908 ch = *ptr2;
4909 *ptr2 = '\0';
4910 fbp->location = StringSave(ptr1);
4911 if(ibp->is_prot)
4912 fta_strip_aa(fbp->location);
4913 *ptr2 = ch;
4914 for(p = fbp->location, q = p; *p != '\0'; p++)
4915 if(*p != ' ' && *p != '\n')
4916 *q++ = *p;
4917 *q = '\0';
4918
4919 if(fbp->location[0] == '\0' && ibp->is_mga)
4920 {
4921 MemFree(fbp->location);
4922 fbp->location = StringSave(loc);
4923 }
4924
4925 FtaInstallPrefix(PREFIX_FEATURE, fbp->key, fbp->location);
4926 if(StringCmp(fbp->key, "allele") == 0 ||
4927 StringCmp(fbp->key, "mutation") == 0)
4928 {
4929 ErrPostEx(SEV_ERROR, ERR_FEATURE_ObsoleteFeature,
4930 "Obsolete feature \"%s\" found. Replaced with \"variation\".",
4931 fbp->key);
4932 MemFree(fbp->key);
4933 fbp->key = StringSave("variation");
4934 }
4935
4936 objects::CSeqFeatData::ESubtype subtype = objects::CSeqFeatData::SubtypeNameToValue(fbp->key);
4937
4938 if (subtype == objects::CSeqFeatData::eSubtype_bad && !deb)
4939 {
4940 ErrPostEx(SEV_ERROR, ERR_FEATURE_UnknownFeatKey, fbp->key,
4941 "Feature dropped");
4942 dbp->drop = 1;
4943 retval = GB_FEAT_ERR_DROP;
4944 continue;
4945 }
4946
4947 if(*ptr2 == '/') /* qualifier start in first "/" */
4948 {
4949 ParseQualifiers(fbp, ptr2, eptr, format);
4950
4951 if(StringCmp(fbp->key, "assembly_gap") != 0)
4952 {
4953 ITERATE(TQualVector, cur, fbp->quals)
4954 {
4955 const std::string& cur_qual = (*cur)->GetQual();
4956 if (cur_qual == "gap_type" ||
4957 cur_qual == "assembly_evidence")
4958 {
4959 ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidQualifier,
4960 "Qualifier /%s is invalid for the feature \"%s\" at \"%s\".",
4961 cur_qual.c_str(), fbp->key, (fbp->location == NULL) ? "Unknown" : fbp->location);
4962 ibp->drop = 1;
4963 }
4964 }
4965 }
4966
4967 if(StringCmp(fbp->key, "source") != 0)
4968 {
4969 ITERATE(TQualVector, cur, fbp->quals)
4970 {
4971 const std::string& cur_qual = (*cur)->GetQual();
4972 if (cur_qual == "submitter_seqid" )
4973 {
4974 ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidQualifier,
4975 "Qualifier /%s is invalid for the feature \"%s\" at \"%s\".",
4976 cur_qual.c_str(), fbp->key, (fbp->location == NULL) ? "Unknown" : fbp->location);
4977 ibp->drop = 1;
4978 }
4979 }
4980 }
4981
4982 fbp = MergeNoteQual(fbp); /* allow more than one
4983 notes w/i a key */
4984
4985 if (subtype == objects::CSeqFeatData::eSubtype_bad)
4986 {
4987 ErrPostStr(SEV_ERROR, ERR_FEATURE_UnknownFeatKey, fbp->key);
4988 ret = GB_FEAT_ERR_REPAIRABLE;
4989 }
4990 else
4991 {
4992 /* last argument is perform_corrections if debug
4993 * mode is FALSE
4994 */
4995 ret = XGBFeatKeyQualValid(subtype, fbp->quals, true, (source == Parser::ESource::Flybase ? false : !deb));
4996 }
4997 if(ret > retval)
4998 retval = ret;
4999
5000 if(ret > GB_FEAT_ERR_REPAIRABLE &&
5001 StringCmp(fbp->key, "ncRNA") != 0)
5002 dbp->drop = 1;
5003 }
5004 else if (subtype == objects::CSeqFeatData::eSubtype_bad && !objects::CSeqFeatData::GetMandatoryQualifiers(subtype).empty())
5005 {
5006 if(StringCmp(fbp->key, "mobile_element") != 0)
5007 {
5008 auto qual_idx = *objects::CSeqFeatData::GetMandatoryQualifiers(subtype).begin();
5009 std::string str1 = objects::CSeqFeatData::GetQualifierAsString(qual_idx);
5010 const char *str = str1.c_str();
5011 if((StringCmp(fbp->key, "old_sequence") != 0 &&
5012 StringCmp(fbp->key, "conflict") != 0) ||
5013 StringCmp(str, "citation") != 0)
5014 {
5015 ErrPostEx(SEV_ERROR, ERR_FEATURE_RequiredQualifierMissing,
5016 "lacks required /%s qualifier : feature has been dropped.",
5017 str);
5018 if(!deb)
5019 {
5020 dbp->drop = 1;
5021 retval = GB_FEAT_ERR_DROP;
5022 }
5023 }
5024 }
5025 }
5026 else if(StringCmp(fbp->key, "misc_feature") == 0 && fbp->quals.empty())
5027 {
5028 if (!deb)
5029 {
5030 dbp->drop = 1;
5031 retval = GB_FEAT_ERR_DROP;
5032 ErrPostStr(SEV_WARNING, ERR_FEATURE_Dropped,
5033 "Empty 'misc_feature' dropped");
5034 }
5035 else
5036 retval = GB_FEAT_ERR_REPAIRABLE;
5037 }
5038
5039 NON_CONST_ITERATE(TQualVector, cur, fbp->quals)
5040 {
5041 if (!(*cur)->IsSetQual() || !(*cur)->IsSetVal())
5042 continue;
5043
5044 const std::string& qual_str = (*cur)->GetQual();
5045 const std::string& val_str = (*cur)->GetVal();
5046
5047 std::vector<Char> val_buf(val_str.begin(), val_str.end());
5048 val_buf.push_back(0);
5049
5050 p = &val_buf[0];
5051 ShrinkSpaces(p);
5052 if (*p == '\0' && qual_str != "replace")
5053 {
5054 (*cur)->ResetVal();
5055 val_buf[0] = 0;
5056 }
5057 else
5058 {
5059 if (qual_str == "replace")
5060 fta_convert_to_lower_case(p);
5061 (*cur)->SetVal(p);
5062 }
5063
5064 if (qual_str == "satellite")
5065 fta_check_satellite(&val_buf[0], &ibp->drop);
5066 }
5067 } /* for, each sub-block, or each feature key */
5068 FtaDeletePrefix(PREFIX_FEATURE);
5069 return(retval);
5070 }
5071
5072 /**********************************************************/
XMLCheckQualifiers(FeatBlkPtr fbp)5073 static void XMLCheckQualifiers(FeatBlkPtr fbp)
5074 {
5075 const char **b;
5076 char* p;
5077 Char ch;
5078
5079 if(fbp == NULL || fbp->quals.empty())
5080 return;
5081
5082 for (TQualVector::iterator cur = fbp->quals.begin(); cur != fbp->quals.end();)
5083 {
5084 const std::string& qual_str = (*cur)->GetQual();
5085
5086 if ((*cur)->IsSetVal())
5087 {
5088 const std::string& val_str = (*cur)->GetVal();
5089 std::vector<Char> val_buf(val_str.begin(), val_str.end());
5090 val_buf.push_back(0);
5091
5092 if (qual_str == "translation")
5093 {
5094 DelCharBtwData(&val_buf[0]);
5095 }
5096 else if (qual_str == "rpt_unit")
5097 {
5098 fta_convert_to_lower_case(&val_buf[0]);
5099 }
5100 else if (qual_str == "cons_splice")
5101 {
5102 fta_process_con_slice(val_buf);
5103 }
5104 else if (qual_str == "note")
5105 {
5106 for(p = &val_buf[0];;)
5107 {
5108 p = StringChr(p, '/');
5109 if(p == NULL)
5110 break;
5111 p++;
5112 if (!CheckLegalQual(p, ' ', nullptr))
5113 continue;
5114
5115 if (val_buf.size() > 30)
5116 {
5117 ch = val_buf[30];
5118 val_buf[30] = '\0';
5119 }
5120 else
5121 ch = '\0';
5122 ErrPostEx(SEV_WARNING, ERR_QUALIFIER_EmbeddedQual,
5123 "/note qualifier value appears to contain other qualifiers : [%s%s].",
5124 &val_buf[0], (ch == '\0') ? "" : " ...");
5125 if(ch != '\0')
5126 val_buf[30] = ch;
5127 }
5128 }
5129
5130 for (p = &val_buf[0]; *p == '\"' || *p == ' ' || *p == '\t';)
5131 p++;
5132
5133 if(*p == '\0')
5134 {
5135 if (qual_str == "replace")
5136 {
5137 (*cur)->SetVal("");
5138 }
5139 else
5140 (*cur)->ResetVal();
5141 }
5142 else
5143 (*cur)->SetVal(&val_buf[0]);
5144 }
5145
5146 for (b = EmptyQuals; *b != NULL; b++)
5147 if (qual_str == *b)
5148 break;
5149
5150 if (*b == NULL)
5151 {
5152 if (!(*cur)->IsSetVal())
5153 {
5154 if (qual_str == "old_locus_tag")
5155 ErrPostEx(SEV_ERROR, ERR_FEATURE_EmptyOldLocusTag,
5156 "Feature \"%s\" at \"%s\" has an /old_locus_tag qualifier with no value. Qualifier has been dropped.",
5157 (fbp->key == NULL) ? "Unknown" : fbp->key,
5158 (fbp->location == NULL) ? "Empty" : fbp->location);
5159 else
5160 ErrPostEx(SEV_WARNING, ERR_QUALIFIER_EmptyQual,
5161 "Qualifier /%s ignored because it lacks a data value. Feature \"%s\", location \"%s\".",
5162 qual_str.c_str(),
5163 (fbp->key == NULL) ? "Unknown" : fbp->key,
5164 (fbp->location == NULL) ? "Empty" : fbp->location);
5165
5166 cur = fbp->quals.erase(cur);
5167 continue;
5168 }
5169 }
5170 else if ((*cur)->IsSetVal())
5171 {
5172 ErrPostEx(SEV_WARNING, ERR_QUALIFIER_ShouldNotHaveValue,
5173 "Qualifier /%s should not have data value. Qualifier value has been ignored. Feature \"%s\", location \"%s\".",
5174 qual_str.c_str(), (fbp->key == NULL) ? "Unknown" : fbp->key,
5175 (fbp->location == NULL) ? "Empty" : fbp->location);
5176
5177 (*cur)->ResetVal();
5178 }
5179
5180 if ((*cur)->IsSetVal() && qual_str == "note")
5181 {
5182 std::string val = (*cur)->GetVal();
5183 std::replace(val.begin(), val.end(), '\"', '\'');
5184 (*cur)->SetVal(val);
5185 }
5186
5187 ++cur;
5188 }
5189 }
5190
5191 /**********************************************************/
XMLParseFeatureBlock(bool deb,DataBlkPtr dbp,Parser::ESource source)5192 static int XMLParseFeatureBlock(bool deb, DataBlkPtr dbp, Parser::ESource source)
5193 {
5194 FeatBlkPtr fbp;
5195 char* p;
5196 Int4 num;
5197 int retval = GB_FEAT_ERR_NONE;
5198 int ret;
5199
5200 for(num = 0; dbp != NULL; dbp = dbp->next, num++)
5201 {
5202 if(dbp->data == NULL)
5203 continue;
5204 fbp = (FeatBlkPtr) dbp->data;
5205 fbp->num = num;
5206 FtaInstallPrefix(PREFIX_FEATURE, fbp->key, fbp->location);
5207
5208 if(fbp->key[0] == '-' && fbp->key[1] == '\0')
5209 {
5210 ErrPostStr(SEV_WARNING, ERR_FEATURE_FeatureKeyReplaced,
5211 "Featkey '-' is replaced by 'misc_feature'");
5212 MemFree(fbp->key);
5213 fbp->key = StringSave("misc_feature");
5214 }
5215
5216 if(StringCmp(fbp->key, "allele") == 0 ||
5217 StringCmp(fbp->key, "mutation") == 0)
5218 {
5219 ErrPostEx(SEV_ERROR, ERR_FEATURE_ObsoleteFeature,
5220 "Obsolete feature \"%s\" found. Replaced with \"variation\".",
5221 fbp->key);
5222 MemFree(fbp->key);
5223 fbp->key = StringSave("variation");
5224 }
5225
5226 objects::CSeqFeatData::ESubtype subtype = objects::CSeqFeatData::SubtypeNameToValue(fbp->key);
5227
5228 if (subtype == objects::CSeqFeatData::eSubtype_bad && !deb)
5229 {
5230 ErrPostEx(SEV_ERROR, ERR_FEATURE_UnknownFeatKey, fbp->key,
5231 "Feature dropped");
5232 dbp->drop = 1;
5233 retval = GB_FEAT_ERR_DROP;
5234 continue;
5235 }
5236
5237 if (!fbp->quals.empty())
5238 {
5239 XMLCheckQualifiers(fbp);
5240 fbp = MergeNoteQual(fbp); /* allow more than one
5241 notes w/i a key */
5242
5243 if (subtype == objects::CSeqFeatData::eSubtype_bad)
5244 {
5245 ErrPostStr(SEV_ERROR, ERR_FEATURE_UnknownFeatKey, fbp->key);
5246 ret = GB_FEAT_ERR_REPAIRABLE;
5247 }
5248 else
5249 {
5250 /* last argument is perform_corrections if debug
5251 * mode is FALSE
5252 */
5253 ret = XGBFeatKeyQualValid(subtype, fbp->quals, true, ((source == Parser::ESource::Flybase) ? false : !deb));
5254 }
5255 if(ret > retval)
5256 retval = ret;
5257
5258 if(ret > GB_FEAT_ERR_REPAIRABLE &&
5259 StringCmp(fbp->key, "ncRNA") != 0)
5260 dbp->drop = 1;
5261 }
5262 else if (subtype == objects::CSeqFeatData::eSubtype_bad && !objects::CSeqFeatData::GetMandatoryQualifiers(subtype).empty())
5263 {
5264 if(StringCmp(fbp->key, "mobile_element") != 0)
5265 {
5266 auto qual_idx = *objects::CSeqFeatData::GetMandatoryQualifiers(subtype).begin();
5267 std::string str1 = objects::CSeqFeatData::GetQualifierAsString(qual_idx);
5268 const char *str = str1.c_str();
5269 if((StringCmp(fbp->key, "old_sequence") != 0 &&
5270 StringCmp(fbp->key, "conflict") != 0) ||
5271 StringCmp(str, "citation") != 0)
5272 {
5273 ErrPostEx(SEV_ERROR, ERR_FEATURE_RequiredQualifierMissing,
5274 "lacks required /%s qualifier : feature has been dropped.",
5275 str);
5276 if(!deb)
5277 {
5278 dbp->drop = 1;
5279 retval = GB_FEAT_ERR_DROP;
5280 }
5281 }
5282 }
5283 }
5284 else if(StringCmp(fbp->key, "misc_feature") == 0 && fbp->quals.empty())
5285 {
5286 if (!deb)
5287 {
5288 dbp->drop = 1;
5289 retval = GB_FEAT_ERR_DROP;
5290 ErrPostStr(SEV_WARNING, ERR_FEATURE_Dropped,
5291 "Empty 'misc_feature' dropped");
5292 }
5293 else
5294 retval = GB_FEAT_ERR_REPAIRABLE;
5295 }
5296
5297 NON_CONST_ITERATE(TQualVector, cur, fbp->quals)
5298 {
5299 if (!(*cur)->IsSetQual() || !(*cur)->IsSetVal())
5300 continue;
5301
5302 const std::string& qual_str = (*cur)->GetQual();
5303 const std::string& val_str = (*cur)->GetVal();
5304
5305 std::vector<Char> val_buf(val_str.begin(), val_str.end());
5306 val_buf.push_back(0);
5307
5308 p = &val_buf[0];
5309 ShrinkSpaces(p);
5310 if (*p == '\0' && qual_str != "replace")
5311 {
5312 (*cur)->ResetVal();
5313 val_buf[0] = 0;
5314 }
5315 else
5316 {
5317 if (qual_str == "replace")
5318 fta_convert_to_lower_case(p);
5319 (*cur)->SetVal(p);
5320 }
5321 }
5322 } /* for, each sub-block, or each feature key */
5323 FtaDeletePrefix(PREFIX_FEATURE);
5324 return(retval);
5325 }
5326
5327 /**********************************************************/
fta_check_ncrna(const objects::CSeq_feat & feat)5328 static bool fta_check_ncrna(const objects::CSeq_feat& feat)
5329 {
5330 char* p;
5331 Int4 count = 0;
5332
5333 bool stop = false;
5334 ITERATE(objects::CSeq_feat::TQual, qual, feat.GetQual())
5335 {
5336 if (!(*qual)->IsSetQual() || (*qual)->GetQual().empty() ||
5337 (*qual)->GetQual() != "ncRNA_class")
5338 continue;
5339
5340 count++;
5341
5342 if (!(*qual)->IsSetVal() || (*qual)->GetVal().empty())
5343 {
5344 p = location_to_string_or_unknown(feat.GetLocation());
5345
5346 ErrPostEx(SEV_REJECT, ERR_FEATURE_ncRNA_class,
5347 "Feature \"ncRNA\" at location \"%s\" has an empty /ncRNA_class qualifier.",
5348 (p == NULL) ? "unknown" : p);
5349
5350 if(p != NULL)
5351 MemFree(p);
5352
5353 stop = true;
5354 break;
5355 }
5356
5357 if (MatchArrayString(ncRNA_class_values, (*qual)->GetVal().c_str()) < 0)
5358 {
5359 p = location_to_string_or_unknown(feat.GetLocation());
5360
5361 ErrPostEx(SEV_REJECT, ERR_FEATURE_ncRNA_class,
5362 "Feature \"ncRNA\" at location \"%s\" has an invalid /ncRNA_class qualifier: \"%s\".",
5363 (p == NULL) ? "unknown" : p, (*qual)->GetVal().c_str());
5364
5365 if (p != NULL)
5366 MemFree(p);
5367
5368 stop = true;
5369 break;
5370 }
5371 }
5372
5373 if (stop)
5374 return false;
5375
5376 if (count == 1)
5377 return true;
5378
5379 p = location_to_string_or_unknown(feat.GetLocation());
5380
5381 ErrPostEx(SEV_REJECT, ERR_FEATURE_ncRNA_class,
5382 "Feature \"ncRNA\" at location \"%s\" %s /ncRNA_class qualifier.",
5383 (p == NULL) ? "unknown" : p,
5384 (count == 0) ? "lacks the mandatory" : "has more than one");
5385
5386 if(p != NULL)
5387 MemFree(p);
5388
5389 return false;
5390 }
5391
5392 /**********************************************************/
fta_check_artificial_location(objects::CSeq_feat & feat,char * key)5393 static void fta_check_artificial_location(objects::CSeq_feat& feat, char* key)
5394 {
5395 NON_CONST_ITERATE(objects::CSeq_feat::TQual, qual, feat.SetQual())
5396 {
5397 if (!(*qual)->IsSetQual() || (*qual)->GetQual() != "artificial_location")
5398 continue;
5399
5400 if ((*qual)->IsSetVal())
5401 {
5402 const Char* p_val = (*qual)->GetVal().c_str();
5403 for (; *p_val == '\"';)
5404 ++p_val;
5405
5406 if (*p_val == '\0')
5407 (*qual)->ResetVal();
5408 }
5409
5410 std::string val = (*qual)->IsSetVal() ? (*qual)->GetVal() : "";
5411
5412 if (val == "heterogenous population sequenced" ||
5413 val == "low-quality sequence region")
5414 {
5415 feat.SetExcept(true);
5416
5417 if (!feat.IsSetExcept_text())
5418 feat.SetExcept_text(val);
5419 else
5420 {
5421 std::string& except_text = feat.SetExcept_text();
5422 except_text += ", ";
5423 except_text += val;
5424 }
5425 }
5426 else
5427 {
5428 Char* cstr = location_to_string_or_unknown(feat.GetLocation());
5429 std::string loc_str = cstr;
5430 MemFree(cstr);
5431
5432 if (val.empty())
5433 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidArtificialLoc,
5434 "Encountered empty /artificial_location qualifier : Feature \"%s\" : Location \"%s\". Qualifier dropped.",
5435 (key == NULL || *key == '\0') ? "unknown" : key,
5436 loc_str.empty() ? "unknown" : loc_str.c_str());
5437 else
5438 ErrPostEx(SEV_ERROR, ERR_QUALIFIER_InvalidArtificialLoc,
5439 "Value \"%s\" is not legal for the /artificial_location qualifier : Feature \"%s\" : Location \"%s\". Qualifier dropped.",
5440 val.c_str(),
5441 (key == NULL || *key == '\0') ? "unknown" : key,
5442 loc_str.empty() ? "unknown" : loc_str.c_str());
5443 }
5444
5445 feat.SetQual().erase(qual);
5446 break;
5447 }
5448 }
5449
5450 /**********************************************************/
fta_check_mobile_element(const objects::CSeq_feat & feat)5451 static bool fta_check_mobile_element(const objects::CSeq_feat& feat)
5452 {
5453 bool found = false;
5454 ITERATE(objects::CSeq_feat::TQual, qual, feat.GetQual())
5455 {
5456 if ((*qual)->IsSetQual() && (*qual)->GetQual() == "mobile_element_type" &&
5457 (*qual)->IsSetVal() && !(*qual)->GetVal().empty())
5458 {
5459 const Char* p_val = (*qual)->GetVal().c_str();
5460 for (; *p_val == '\"';)
5461 ++p_val;
5462
5463 if (*p_val != '\0')
5464 {
5465 found = true;
5466 break;
5467 }
5468 }
5469 }
5470
5471 if (found)
5472 return true;
5473
5474 Char* cstr = location_to_string_or_unknown(feat.GetLocation());
5475 std::string loc_str = cstr;
5476 MemFree(cstr);
5477
5478 ErrPostEx(SEV_REJECT, ERR_FEATURE_RequiredQualifierMissing,
5479 "Mandatory qualifier /mobile_element_type is absent or has no value : Feature \"mobile_element\" : Location \"%s\". Entry dropped.",
5480 loc_str.empty() ? "unknown" : loc_str.c_str());
5481
5482 return false;
5483 }
5484
5485 /**********************************************************/
SortFeaturesByLoc(const DataBlkPtr & sp1,const DataBlkPtr & sp2)5486 static bool SortFeaturesByLoc(const DataBlkPtr& sp1, const DataBlkPtr& sp2)
5487 {
5488 FeatBlkPtr fbp1;
5489 FeatBlkPtr fbp2;
5490 Int4 status;
5491
5492 fbp1 = (FeatBlkPtr) sp1->data;
5493 fbp2 = (FeatBlkPtr) sp2->data;
5494
5495 if(fbp1->location == NULL && fbp2->location != NULL)
5496 return false;
5497 if(fbp1->location != NULL && fbp2->location == NULL)
5498 return false;
5499
5500 if (fbp1->location != NULL && fbp2->location != NULL)
5501 {
5502 status = StringCmp(fbp1->location, fbp2->location);
5503 if (status != 0)
5504 return status < 0;
5505 }
5506
5507 if(fbp1->key == NULL && fbp2->key != NULL)
5508 return false;
5509 if (fbp1->key != NULL && fbp2->key == NULL)
5510 return false;
5511 if (fbp1->key != NULL && fbp2->key != NULL)
5512 {
5513 status = StringCmp(fbp1->key, fbp2->key);
5514 if (status != 0)
5515 return status < 0;
5516 }
5517
5518 return false;
5519 }
5520
5521 /**********************************************************/
SortFeaturesByOrder(const DataBlkPtr & sp1,const DataBlkPtr & sp2)5522 static bool SortFeaturesByOrder(const DataBlkPtr& sp1, const DataBlkPtr& sp2)
5523 {
5524 FeatBlkPtr fbp1;
5525 FeatBlkPtr fbp2;
5526
5527 fbp1 = (FeatBlkPtr) sp1->data;
5528 fbp2 = (FeatBlkPtr) sp2->data;
5529
5530 return fbp1->num < fbp2->num;
5531 }
5532
5533 /**********************************************************/
fta_sort_features(DataBlkPtr dbp,bool order)5534 static DataBlkPtr fta_sort_features(DataBlkPtr dbp, bool order)
5535 {
5536 DataBlkPtr* temp;
5537 DataBlkPtr tdbp;
5538 Int4 total;
5539 Int4 i;
5540
5541 for(total = 0, tdbp = dbp; tdbp != NULL; tdbp = tdbp->next)
5542 total++;
5543
5544 temp = (DataBlkPtr*) MemNew(total * sizeof(DataBlkPtr));
5545
5546 for(i = 0, tdbp = dbp; tdbp != NULL; tdbp = tdbp->next)
5547 temp[i++] = tdbp;
5548
5549 std::sort(temp, temp + i, (order ? SortFeaturesByOrder : SortFeaturesByLoc));
5550
5551 dbp = tdbp = temp[0];
5552 for(i = 0; i < total - 1; tdbp = tdbp->next, i++)
5553 tdbp->next = temp[i+1];
5554
5555 tdbp = temp[total-1];
5556 tdbp->next = NULL;
5557
5558 MemFree(temp);
5559
5560 return(dbp);
5561 }
5562
5563 /**********************************************************/
fta_convert_to_regulatory(FeatBlkPtr fbp,const char * rclass)5564 static void fta_convert_to_regulatory(FeatBlkPtr fbp, const char *rclass)
5565 {
5566 if(fbp == NULL || fbp->key == NULL || rclass == NULL)
5567 return;
5568
5569 if(fbp->key != NULL)
5570 MemFree(fbp->key);
5571 fbp->key = StringSave("regulatory");
5572
5573 CRef<objects::CGb_qual> qual(new objects::CGb_qual);
5574 qual->SetQual("regulatory_class");
5575 qual->SetVal(rclass);
5576 fbp->quals.push_back(qual);
5577 }
5578
5579 /**********************************************************/
fta_check_replace_regulatory(DataBlkPtr dbp,unsigned char * drop)5580 static void fta_check_replace_regulatory(DataBlkPtr dbp, unsigned char* drop)
5581 {
5582 FeatBlkPtr fbp;
5583 const char **b;
5584 char* p;
5585 bool got_note;
5586 bool other_class;
5587 Int4 count;
5588 Char ch;
5589
5590 for(; dbp != NULL; dbp = dbp->next)
5591 {
5592 fbp = (FeatBlkPtr) dbp->data;
5593 if(fbp == NULL || fbp->key == NULL)
5594 continue;
5595
5596 if(StringCmp(fbp->key, "attenuator") == 0)
5597 fta_convert_to_regulatory(fbp, "attenuator");
5598 else if(StringCmp(fbp->key, "CAAT_signal") == 0)
5599 fta_convert_to_regulatory(fbp, "CAAT_signal");
5600 else if(StringCmp(fbp->key, "enhancer") == 0)
5601 fta_convert_to_regulatory(fbp, "enhancer");
5602 else if(StringCmp(fbp->key, "GC_signal") == 0)
5603 fta_convert_to_regulatory(fbp, "GC_signal");
5604 else if(StringCmp(fbp->key, "-35_signal") == 0)
5605 fta_convert_to_regulatory(fbp, "minus_35_signal");
5606 else if(StringCmp(fbp->key, "-10_signal") == 0)
5607 fta_convert_to_regulatory(fbp, "minus_10_signal");
5608 else if(StringCmp(fbp->key, "polyA_signal") == 0)
5609 fta_convert_to_regulatory(fbp, "polyA_signal_sequence");
5610 else if(StringCmp(fbp->key, "promoter") == 0)
5611 fta_convert_to_regulatory(fbp, "promoter");
5612 else if(StringCmp(fbp->key, "RBS") == 0)
5613 fta_convert_to_regulatory(fbp, "ribosome_binding_site");
5614 else if(StringCmp(fbp->key, "TATA_signal") == 0)
5615 fta_convert_to_regulatory(fbp, "TATA_box");
5616 else if(StringCmp(fbp->key, "terminator") == 0)
5617 fta_convert_to_regulatory(fbp, "terminator");
5618 else if(StringCmp(fbp->key, "regulatory") != 0)
5619 continue;
5620
5621 got_note = false;
5622 other_class = false;
5623 count = 0;
5624
5625 ITERATE(TQualVector, cur, fbp->quals)
5626 {
5627 if (!(*cur)->IsSetQual() || !(*cur)->IsSetVal())
5628 continue;
5629
5630 const std::string& qual_str = (*cur)->GetQual();
5631
5632 if (qual_str != "regulatory_class")
5633 {
5634 if (qual_str == "note")
5635 got_note = true;
5636 continue;
5637 }
5638
5639 count++;
5640 if (!(*cur)->IsSetVal() || (*cur)->GetVal().empty())
5641 {
5642 ch = '\0';
5643 if(fbp->location == NULL || *fbp->location == '\0')
5644 p = (char*) "(empty)";
5645 else
5646 {
5647 p = fbp->location;
5648 if(StringLen(p) > 50)
5649 {
5650 ch = p[50];
5651 p[50] = '\0';
5652 }
5653 }
5654 ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidRegulatoryClass,
5655 "Empty /regulatory_class qualifier value in regulatory feature at location %s.",
5656 p);
5657 if(ch != '\0')
5658 p[50] = ch;
5659 *drop = 1;
5660 continue;
5661 }
5662
5663 const std::string& val_str = (*cur)->GetVal();
5664
5665 for (b = RegulatoryClassValues; *b != NULL; b++)
5666 if (val_str == *b)
5667 break;
5668
5669 if(*b != NULL)
5670 {
5671 if (val_str == "other")
5672 other_class = true;
5673 continue;
5674 }
5675
5676 ch = '\0';
5677 if(fbp->location == NULL || *fbp->location == '\0')
5678 p = (char*) "(empty)";
5679 else
5680 {
5681 p = fbp->location;
5682 if(StringLen(p) > 50)
5683 {
5684 ch = p[50];
5685 p[50] = '\0';
5686 }
5687 }
5688 ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidRegulatoryClass,
5689 "Invalid /regulatory_class qualifier value %s provided in regulatory feature at location %s.",
5690 val_str.c_str(), p);
5691 if(ch != '\0')
5692 p[50] = ch;
5693 *drop = 1;
5694 }
5695
5696 if(count == 0)
5697 {
5698 ch = '\0';
5699 if(fbp->location == NULL || *fbp->location == '\0')
5700 p = (char*) "(empty)";
5701 else
5702 {
5703 p = fbp->location;
5704 if(StringLen(p) > 50)
5705 {
5706 ch = p[50];
5707 p[50] = '\0';
5708 }
5709 }
5710 ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MissingRegulatoryClass,
5711 "The regulatory feature is missing mandatory /regulatory_class qualifier at location %s.",
5712 p);
5713 if(ch != '\0')
5714 p[50] = ch;
5715 *drop = 1;
5716 }
5717 else if(count > 1)
5718 {
5719 ch = '\0';
5720 if(fbp->location == NULL || *fbp->location == '\0')
5721 p = (char*) "(empty)";
5722 else
5723 {
5724 p = fbp->location;
5725 if(StringLen(p) > 50)
5726 {
5727 ch = p[50];
5728 p[50] = '\0';
5729 }
5730 }
5731 ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MultipleRegulatoryClass,
5732 "Multiple /regulatory_class qualifiers were encountered in regulatory feature at location %s.",
5733 p);
5734 if(ch != '\0')
5735 p[50] = ch;
5736 *drop = 1;
5737 }
5738
5739 if(other_class && !got_note)
5740 {
5741 ch = '\0';
5742 if(fbp->location == NULL || *fbp->location == '\0')
5743 p = (char*) "(empty)";
5744 else
5745 {
5746 p = fbp->location;
5747 if(StringLen(p) > 50)
5748 {
5749 ch = p[50];
5750 p[50] = '\0';
5751 }
5752 }
5753 ErrPostEx(SEV_REJECT, ERR_QUALIFIER_NoNoteForOtherRegulatory,
5754 "The regulatory feature of class other is lacking required /note qualifier at location %s.",
5755 p);
5756 if(ch != '\0')
5757 p[50] = ch;
5758 *drop = 1;
5759 }
5760 }
5761 }
5762
5763 /**********************************************************/
fta_create_wgs_dbtag(objects::CBioseq & bioseq,char * submitter_seqid,char * prefix,Int4 seqtype)5764 static void fta_create_wgs_dbtag(objects::CBioseq &bioseq,
5765 char* submitter_seqid,
5766 char* prefix, Int4 seqtype)
5767 {
5768 char* dbname;
5769
5770 dbname = (char*) MemNew(11);
5771 if(seqtype == 0 || seqtype == 1 || seqtype == 7)
5772 StringCpy(dbname, "WGS:");
5773 else if(seqtype == 4 || seqtype == 5 || seqtype == 8 || seqtype == 9)
5774 StringCpy(dbname, "TSA:");
5775 else
5776 StringCpy(dbname, "TLS:");
5777 StringCat(dbname, prefix);
5778
5779 CRef<objects::CSeq_id> gen_id(new objects::CSeq_id);
5780 objects::CDbtag &tag = gen_id->SetGeneral();
5781 tag.SetTag().SetStr(submitter_seqid);
5782 tag.SetDb(dbname);
5783 bioseq.SetId().push_back(gen_id);
5784 }
5785
5786 /**********************************************************/
fta_create_wgs_seqid(objects::CBioseq & bioseq,IndexblkPtr ibp,Parser::ESource source)5787 static void fta_create_wgs_seqid(objects::CBioseq &bioseq,
5788 IndexblkPtr ibp, Parser::ESource source)
5789 {
5790 TokenBlkPtr tbp;
5791 char* prefix;
5792 char* p;
5793 Int4 seqtype;
5794 Int4 i;
5795
5796 if(!ibp || !ibp->submitter_seqid)
5797 return;
5798
5799 prefix = NULL;
5800
5801 seqtype = fta_if_wgs_acc(ibp->acnum);
5802 if(seqtype == 0 || seqtype == 3 || seqtype == 4 || seqtype == 6 ||
5803 seqtype == 10 || seqtype == 12)
5804 {
5805 ErrPostEx(SEV_REJECT, ERR_SOURCE_SubmitterSeqidNotAllowed,
5806 "WGS/TLS/TSA master records are not allowed to have /submitter_seqid qualifiers, only contigs and scaffolds. Entry dropped.");
5807 ibp->drop = 1;
5808 return;
5809 }
5810
5811 if(seqtype == 1 || seqtype == 5 || seqtype == 7 || seqtype == 8 ||
5812 seqtype == 9 || seqtype == 11)
5813 {
5814 prefix = StringSave(ibp->acnum);
5815 if(prefix[4] >= '0' && prefix[4] <= '9')
5816 prefix[6] = '\0';
5817 else
5818 prefix[8] = '\0';
5819 fta_create_wgs_dbtag(bioseq, ibp->submitter_seqid, prefix, seqtype);
5820 MemFree(prefix);
5821 return;
5822 }
5823
5824 for(tbp = ibp->secaccs; tbp != NULL; tbp = tbp->next)
5825 {
5826 if(tbp->str[0] == '-')
5827 continue;
5828
5829 if(prefix == NULL)
5830 prefix = StringSave(tbp->str);
5831 else
5832 {
5833 i = (prefix[4] >= '0' && prefix[4] <= '9') ? 6 : 8;
5834 if(StringNCmp(prefix, tbp->str, i) != 0)
5835 break;
5836 }
5837 }
5838
5839 if(tbp == NULL && prefix != NULL)
5840 {
5841 seqtype = fta_if_wgs_acc(prefix);
5842 if(seqtype == 0 || seqtype == 1 || seqtype == 4 || seqtype == 5 ||
5843 seqtype == 7 || seqtype == 8 || seqtype == 9 || seqtype == 10 ||
5844 seqtype == 11)
5845 {
5846 if(prefix[4] >= '0' && prefix[4] <= '9')
5847 prefix[6] = '\0';
5848 else
5849 prefix[8] = '\0';
5850 fta_create_wgs_dbtag(bioseq, ibp->submitter_seqid, prefix,
5851 seqtype);
5852 MemFree(prefix);
5853 return;
5854 }
5855 }
5856
5857 if(prefix != NULL)
5858 {
5859 MemFree(prefix);
5860 prefix = NULL;
5861 }
5862
5863 if(bioseq.GetInst().IsSetExt() && bioseq.GetInst().GetExt().IsDelta())
5864 {
5865 objects::CDelta_ext::Tdata deltas =
5866 bioseq.GetInst().GetExt().GetDelta();
5867 objects::CDelta_ext::Tdata::iterator delta;
5868
5869 for(delta = deltas.begin(); delta != deltas.end(); delta++)
5870 {
5871 const objects::CSeq_id *id = nullptr;
5872
5873 if(!(*delta)->IsLoc())
5874 continue;
5875
5876 const objects::CSeq_loc &locs = (*delta)->GetLoc();
5877 objects::CSeq_loc_CI ci(locs);
5878
5879 for(; ci; ++ci)
5880 {
5881 CConstRef<objects::CSeq_loc> loc =
5882 ci.GetRangeAsSeq_loc();
5883 if(!loc->IsInt())
5884 continue;
5885 id = &ci.GetSeq_id();
5886 if(!id)
5887 break;
5888 if(!id->IsGenbank() && !id->IsEmbl() && !id->IsDdbj() &&
5889 !id->IsOther() && !id->IsTpg() && !id->IsTpe() &&
5890 !id->IsTpd())
5891 break;
5892
5893 const objects::CTextseq_id *text_id =
5894 id->GetTextseq_Id();
5895 if(text_id == nullptr || !text_id->IsSetAccession() ||
5896 text_id->GetAccession().empty())
5897 break;
5898
5899 p = (char *) text_id->GetAccession().c_str();
5900 if(prefix == NULL)
5901 prefix = StringSave(p);
5902 else
5903 {
5904 i = (prefix[4] >= '0' && prefix[4] <= '9') ? 6 : 8;
5905 if(StringNCmp(prefix, p, i) != 0)
5906 break;
5907 }
5908 }
5909 if(ci)
5910 break;
5911 }
5912
5913 if(delta == deltas.end() && prefix != NULL)
5914 {
5915 seqtype = fta_if_wgs_acc(prefix);
5916 if(seqtype == 0 || seqtype == 1 || seqtype == 4 || seqtype == 5 ||
5917 seqtype == 7 || seqtype == 8 || seqtype == 9 || seqtype == 10 ||
5918 seqtype == 11)
5919 {
5920 if(prefix[4] >= '0' && prefix[4] <= '9')
5921 prefix[6] = '\0';
5922 else
5923 prefix[8] = '\0';
5924 fta_create_wgs_dbtag(bioseq, ibp->submitter_seqid, prefix,
5925 seqtype);
5926 MemFree(prefix);
5927 return;
5928 }
5929 }
5930
5931 if(prefix != NULL)
5932 {
5933 MemFree(prefix);
5934 prefix = NULL;
5935 }
5936
5937 ErrPostEx(SEV_ERROR, ERR_SOURCE_SubmitterSeqidDropped,
5938 "Could not determine project code for what appears to be a WGS/TLS/TSA scaffold record. /submitter_seqid dropped.");
5939 return;
5940 }
5941
5942 if((source == Parser::ESource::EMBL || source == Parser::ESource::DDBJ) && ibp->is_tsa)
5943 {
5944 ErrPostEx(SEV_ERROR, ERR_SOURCE_SubmitterSeqidIgnored,
5945 "Submitter sequence identifiers for non-project-based TSA records are not supported. /submitter_seqid \"%s\" has been dropped.",
5946 ibp->submitter_seqid);
5947 return;
5948 }
5949
5950 ErrPostEx(SEV_REJECT, ERR_SOURCE_SubmitterSeqidNotAllowed,
5951 "Only WGS/TLS/TSA related records (contigs and scaffolds) are allowed to have /submitter_seqid qualifier. This \"%s\" is not one of them. Entry dropped.",
5952 ibp->acnum);
5953 ibp->drop = 1;
5954 }
5955
/**********************************************************
 *
 *   void LoadFeat(pp, entry, bioseq):
 *
 *      Parses the feature table of the current entry and
 *   stores the resulting features in "bioseq".
 *
 *                                              5-4-93
 *
 **********************************************************/
LoadFeat(ParserPtr pp,DataBlkPtr entry,objects::CBioseq & bioseq)5963 void LoadFeat(ParserPtr pp, DataBlkPtr entry, objects::CBioseq& bioseq)
5964 {
5965 DataBlkPtr dab;
5966 DataBlkPtr dabnext;
5967 DataBlkPtr dbp;
5968 DataBlkPtr tdbp;
5969 FeatBlkPtr fbp;
5970
5971 IndexblkPtr ibp;
5972 Int4 col_data;
5973 Int2 type;
5974 Int4 i = 0;
5975 CRef<objects::CSeq_id> pat_seq_id;
5976
5977 xinstall_gbparse_range_func(pp, flat2asn_range_func);
5978
5979 ibp = pp->entrylist[pp->curindx];
5980
5981 CRef<objects::CSeq_id> seq_id =
5982 MakeAccSeqId(ibp->acnum, pp->seqtype, pp->accver, ibp->vernum,
5983 true, ibp->is_tpa);
5984 if(pp->source == Parser::ESource::USPTO)
5985 {
5986 pat_seq_id = new objects::CSeq_id;
5987 CRef<objects::CPatent_seq_id> pat_id = MakeUsptoPatSeqId(ibp->acnum);
5988 pat_seq_id->SetPatent(*pat_id);
5989 }
5990
5991 if (!seq_id) {
5992 if (ibp->acnum && !NStr::IsBlank(ibp->acnum)) {
5993 seq_id = Ref(new CSeq_id(CSeq_id::e_Local, ibp->acnum));
5994 }
5995 else if (pp->mode == Parser::EMode::Relaxed) {
5996 seq_id = Ref(new CSeq_id(CSeq_id::e_Local, ibp->locusname));
5997 }
5998 }
5999
6000 TSeqIdList ids;
6001 ids.push_back(seq_id);
6002
6003 if(pp->format == Parser::EFormat::GenBank)
6004 {
6005 col_data = ParFlat_COL_DATA;
6006 type = ParFlat_FEATURES;
6007 }
6008 else if(pp->format == Parser::EFormat::XML)
6009 {
6010 col_data = 0;
6011 type = XML_FEATURES;
6012 }
6013 else
6014 {
6015 col_data = ParFlat_COL_DATA_EMBL;
6016 type = ParFlat_FH;
6017 }
6018
6019 /* Find feature already isolated in a "block"
6020 * The key, location and qualifiers will be isolated to
6021 * a FeatBlk at the first step of ParseFeatureBlock, which
6022 * parses a single feature at a time.
6023 * -Karl
6024 */
6025 if(pp->format == Parser::EFormat::XML)
6026 dab = XMLLoadFeatBlk(entry->offset, ibp->xip);
6027 else
6028 dab = TrackNodeType(entry, type);
6029 for(dbp = dab; dbp != NULL; dbp = dbp->next)
6030 {
6031 if(dbp->type != type)
6032 continue;
6033
6034 /* Parsing each feature subblock to FeatBlkPtr, fbp
6035 * it also checks semantics of qualifiers and keys
6036 */
6037 if(pp->format == Parser::EFormat::XML)
6038 XMLParseFeatureBlock(pp->debug, (DataBlkPtr) dbp->data, pp->source);
6039 else
6040 ParseFeatureBlock(ibp, pp->debug, (DataBlkPtr) dbp->data, pp->source, pp->format);
6041
6042 dbp->data = (DataBlkPtr) fta_sort_features((DataBlkPtr) dbp->data, false);
6043 fta_check_pseudogene_qual((DataBlkPtr) dbp->data);
6044 fta_check_old_locus_tags((DataBlkPtr) dbp->data, &ibp->drop);
6045 fta_check_compare_qual((DataBlkPtr) dbp->data, ibp->is_tpa);
6046 tdbp = (DataBlkPtr) dbp->data;
6047 for(i = 0; tdbp != NULL; i++, tdbp = tdbp->next)
6048 fta_remove_dup_quals((FeatBlkPtr) tdbp->data);
6049 fta_remove_dup_feats((DataBlkPtr) dbp->data);
6050 for(tdbp = (DataBlkPtr) dbp->data; tdbp != NULL; tdbp = tdbp->next)
6051 fta_check_rpt_unit_range((FeatBlkPtr) tdbp->data, ibp->bases);
6052 fta_check_multiple_locus_tag((DataBlkPtr) dbp->data, &ibp->drop);
6053 if(ibp->is_tpa || ibp->is_tsa || ibp->is_tls)
6054 fta_check_non_tpa_tsa_tls_locations((DataBlkPtr) dbp->data, ibp);
6055 fta_check_replace_regulatory((DataBlkPtr) dbp->data, &ibp->drop);
6056 dbp->data = fta_sort_features((DataBlkPtr) dbp->data, true);
6057 }
6058
6059 if(i > 1 && ibp->is_mga)
6060 {
6061 ErrPostEx(SEV_REJECT, ERR_FEATURE_MoreThanOneCAGEFeat,
6062 "CAGE records are allowed to have only one feature, and it must be the \"source\" one. Entry dropped.");
6063 ibp->drop = 1;
6064 }
6065
6066 if(ibp->drop == 0)
6067 CollectGapFeats(entry, dab, pp, type);
6068
6069 TSeqFeatList seq_feats;
6070 if(ibp->drop == 0)
6071 ParseSourceFeat(pp, dab, ids, type, bioseq, seq_feats);
6072
6073 if (seq_feats.empty())
6074 {
6075 ibp->drop = 1;
6076 for(; dab != NULL; dab = dabnext)
6077 {
6078 dabnext = dab->next;
6079 FreeFeatBlk((DataBlkPtr) dab->data, pp->format);
6080 if(pp->format == Parser::EFormat::XML)
6081 MemFree(dab);
6082 }
6083 xinstall_gbparse_range_func(NULL, NULL);
6084 return;
6085 }
6086
6087 if(ibp->submitter_seqid != NULL)
6088 fta_create_wgs_seqid(bioseq, ibp, pp->source);
6089
6090 objects::CSeq_descr::Tdata& descr_list = bioseq.SetDescr().Set();
6091 for (objects::CSeq_descr::Tdata::iterator descr = descr_list.begin(); descr != descr_list.end();)
6092 {
6093 if (!(*descr)->IsSource())
6094 {
6095 ++descr;
6096 continue;
6097 }
6098
6099 descr = descr_list.erase(descr);
6100 }
6101
6102 CRef<objects::CSeqdesc> descr_src(new objects::CSeqdesc);
6103 descr_src->SetSource(seq_feats.front()->SetData().SetBiosrc());
6104
6105 descr_list.push_back(descr_src);
6106 seq_feats.pop_front();
6107
6108 fta_get_gcode_from_biosource(descr_src->GetSource(), ibp);
6109
6110 for(; dab != NULL; dab = dabnext)
6111 {
6112 dabnext = dab->next;
6113 if(dab->type != type)
6114 {
6115 if(pp->format == Parser::EFormat::XML)
6116 MemFree(dab);
6117 continue;
6118 }
6119
6120 for(dbp = (DataBlkPtr) dab->data; dbp != NULL; dbp = dbp->next)
6121 {
6122 if(dbp->drop == 1)
6123 continue;
6124
6125 fbp = (FeatBlkPtr) dbp->data;
6126 if(StringCmp(fbp->key, "source") == 0 ||
6127 StringCmp(fbp->key, "assembly_gap") == 0 ||
6128 (StringCmp(fbp->key, "gap") == 0 &&
6129 pp->source != Parser::ESource::DDBJ && pp->source != Parser::ESource::EMBL))
6130 continue;
6131
6132 fta_sort_quals(fbp, pp->qamode);
6133 CRef<objects::CSeq_feat> feat = ProcFeatBlk(pp, fbp, ids);
6134 if (feat.Empty())
6135 {
6136 if(StringCmp(fbp->key, "CDS") == 0)
6137 {
6138 ErrPostEx(SEV_ERROR, ERR_FEATURE_LocationParsing,
6139 "CDS feature has unparsable location. Entry dropped. Location = [%s].",
6140 fbp->location);
6141 ibp->drop = 1;
6142 }
6143 continue;
6144 }
6145
6146 if(StringCmp(fbp->key, "mobile_element") == 0 &&
6147 !fta_check_mobile_element(*feat))
6148 {
6149 ibp->drop = 1;
6150 continue;
6151 }
6152
6153 fta_check_artificial_location(*feat, fbp->key);
6154
6155 if(CheckForeignLoc(feat->GetLocation(),
6156 (pp->source == Parser::ESource::USPTO) ? *pat_seq_id : *seq_id))
6157 {
6158 ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck,
6159 "Location pointing outside the entry [%s]",
6160 fbp->location);
6161
6162 if (feat->GetData().IsImp())
6163 {
6164 const objects::CImp_feat& imp_feat = feat->GetData().GetImp();
6165 if (imp_feat.GetKey() == "intron" ||
6166 imp_feat.GetKey() == "exon")
6167 {
6168 /* foreign introns and exons wouldn't be parsed
6169 */
6170 feat.Reset();
6171 continue;
6172 }
6173 }
6174 }
6175
6176 FilterDb_xref(*feat, pp->source);
6177
6178 i = FTASeqLocCheck(feat->GetLocation(), ibp->acnum);
6179 if(i == 0)
6180 {
6181 ErrPostEx(SEV_WARNING, ERR_LOCATION_FailedCheck,
6182 fbp->location);
6183
6184 if(pp->debug)
6185 seq_feats.push_back(feat);
6186 else
6187 {
6188 feat.Reset();
6189 continue;
6190 }
6191 }
6192 else
6193 {
6194 if(i == 1)
6195 {
6196 if (feat->IsSetExcept_text() && feat->GetExcept_text() == "trans-splicing")
6197 ErrPostEx(SEV_INFO,
6198 ERR_LOCATION_TransSpliceMixedStrand,
6199 "Mixed strands in SeqLoc of /trans_splicing feature: %s",
6200 fbp->location);
6201 else
6202 ErrPostEx(SEV_WARNING, ERR_LOCATION_MixedStrand,
6203 "Mixed strands in SeqLoc: %s", fbp->location);
6204 }
6205
6206 seq_feats.push_back(feat);
6207 }
6208 }
6209 FreeFeatBlk((DataBlkPtr) dab->data, pp->format);
6210 if(pp->format == Parser::EFormat::XML)
6211 MemFree(dab);
6212 }
6213
6214 if (!fta_perform_operon_checks(pp, seq_feats, ibp))
6215 {
6216 ibp->drop = 1;
6217 seq_feats.clear();
6218 xinstall_gbparse_range_func(NULL, NULL);
6219 return;
6220 }
6221
6222 bool stop = false;
6223 NON_CONST_ITERATE(TSeqFeatList, feat, seq_feats)
6224 {
6225 if (!(*feat)->GetData().IsImp())
6226 continue;
6227
6228 const objects::CImp_feat& imp_feat = (*feat)->GetData().GetImp();
6229
6230 if (imp_feat.IsSetKey() &&
6231 StringStr(imp_feat.GetKey().c_str(), "RNA") != NULL)
6232 {
6233 if (imp_feat.GetKey() == "ncRNA" && !fta_check_ncrna(*(*feat)))
6234 {
6235 stop = true;
6236 break;
6237 }
6238
6239 GetRnaRef(*(*feat), bioseq, pp->source, pp->accver);
6240 }
6241 }
6242
6243 if (stop)
6244 {
6245 ibp->drop = 1;
6246 seq_feats.clear();
6247 xinstall_gbparse_range_func(NULL, NULL);
6248 return;
6249 }
6250
6251 SeqFeatPub(pp, entry, seq_feats, ids, col_data, ibp);
6252 if (seq_feats.empty() && ibp->drop != 0)
6253 {
6254 xinstall_gbparse_range_func(NULL, NULL);
6255 return;
6256 }
6257
6258 /* ImpFeatPub() call will be removed in asn 4.0
6259 */
6260 ImpFeatPub(pp, entry, seq_feats, *seq_id, col_data, ibp);
6261
6262 xinstall_gbparse_range_func(NULL, NULL);
6263 if (seq_feats.empty())
6264 return;
6265
6266 CRef<objects::CSeq_annot> annot(new objects::CSeq_annot);
6267 annot->SetData().SetFtable().swap(seq_feats);
6268
6269 bioseq.SetAnnot().push_back(annot);
6270 }
6271
6272 /**********************************************************/
GetBiomolFromToks(char * mRNA,char * tRNA,char * rRNA,char * snRNA,char * scRNA,char * uRNA,char * snoRNA)6273 static Uint1 GetBiomolFromToks(char* mRNA, char* tRNA, char* rRNA,
6274 char* snRNA, char* scRNA, char* uRNA,
6275 char* snoRNA)
6276 {
6277 char* p = NULL;
6278
6279 if(mRNA != NULL)
6280 p = mRNA;
6281 if(p == NULL || (tRNA != NULL && tRNA < p))
6282 p = tRNA;
6283 if(p == NULL || (rRNA != NULL && rRNA < p))
6284 p = rRNA;
6285 if(p == NULL || (snRNA != NULL && snRNA < p))
6286 p = snRNA;
6287 if(p == NULL || (scRNA != NULL && scRNA < p))
6288 p = scRNA;
6289 if(p == NULL || (uRNA != NULL && uRNA < p))
6290 p = uRNA;
6291 if(p == NULL || (snoRNA != NULL && snoRNA < p))
6292 p = snoRNA;
6293
6294 if(p == mRNA)
6295 return(Seq_descr_GIBB_mol_mRNA);
6296 if(p == tRNA)
6297 return(Seq_descr_GIBB_mol_tRNA);
6298 if(p == rRNA)
6299 return(Seq_descr_GIBB_mol_rRNA);
6300 if(p == snRNA || p == uRNA)
6301 return(Seq_descr_GIBB_mol_snRNA);
6302 if(p == snoRNA)
6303 return(Seq_descr_GIBB_mol_snoRNA);
6304 return(Seq_descr_GIBB_mol_scRNA);
6305 }
6306
6307 /**********************************************************/
GetFlatBiomol(int & biomol,Uint1 tech,char * molstr,ParserPtr pp,DataBlkPtr entry,const objects::COrg_ref * org_ref)6308 void GetFlatBiomol(int& biomol, Uint1 tech, char* molstr, ParserPtr pp,
6309 DataBlkPtr entry, const objects::COrg_ref* org_ref)
6310 {
6311 Int4 genomic;
6312 char* offset;
6313 Char c;
6314 DataBlkPtr dbp;
6315
6316 Int2 count;
6317 Int2 i;
6318 EntryBlkPtr ebp;
6319 IndexblkPtr ibp;
6320 const char *p;
6321
6322 char* q;
6323 char* r;
6324 char* mRNA = NULL;
6325 char* tRNA = NULL;
6326 char* rRNA = NULL;
6327 char* snRNA = NULL;
6328 char* scRNA = NULL;
6329 char* uRNA = NULL;
6330 char* snoRNA = NULL;
6331 bool stage;
6332 bool techok;
6333 bool same;
6334 bool is_syn;
6335
6336 ebp = (EntryBlkPtr) entry->data;
6337
6338 objects::CBioseq& bioseq = ebp->seq_entry->SetSeq();
6339 ibp = pp->entrylist[pp->curindx];
6340
6341 if(ibp->is_prot)
6342 {
6343 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_aa);
6344 biomol = 8;
6345 return;
6346 }
6347
6348 if(StringCmp(ibp->division, "SYN") == 0 ||
6349 (org_ref != NULL && org_ref->IsSetOrgname() && org_ref->GetOrgname().IsSetDiv() &&
6350 org_ref->GetOrgname().GetDiv() == "SYN"))
6351 is_syn = true;
6352 else
6353 is_syn = false;
6354
6355 r = NULL;
6356 c = '\0';
6357 if(ibp->moltype != NULL)
6358 {
6359 if(pp->source == Parser::ESource::DDBJ && StringNICmp(molstr, "PRT", 3) == 0)
6360 return;
6361
6362 biomol = Seq_descr_GIBB_mol_genomic;
6363 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_dna);
6364
6365 if(molstr != NULL)
6366 {
6367 q = molstr;
6368 r = molstr;
6369 if(pp->format == Parser::EFormat::EMBL || pp->format == Parser::EFormat::XML)
6370 while(*r != ';' && *r != '\n' && *r != '\0')
6371 r++;
6372 else
6373 {
6374 while(*r != ';' && *r != ' ' && *r != '\t' && *r != '\n' &&
6375 *r != '\0')
6376 r++;
6377 if(r - molstr > 10)
6378 r = molstr + 10;
6379 }
6380 c = *r;
6381 *r = '\0';
6382 if(q == r)
6383 q = (char*) "???";
6384 }
6385 else
6386 q = (char*) "???";
6387
6388 same = true;
6389 if(StringCmp(ibp->moltype, "genomic DNA") == 0)
6390 {
6391 biomol = Seq_descr_GIBB_mol_genomic;
6392 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_dna);
6393
6394 if(pp->source == Parser::ESource::EMBL)
6395 {
6396 if(StringICmp(q, "DNA") != 0 &&
6397 StringICmp(ibp->moltype, q) != 0)
6398 same = false;
6399 }
6400 else if(StringICmp(q, "DNA") != 0)
6401 same = false;
6402 }
6403 else if(StringCmp(ibp->moltype, "genomic RNA") == 0)
6404 {
6405 biomol = Seq_descr_GIBB_mol_genomic;
6406 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6407
6408 if (pp->source == Parser::ESource::EMBL)
6409 {
6410 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6411 same = false;
6412 }
6413 else if(StringICmp(q, "RNA") != 0)
6414 same = false;
6415 }
6416 else if(StringCmp(ibp->moltype, "mRNA") == 0)
6417 {
6418 biomol = Seq_descr_GIBB_mol_mRNA;
6419 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6420
6421 if(pp->source == Parser::ESource::EMBL)
6422 {
6423 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6424 same = false;
6425 }
6426 else if(StringICmp(q, "mRNA") != 0)
6427 same = false;
6428 }
6429 else if(StringCmp(ibp->moltype, "tRNA") == 0)
6430 {
6431 biomol = Seq_descr_GIBB_mol_tRNA;
6432 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6433
6434 if(pp->source == Parser::ESource::EMBL)
6435 {
6436 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6437 same = false;
6438 }
6439 else if(StringICmp(q, "tRNA") != 0)
6440 same = false;
6441 }
6442 else if(StringCmp(ibp->moltype, "rRNA") == 0)
6443 {
6444 biomol = Seq_descr_GIBB_mol_rRNA;
6445 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6446
6447 if(pp->source == Parser::ESource::EMBL)
6448 {
6449 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6450 same = false;
6451 }
6452 else if(StringICmp(q, "rRNA") != 0)
6453 same = false;
6454 }
6455 else if(StringCmp(ibp->moltype, "snoRNA") == 0)
6456 {
6457 biomol = Seq_descr_GIBB_mol_snoRNA;
6458 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6459
6460 if(pp->source == Parser::ESource::EMBL)
6461 {
6462 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6463 same = false;
6464 }
6465 else if(StringICmp(q, "snoRNA") != 0)
6466 same = false;
6467 }
6468 else if(StringCmp(ibp->moltype, "snRNA") == 0)
6469 {
6470 biomol = Seq_descr_GIBB_mol_snRNA;
6471 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6472
6473 if(pp->source == Parser::ESource::EMBL)
6474 {
6475 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6476 same = false;
6477 }
6478 else if(StringICmp(q, "snRNA") != 0)
6479 same = false;
6480 }
6481 else if(StringCmp(ibp->moltype, "scRNA") == 0)
6482 {
6483 biomol = Seq_descr_GIBB_mol_scRNA;
6484 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6485
6486 if(pp->source == Parser::ESource::EMBL)
6487 {
6488 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6489 same = false;
6490 }
6491 else if(StringICmp(q, "scRNA") != 0)
6492 same = false;
6493 }
6494 else if(StringCmp(ibp->moltype, "pre-RNA") == 0)
6495 {
6496 biomol = Seq_descr_GIBB_mol_preRNA;
6497 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6498
6499 if(pp->source == Parser::ESource::EMBL)
6500 {
6501 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6502 same = false;
6503 }
6504 else if(StringICmp(q, "RNA") != 0)
6505 same = false;
6506 }
6507 else if(StringCmp(ibp->moltype, "pre-mRNA") == 0)
6508 {
6509 biomol = Seq_descr_GIBB_mol_preRNA;
6510 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6511
6512 if(pp->source == Parser::ESource::EMBL)
6513 {
6514 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6515 same = false;
6516 }
6517 else if(StringICmp(q, "RNA") != 0)
6518 same = false;
6519 }
6520 else if(StringCmp(ibp->moltype, "other RNA") == 0)
6521 {
6522 if(is_syn)
6523 biomol = Seq_descr_GIBB_mol_other_genetic;
6524 else
6525 biomol = Seq_descr_GIBB_mol_other;
6526 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6527
6528 if (pp->source == Parser::ESource::EMBL)
6529 {
6530 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6531 same = false;
6532 }
6533 else if(StringICmp(q, "RNA") != 0)
6534 same = false;
6535 }
6536 else if(StringCmp(ibp->moltype, "other DNA") == 0)
6537 {
6538 if(is_syn)
6539 biomol = Seq_descr_GIBB_mol_other_genetic;
6540 else
6541 biomol = Seq_descr_GIBB_mol_other;
6542 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_dna);
6543
6544 if (pp->source == Parser::ESource::EMBL)
6545 {
6546 if(StringICmp(q, "DNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6547 same = false;
6548 }
6549 else if(StringICmp(q, "DNA") != 0)
6550 same = false;
6551 }
6552 else if(StringCmp(ibp->moltype, "unassigned RNA") == 0)
6553 {
6554 if(is_syn)
6555 biomol = Seq_descr_GIBB_mol_other_genetic;
6556 else
6557 biomol = Seq_descr_GIBB_mol_unknown;
6558 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6559
6560 if (pp->source == Parser::ESource::EMBL)
6561 {
6562 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6563 same = false;
6564 }
6565 else if(StringICmp(q, "RNA") != 0)
6566 same = false;
6567 }
6568 else if(StringCmp(ibp->moltype, "unassigned DNA") == 0)
6569 {
6570 if(is_syn)
6571 biomol = Seq_descr_GIBB_mol_other_genetic;
6572 else
6573 biomol = Seq_descr_GIBB_mol_unknown;
6574 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_dna);
6575
6576 if (pp->source == Parser::ESource::EMBL)
6577 {
6578 if(StringICmp(q, "DNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6579 same = false;
6580 }
6581 else if(StringICmp(q, "DNA") != 0)
6582 same = false;
6583 }
6584 else if(StringCmp(ibp->moltype, "viral cRNA") == 0)
6585 {
6586 biomol = Seq_descr_GIBB_mol_cRNA;
6587 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6588
6589 if (pp->source == Parser::ESource::EMBL)
6590 {
6591 if(StringICmp(q, "RNA") != 0 &&
6592 StringICmp(q, "cRNA") != 0 &&
6593 StringICmp(ibp->moltype, q) != 0)
6594 same = false;
6595 }
6596 else if(StringICmp(q, "cRNA") != 0)
6597 same = false;
6598 }
6599 else if(StringCmp(ibp->moltype, "transcribed RNA") == 0)
6600 {
6601 biomol = Seq_descr_GIBB_mol_trRNA;
6602 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6603
6604 if (pp->source == Parser::ESource::EMBL)
6605 {
6606 if(StringICmp(q, "RNA") != 0 && StringICmp(ibp->moltype, q) != 0)
6607 same = false;
6608 }
6609 else if(StringICmp(q, "RNA") != 0)
6610 same = false;
6611 }
6612 else
6613 {
6614 ErrPostEx(SEV_REJECT, ERR_SOURCE_InvalidMolType,
6615 "Invalid /mol_type value \"%s\" provided in source features. Entry dropped.",
6616 ibp->moltype);
6617 ibp->drop = 1;
6618 if(molstr != NULL)
6619 *r = c;
6620 return;
6621 }
6622
6623 if(!same)
6624 {
6625 if(ibp->embl_new_ID)
6626 {
6627 ErrPostEx(SEV_REJECT, ERR_SOURCE_MolTypesDisagree,
6628 "Molecule type \"%s\" from the ID line disagrees with \"%s\" from the /mol_type qualifier.",
6629 q, ibp->moltype);
6630 ibp->drop = 1;
6631 if(molstr != NULL)
6632 *r = c;
6633 return;
6634 }
6635 ErrPostEx(SEV_ERROR, ERR_SOURCE_MolTypesDisagree,
6636 "Molecule type \"%s\" from the ID/LOCUS line disagrees with \"%s\" from the /mol_type qualifier.",
6637 q, ibp->moltype);
6638 }
6639
6640 if ((tech == objects::CMolInfo::eTech_sts || tech == objects::CMolInfo::eTech_htgs_0 ||
6641 tech == objects::CMolInfo::eTech_htgs_1 || tech == objects::CMolInfo::eTech_htgs_2 ||
6642 tech == objects::CMolInfo::eTech_htgs_3 || tech == objects::CMolInfo::eTech_wgs ||
6643 tech == objects::CMolInfo::eTech_survey) &&
6644 StringCmp(ibp->moltype, "genomic DNA") != 0)
6645 techok = false;
6646 else if ((tech == objects::CMolInfo::eTech_est || tech == objects::CMolInfo::eTech_fli_cdna ||
6647 tech == objects::CMolInfo::eTech_htc) && StringCmp(ibp->moltype, "mRNA") != 0)
6648 techok = false;
6649 else
6650 techok = true;
6651
6652 if(!techok)
6653 {
6654 if(tech == objects::CMolInfo::eTech_est)
6655 p = "EST";
6656 else if(tech == objects::CMolInfo::eTech_fli_cdna)
6657 p = "fli-cDNA";
6658 else if(tech == objects::CMolInfo::eTech_htc)
6659 p = "HTC";
6660 else if(tech == objects::CMolInfo::eTech_sts)
6661 p = "STS";
6662 else if(tech == objects::CMolInfo::eTech_wgs)
6663 p = "WGS";
6664 else if(tech == objects::CMolInfo::eTech_tsa)
6665 p = "TSA";
6666 else if(tech == objects::CMolInfo::eTech_targeted)
6667 p = "TLS";
6668 else if(tech == objects::CMolInfo::eTech_survey)
6669 p = "GSS";
6670 else
6671 p = "HTG";
6672 ErrPostEx(SEV_ERROR, ERR_SOURCE_MolTypeSeqTypeConflict,
6673 "Molecule type \"%s\" from the /mol_type qualifier disagrees with this record's sequence type: \"%s\".",
6674 ibp->moltype, p);
6675 }
6676
6677 if(molstr != NULL)
6678 *r = c;
6679 return;
6680 }
6681
6682 if(tech == objects::CMolInfo::eTech_est)
6683 {
6684 biomol = Seq_descr_GIBB_mol_mRNA;
6685 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6686 return;
6687 }
6688
6689 if(pp->source == Parser::ESource::DDBJ || pp->source == Parser::ESource::LANL ||
6690 pp->source == Parser::ESource::NCBI)
6691 {
6692 biomol = Seq_descr_GIBB_mol_genomic;
6693 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_dna);
6694 }
6695 else
6696 {
6697 biomol = Unknown;
6698 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_na);
6699 }
6700
6701 if(molstr == NULL)
6702 genomic = -1;
6703 else
6704 {
6705 genomic = CheckNA(molstr);
6706 if(genomic < 0 && pp->source == Parser::ESource::DDBJ)
6707 genomic = CheckNADDBJ(molstr);
6708 }
6709
6710 if(genomic < 0 || genomic > 20)
6711 {
6712 if(pp->source == Parser::ESource::EMBL && StringNICmp(molstr, "XXX", 3) == 0)
6713 return;
6714 if(pp->source == Parser::ESource::DDBJ && StringNICmp(molstr, "PRT", 3) == 0)
6715 return;
6716 ibp->drop = 1;
6717 q = molstr;
6718 c = '\0';
6719 if(q != NULL)
6720 {
6721 if(pp->format == Parser::EFormat::EMBL)
6722 while(*q != ';' && *q != '\n' && *q != '\0')
6723 q++;
6724 else
6725 {
6726 while(*q != ';' && *q != ' ' && *q != '\t' && *q != '\n' &&
6727 *q != '\0')
6728 q++;
6729 if(q - molstr > 10)
6730 q = molstr + 10;
6731 }
6732
6733 c = *q;
6734 *q = '\0';
6735 }
6736 if(pp->source == Parser::ESource::DDBJ)
6737 p = "DDBJ";
6738 else if(pp->source == Parser::ESource::EMBL)
6739 p = "EMBL";
6740 else if(pp->source == Parser::ESource::LANL)
6741 p = "LANL";
6742 else
6743 p = "NCBI";
6744
6745 ErrPostEx(SEV_FATAL, ERR_FORMAT_InvalidMolType,
6746 "Molecule type \"%s\" from LOCUS/ID line is not legal value for records from source \"%s\". Sequence rejected.",
6747 (molstr == NULL) ? "???" : molstr, p);
6748 if(q != NULL)
6749 *q = c;
6750 return;
6751 }
6752
6753 if(genomic < 2)
6754 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_na);
6755 else if(genomic > 1 && genomic < 6)
6756 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_dna);
6757 else
6758 bioseq.SetInst().SetMol(objects::CSeq_inst::eMol_rna);
6759
6760 if(genomic != 6) /* Not just RNA */
6761 {
6762 if(genomic < 2) /* " ", "NA" or "cDNA" */
6763 biomol = Seq_descr_GIBB_mol_genomic;
6764 else if(genomic == 2) /* DNA */
6765 biomol = Seq_descr_GIBB_mol_genomic;
6766 else if(genomic == 3) /* genomic DNA */
6767 biomol = Seq_descr_GIBB_mol_genomic;
6768 else if(genomic == 4) /* other DNA */
6769 {
6770 if(is_syn)
6771 biomol = Seq_descr_GIBB_mol_other_genetic;
6772 else
6773 biomol = Seq_descr_GIBB_mol_other;
6774 }
6775 else if(genomic == 5) /* unassigned DNA */
6776 {
6777 if(is_syn)
6778 biomol = Seq_descr_GIBB_mol_other_genetic;
6779 else
6780 biomol = Seq_descr_GIBB_mol_unknown;
6781 }
6782 else if(genomic == 7) /* mRNA */
6783 biomol = Seq_descr_GIBB_mol_mRNA;
6784 else if(genomic == 8) /* rRNA */
6785 biomol = Seq_descr_GIBB_mol_rRNA;
6786 else if(genomic == 9) /* tRNA */
6787 biomol = Seq_descr_GIBB_mol_tRNA;
6788 else if(genomic == 10 || genomic == 12) /* uRNA -> snRNA */
6789 biomol = Seq_descr_GIBB_mol_snRNA;
6790 else if(genomic == 11) /* scRNA */
6791 biomol = Seq_descr_GIBB_mol_scRNA;
6792 else if(genomic == 13) /* snoRNA */
6793 biomol = Seq_descr_GIBB_mol_snoRNA;
6794 else if(genomic == 14) /* pre-RNA */
6795 biomol = Seq_descr_GIBB_mol_preRNA;
6796 else if(genomic == 15) /* pre-mRNA */
6797 biomol = Seq_descr_GIBB_mol_preRNA;
6798 else if(genomic == 16) /* genomic RNA */
6799 biomol = Seq_descr_GIBB_mol_genomic;
6800 else if(genomic == 17) /* other RNA */
6801 {
6802 if(is_syn)
6803 biomol = Seq_descr_GIBB_mol_other_genetic;
6804 else
6805 biomol = Seq_descr_GIBB_mol_other;
6806 }
6807 else if(genomic == 18) /* unassigned RNA */
6808 {
6809 if(is_syn)
6810 biomol = Seq_descr_GIBB_mol_other_genetic;
6811 else
6812 biomol = Seq_descr_GIBB_mol_unknown;
6813 }
6814 else if(genomic == 19 || genomic == 20) /* cRNA or viral cRNA */
6815 biomol = Seq_descr_GIBB_mol_cRNA;
6816 return;
6817 }
6818
6819 /* Here goes most complicated case with just RNA
6820 */
6821 const Char* div = NULL;
6822 if (org_ref != NULL && org_ref->IsSetOrgname() && org_ref->GetOrgname().IsSetDiv())
6823 div = org_ref->GetOrgname().GetDiv().c_str();
6824
6825 if(pp->source != Parser::ESource::EMBL || pp->format != Parser::EFormat::EMBL)
6826 {
6827 biomol = Seq_descr_GIBB_mol_genomic;
6828 if (div == NULL || StringNCmp(div, "VRL", 3) != 0)
6829 {
6830 ErrPostEx(SEV_ERROR, ERR_LOCUS_NonViralRNAMoltype,
6831 "Genomic RNA implied by presence of RNA moltype, but sequence is non-viral.");
6832 }
6833 return;
6834 }
6835
6836 count = 0;
6837 size_t len = 0;
6838 offset = SrchNodeType(entry, ParFlat_DE, &len);
6839 if(offset != NULL)
6840 {
6841 c = offset[len];
6842 offset[len] = '\0';
6843 mRNA = StringStr(offset, "mRNA");
6844 tRNA = StringStr(offset, "tRNA");
6845 rRNA = StringStr(offset, "rRNA");
6846 snRNA = StringStr(offset, "snRNA");
6847 scRNA = StringStr(offset, "scRNA");
6848 uRNA = StringStr(offset, "uRNA");
6849 snoRNA = StringStr(offset, "snoRNA");
6850 if(mRNA != NULL)
6851 count++;
6852 if(tRNA != NULL)
6853 count++;
6854 if(rRNA != NULL)
6855 count++;
6856 if(snRNA != NULL || uRNA != NULL)
6857 count++;
6858 if(scRNA != NULL)
6859 count++;
6860 if(snoRNA != NULL)
6861 count++;
6862 offset[len] = c;
6863 }
6864
6865 /* Non-viral division
6866 */
6867 if (div == NULL || StringNCmp(div, "VRL", 3) != 0)
6868 {
6869 biomol = Seq_descr_GIBB_mol_mRNA;
6870
6871 if(count > 1)
6872 {
6873 ErrPostEx(SEV_WARNING, ERR_DEFINITION_DifferingRnaTokens,
6874 "More than one of mRNA, tRNA, rRNA, snRNA (uRNA), scRNA, snoRNA present in defline.");
6875 }
6876
6877 if(tRNA != NULL)
6878 {
6879 for(p = tRNA + 4; *p == ' ' || *p == '\t';)
6880 p++;
6881 if(*p == '\n')
6882 {
6883 p++;
6884 if(StringNCmp(p, "DE ", 5) == 0)
6885 p += 5;
6886 }
6887 if(StringNICmp(p, "Synthetase", 10) == 0)
6888 return;
6889 }
6890
6891 if(count > 0)
6892 biomol = GetBiomolFromToks(mRNA, tRNA, rRNA, snRNA, scRNA, uRNA,
6893 snoRNA);
6894 return;
6895 }
6896
6897 /* Viral division
6898 */
6899 if (org_ref != NULL && org_ref->IsSetOrgname() && org_ref->GetOrgname().IsSetLineage() &&
6900 StringIStr(org_ref->GetOrgname().GetLineage().c_str(), "no DNA stage") != NULL)
6901 stage = true;
6902 else
6903 stage = false;
6904
6905 dbp = TrackNodeType(entry, ParFlat_FH);
6906 if(dbp == NULL)
6907 return;
6908 dbp = (DataBlkPtr) dbp->data;
6909 for(i = 0; dbp != NULL && i < 2; dbp = dbp->next)
6910 {
6911 if(dbp->offset == NULL)
6912 continue;
6913 offset = dbp->offset + ParFlat_COL_FEATKEY;
6914 if(StringNCmp(offset, "CDS", 3) == 0)
6915 i++;
6916 }
6917 if(i > 1)
6918 {
6919 biomol = Seq_descr_GIBB_mol_genomic;
6920 if(!stage)
6921 {
6922 ErrPostEx(SEV_WARNING, ERR_SOURCE_GenomicViralRnaAssumed,
6923 "This sequence is assumed to be genomic due to multiple coding region but lack of a DNA stage is not indicated in taxonomic lineage.");
6924 }
6925 return;
6926 }
6927
6928 if(count == 0)
6929 {
6930 biomol = Seq_descr_GIBB_mol_genomic;
6931 if(!stage)
6932 {
6933 ErrPostEx(SEV_ERROR, ERR_SOURCE_UnclassifiedViralRna,
6934 "Cannot determine viral molecule type (genomic vs a specific type of RNA) based on definition line, CDS content, or taxonomic lineage. So this sequence has been classified as genomic by default (perhaps in error).");
6935 }
6936 else
6937 {
6938 ErrPostEx(SEV_WARNING, ERR_SOURCE_LineageImpliesGenomicViralRna,
6939 "This sequence lacks indication of specific RNA type in the definition line, but the taxonomic lineage mentions lack of a DNA stage, so it is classified as genomic.");
6940 }
6941 return;
6942 }
6943
6944 if(count > 1)
6945 {
6946 ErrPostEx(SEV_WARNING, ERR_DEFINITION_DifferingRnaTokens,
6947 "More than one of mRNA, tRNA, rRNA, snRNA (uRNA), scRNA, snoRNA present in defline.");
6948 }
6949
6950 biomol = GetBiomolFromToks(mRNA, tRNA, rRNA, snRNA, scRNA, uRNA, snoRNA);
6951 }
6952
6953 END_NCBI_SCOPE
6954