/* fta_src.cpp
2  *
3  * ===========================================================================
4  *
5  *                            PUBLIC DOMAIN NOTICE
6  *               National Center for Biotechnology Information
7  *
8  *  This software/database is a "United States Government Work" under the
9  *  terms of the United States Copyright Act.  It was written as part of
10  *  the author's official duties as a United States Government employee and
11  *  thus cannot be copyrighted.  This software/database is freely available
12  *  to the public for use. The National Library of Medicine and the U.S.
13  *  Government have not placed any restriction on its use or reproduction.
14  *
15  *  Although all reasonable efforts have been taken to ensure the accuracy
16  *  and reliability of the software and data, the NLM and the U.S.
17  *  Government do not and cannot warrant the performance or results that
18  *  may be obtained by using this software or data. The NLM and the U.S.
19  *  Government disclaim all warranties, express or implied, including
20  *  warranties of performance, merchantability or fitness for any particular
21  *  purpose.
22  *
23  *  Please cite the author in any work or product based on this material.
24  *
25  * ===========================================================================
26  *
 * File Name:  fta_src.cpp
28  *
29  * Author: Sergey Bazhin
30  *
31  * File Description:
32  * -----------------
33  *      Messes about source features.
34  */
35 #include <ncbi_pch.hpp>
36 
37 #include "ftacpp.hpp"
38 
39 #include <objects/seqfeat/Gb_qual.hpp>
40 #include <objects/seq/Seq_annot.hpp>
41 #include <objects/seq/Seq_annot_.hpp>
42 #include <objects/seqfeat/Org_ref.hpp>
43 #include <objects/seqfeat/OrgName.hpp>
44 #include <objects/seqfeat/OrgMod.hpp>
45 #include <objects/general/Dbtag.hpp>
46 #include <objects/general/Object_id.hpp>
47 #include <objects/seqfeat/SubSource.hpp>
48 #include <objects/seqfeat/BioSource.hpp>
49 #include <objects/seq/Seq_descr.hpp>
50 
51 #include "index.h"
52 
53 #include <objtools/flatfile/flatdefn.h>
54 #include "ftanet.h"
55 
56 #include "ftaerr.hpp"
57 #include "asci_blk.h"
58 #include "loadfeat.h"
59 #include "utilfeat.h"
60 #include "add.h"
61 #include "utilfun.h"
62 
63 #ifdef THIS_FILE
64 #    undef THIS_FILE
65 #endif
66 #define THIS_FILE "fta_src.cpp"
67 
68 BEGIN_NCBI_SCOPE
69 
70 
/* Pairs a qualifier name with a small numeric code; used as the element
 * type of the SourceOrgMods table in this file.
 */
typedef struct {
    const char *name;                   /* qualifier name; NULL in the table's last entry */
    Uint1      num;                     /* numeric code paired with the name */
} CharUInt1;
75 
/* Bit flags, written as octal literals, that select which qualifiers
 * CollectSubNames() appends to the organism name string and records as
 * OrgMods.  USE_ALL is the bitwise OR of all ten flags.
 */
#define USE_CULTIVAR                         00001
#define USE_ISOLATE                          00002
#define USE_SEROTYPE                         00004
#define USE_SEROVAR                          00010
#define USE_SPECIMEN_VOUCHER                 00020
#define USE_STRAIN                           00040
#define USE_SUB_SPECIES                      00100
#define USE_SUB_STRAIN                       00200
#define USE_VARIETY                          00400
#define USE_ECOTYPE                          01000
#define USE_ALL                              01777

/* NOTE(review): not referenced in the visible part of this file; presumably
 * a limit on the number of BioSources handled - confirm against its users.
 */
#define BIOSOURCES_THRESHOLD                 20
89 
/* Singly linked list node holding the names and sequences of one
 * forward/reverse primer pair.
 * NOTE(review): not used in the visible part of this file; ownership of the
 * strings is presumably with the node - confirm against the code that
 * allocates and frees it.
 */
typedef struct _pcr_primers {
    char*                  fwd_name;
    char*                  fwd_seq;
    char*                  rev_name;
    char*                  rev_seq;
    struct _pcr_primers* next;
} PcrPrimers, *PcrPrimersPtr;
97 
/* One "source" feature of an entry, kept in a singly linked list.
 * The char* members are heap strings (StringSave/MemNew) released with
 * MemFree; the constructor sets every pointer to NULL, every flag to false
 * and the genome code to 0.
 */
typedef struct _source_feat_blk {
    char*                      name;            /* /organism value without a leading organelle word */
    char*                      strain;          /* first /strain value */
    char*                      organelle;       /* first /organelle value */
    char*                      isolate;         /* first /isolate value */
    char*                      namstr;          /* name plus "  (qual value)" suffixes, see CollectSubNames() */
    char*                      location;        /* copy of the feature location string */
    char*                      moltype;         /* /mol_type value */
    char*                      genomename;      /* leading /organism word that matched source_genomes[] (except plasmid) */
    char*                      submitter_seqid; /* /submitter_seqid value; "" when the qualifier is repeated */

    TQualVector quals;                          /* copy of the feature's qualifiers */
    CRef<objects::CBioSource> bio_src;          /* NOTE(review): not touched in the visible code */
    CRef<objects::COrgName> orgname;            /* OrgMods built by CollectSubNames() */

    bool                      full;             /* NOTE(review): not touched in the visible code */
    bool                      focus;            /* /focus qualifier present */
    bool                      tg;               /* /transgenic qualifier present */
    bool                      lookup;           /* NOTE(review): not touched in the visible code */
    bool                      skip;             /* has a qualifier listed in exempt_quals[] */
    bool                      useit;            /* NOTE(review): not touched in the visible code */

    Uint1                        genome;        /* genome code derived from the /organism prefix, 0 if none */
    struct _source_feat_blk* next;              /* next source feature, or NULL */

    _source_feat_blk() :
        name(NULL),
        strain(NULL),
        organelle(NULL),
        isolate(NULL),
        namstr(NULL),
        location(NULL),
        moltype(NULL),
        genomename(NULL),
        submitter_seqid(NULL),
        full(false),
        focus(false),
        tg(false),
        lookup(false),
        skip(false),
        useit(false),
        genome(0),
        next(NULL)
    {}

} SourceFeatBlk, *SourceFeatBlkPtr;
144 
/* Singly linked list node associating an organism name with a [min, max]
 * pair and a skip flag.
 * NOTE(review): not used in the visible part of this file; min/max are
 * presumably sequence coordinates - confirm against the code that fills it.
 */
typedef struct _min_max {
    char*              orgname;       /* Do not free! It's just a pointer */
    Int4                 min;
    Int4                 max;
    bool                 skip;
    struct _min_max* next;
} MinMax, *MinMaxPtr;
152 
/* NULL-terminated list of db_xref database tags; by its name, tags that are
 * considered obsolete on source features (usage is outside the visible code).
 */
static const char *ObsoleteSourceDbxrefTag[] = {
    "IFO",
    NULL
};
157 
/* NULL-terminated list of source db_xref database tags for the DENLR group
 * of data sources (usage is outside the visible code).
 */
static const char *DENLRSourceDbxrefTag[] = {   /* DENLR = DDBJ + EMBL + NCBI +
                                                           LANL + RefSeq */
    "AFTOL",
    "ANTWEB",
    "ATCC",
    "ATCC(DNA)",
    "ATCC(IN HOST)",
    "BEI",
    "BOLD",
    "FBOL",
    "FUNGORUM",
    "GREENGENES",
    "GRIN",
    "HMP",
    "HOMD",
    "IKMC",
    "ISHAM-ITS",
    "JCM",
    "NBRC",
    "RBGE_GARDEN",
    "RBGE_HERBARIUM",
    "RZPD",
    "UNILIB",
    NULL
};
183 
/* NULL-terminated list of source db_xref database tags for the DE group of
 * data sources (usage is outside the visible code).
 */
static const char *DESourceDbxrefTag[] = {      /* DE = DDBJ + EMBL */
    "FANTOM_DB",
    "IMGT/HLA",
    "IMGT/LIGM",
    "MGD",
    "MGI",
    NULL
};
192 
/* NULL-terminated list of source db_xref database tags for EMBL only
 * (usage is outside the visible code).
 */
static const char *ESourceDbxrefTag[] = {       /* E = EMBL */
    "UNITE",
    NULL
};
197 
/* NULL-terminated list of source db_xref database tags for the NLR group of
 * data sources (usage is outside the visible code).
 */
static const char *NLRSourceDbxrefTag[] = {     /* N = NCBI + LANL + RefSeq */
    "FLYBASE",
    NULL
};
202 
/* NULL-terminated list of qualifier names; CheckForExemption() sets the
 * skip flag on any source feature that carries one of them.
 */
static const char *exempt_quals[] = {
    "transposon",
    "insertion_seq",
    NULL
};
208 
/* NULL-terminated list of organism names that get special treatment
 * (usage is outside the visible code).
 */
static const char *special_orgs[] = {
    "synthetic construct",
    "artificial sequence",
    "eukaryotic synthetic construct",
    NULL
};
215 
/* NULL-terminated list of location tokens that CheckSourceFeatLocFuzz()
 * reports as an unusual source feature location.
 */
static const char *unusual_toks[] = {
    "complement",
    NULL
};
220 
/* NULL-terminated list of prefixes matched (case-insensitively) against the
 * first word of an /organism value in SourceFeatStructFillIn().  The order
 * matters: that function maps the matching index to a genome code, and
 * index 8 ("plasmid") is handled specially there.
 */
static const char *source_genomes[] = {
    "mitochondr",
    "chloroplast",
    "kinetoplas",
    "cyanelle",
    "plastid",
    "chromoplast",
    "macronuclear",
    "extrachrom",
    "plasmid",
    NULL
};
233 
/* NULL-terminated list of qualifier names; by its name, qualifiers that are
 * not acceptable on source features (usage is outside the visible code).
 */
static const char *SourceBadQuals[] = {
    "label",
    "usedin",
    "citation",
    NULL
};
240 
/* NULL-terminated list of source qualifier names.  The trailing comments
 * number the entries from 1; empty strings keep that numbering aligned for
 * positions that have no qualifier name.
 * NOTE(review): the numbers presumably are SubSource subtype codes - confirm
 * against the code that indexes this table.
 */
static const char *SourceSubSources[] = {
    "chromosome",                       /*  1 */
    "map",                              /*  2 */
    "clone",                            /*  3 */
    "sub_clone",                        /*  4 */
    "haplotype",                        /*  5 */
    "genotype",                         /*  6 */
    "sex",                              /*  7 */
    "cell_line",                        /*  8 */
    "cell_type",                        /*  9 */
    "tissue_type",                      /* 10 */
    "clone_lib",                        /* 11 */
    "dev_stage",                        /* 12 */
    "frequency",                        /* 13 */
    "germline",                         /* 14 */
    "rearranged",                       /* 15 */
    "lab_host",                         /* 16 */
    "pop_variant",                      /* 17 */
    "tissue_lib",                       /* 18 */
    "plasmid",                          /* 19 */
    "transposon",                       /* 20 */
    "insertion_seq",                    /* 21 */
    "plastid",                          /* 22 */
    "",                                 /* 23 */
    "segment",                          /* 24 */
    "",                                 /* 25 */
    "transgenic",                       /* 26 */
    "environmental_sample",             /* 27 */
    "isolation_source",                 /* 28 */
    "lat_lon",                          /* 29 */
    "collection_date",                  /* 30 */
    "collected_by",                     /* 31 */
    "identified_by",                    /* 32 */
    "",                                 /* 33 */
    "",                                 /* 34 */
    "",                                 /* 35 */
    "",                                 /* 36 */
    "metagenomic",                      /* 37 */
    "mating_type",                      /* 38 */
    NULL
};
282 
/* Table of source qualifier names and their numeric codes, terminated by a
 * {NULL, 0} entry.  The codes agree with the OrgMod subtype values that
 * CollectSubNames() passes to PopulateSubNames() (e.g. strain 2,
 * cultivar 10).  "specific_host" and "host" share code 21.
 */
static CharUInt1 SourceOrgMods[] = {
    {"strain",              2},
    {"sub_strain",          3},
    {"variety",             6},
    {"serotype",            7},
    {"serovar",             9},
    {"cultivar",           10},
    {"isolate",            17},
    {"specific_host",      21},
    {"host",               21},
    {"sub_species",        22},
    {"specimen_voucher",   23},
    {"ecotype",            27},
    {"culture_collection", 35},
    {"bio_material",       36},
    {"metagenome_source",  37},
    {"type_material",      38},
    {NULL,                  0}
};
302 
/* NULL-terminated list of genome names.  The array index lines up with the
 * genome codes assigned in SourceFeatStructFillIn() (2 chloroplast,
 * 3 chromoplast, 4 kinetoplast, 5 mitochondrion, 6 plastid, 7 macronuclear,
 * 8 extrachrom, 9 plasmid, 12 cyanelle); empty strings fill positions that
 * have no name.
 */
static const char *GenomicSourceFeatQual[] = {
    "unknown",
    "unknown",
    "chloroplast",
    "chromoplast",
    "kinetoplast",
    "mitochondrion",
    "plastid",
    "macronuclear",
    "extrachrom",
    "plasmid",
    "transposon",
    "insertion-seq",
    "cyanelle",
    "proviral",
    "virion",
    "nucleomorph",
    "apicoplast",
    "leucoplast",
    "proplastid",                       /* 18 */
    "",                                 /* 19 */
    "",                                 /* 20 */
    "",                                 /* 21 */
    "chromatophore",                    /* 22 */
    NULL
};
329 
/* NULL-terminated list of organelle names; by its name, the values accepted
 * as the first token of an /organelle qualifier (usage is outside the
 * visible code).
 */
static const char *OrganelleFirstToken[] = {
    "chromatophore",
    "hydrogenosome",
    "mitochondrion",
    "nucleomorph",
    "plastid",
    NULL
};
338 
339 /**********************************************************/
SourceFeatBlkNew(void)340 static SourceFeatBlkPtr SourceFeatBlkNew(void)
341 {
342     return new SourceFeatBlk;
343 }
344 
345 /**********************************************************/
SourceFeatBlkFree(SourceFeatBlkPtr sfbp)346 static void SourceFeatBlkFree(SourceFeatBlkPtr sfbp)
347 {
348     if (sfbp->name != NULL)
349         MemFree(sfbp->name);
350     if(sfbp->strain != NULL)
351         MemFree(sfbp->strain);
352     if(sfbp->organelle != NULL)
353         MemFree(sfbp->organelle);
354     if(sfbp->isolate != NULL)
355         MemFree(sfbp->isolate);
356     if(sfbp->namstr != NULL)
357         MemFree(sfbp->namstr);
358     if(sfbp->location != NULL)
359         MemFree(sfbp->location);
360     if(sfbp->moltype != NULL)
361         MemFree(sfbp->moltype);
362     if(sfbp->genomename != NULL)
363         MemFree(sfbp->genomename);
364 
365     delete sfbp;
366 }
367 
368 /**********************************************************/
SourceFeatBlkSetFree(SourceFeatBlkPtr sfbp)369 static void SourceFeatBlkSetFree(SourceFeatBlkPtr sfbp)
370 {
371     SourceFeatBlkPtr tsfbp;
372 
373     for(tsfbp = sfbp; tsfbp != NULL; tsfbp = sfbp)
374     {
375         sfbp = tsfbp->next;
376         SourceFeatBlkFree(tsfbp);
377     }
378 }
379 
380 /**********************************************************/
CollectSourceFeats(DataBlkPtr dbp,Int2 type)381 static SourceFeatBlkPtr CollectSourceFeats(DataBlkPtr dbp, Int2 type)
382 {
383     SourceFeatBlkPtr sfbp;
384     SourceFeatBlkPtr tsfbp;
385     DataBlkPtr       tdbp;
386     FeatBlkPtr       fbp;
387 
388     sfbp = SourceFeatBlkNew();
389     tsfbp = sfbp;
390 
391     for(; dbp != NULL; dbp = dbp->next)
392     {
393         if(dbp->type != type)
394             continue;
395         for(tdbp = (DataBlkPtr) dbp->data; tdbp != NULL; tdbp = tdbp->next)
396         {
397             fbp = (FeatBlkPtr) tdbp->data;
398             if(fbp == NULL || fbp->key == NULL ||
399                StringCmp(fbp->key, "source") != 0)
400                 continue;
401             tsfbp->next = SourceFeatBlkNew();
402             tsfbp = tsfbp->next;
403             if(fbp->location != NULL)
404                 tsfbp->location = StringSave(fbp->location);
405             tsfbp->quals = fbp->quals;
406         }
407     }
408     tsfbp = sfbp->next;
409     delete sfbp;
410     //MemFree(sfbp);
411     return(tsfbp);
412 }
413 
414 /**********************************************************/
/* Deletes every blank and tab from the string, compacting it in place.
 * NULL and empty strings are left untouched.
 */
static void RemoveStringSpaces(char* line)
{
    if(line == NULL || *line == '\0')
        return;

    char* dst = line;

    for(const char* src = line; *src != '\0'; ++src)
    {
        if(*src == ' ' || *src == '\t')
            continue;
        *dst++ = *src;
    }
    *dst = '\0';
}
428 
429 /**********************************************************/
RemoveSourceFeatSpaces(SourceFeatBlkPtr sfbp)430 static void RemoveSourceFeatSpaces(SourceFeatBlkPtr sfbp)
431 {
432     for(; sfbp != NULL; sfbp = sfbp->next)
433     {
434         RemoveStringSpaces(sfbp->location);
435         NON_CONST_ITERATE(TQualVector, cur, sfbp->quals)
436         {
437             if ((*cur)->IsSetQual())
438             {
439                 std::vector<char> buf((*cur)->GetQual().begin(), (*cur)->GetQual().end());
440                 buf.push_back(0);
441                 ShrinkSpaces(&buf[0]);
442                 (*cur)->SetQual(&buf[0]);
443             }
444 
445             if ((*cur)->IsSetVal())
446             {
447                 std::vector<char> buf((*cur)->GetVal().begin(), (*cur)->GetVal().end());
448                 buf.push_back(0);
449                 ShrinkSpaces(&buf[0]);
450                 (*cur)->SetVal(&buf[0]);
451             }
452         }
453     }
454 }
455 
456 /**********************************************************/
CheckForExemption(SourceFeatBlkPtr sfbp)457 static void CheckForExemption(SourceFeatBlkPtr sfbp)
458 {
459     const char   **b;
460 
461     for(; sfbp != NULL; sfbp = sfbp->next)
462     {
463         ITERATE(TQualVector, cur, sfbp->quals)
464         {
465             for (b = exempt_quals; *b != NULL; b++)
466             {
467                 if ((*cur)->GetQual() == *b)
468                     break;
469             }
470             if(*b != NULL)
471             {
472                 sfbp->skip = true;
473                 break;
474             }
475         }
476     }
477 }
478 
479 /**********************************************************/
PopulateSubNames(char * namstr,const Char * name,const Char * value,Uint1 subtype,TOrgModList & mods)480 static void PopulateSubNames(char* namstr, const Char *name,
481                              const Char* value, Uint1 subtype, TOrgModList& mods)
482 {
483     CRef<objects::COrgMod> mod(new objects::COrgMod);
484 
485     StringCat(namstr, name);
486     StringCat(namstr, value);
487     StringCat(namstr, ")");
488 
489     mod->SetSubtype(subtype);
490     mod->SetSubname(value);
491 
492     mods.push_front(mod);
493 }
494 
495 /**********************************************************/
CollectSubNames(SourceFeatBlkPtr sfbp,Int4 use_what,const Char * name,const Char * cultivar,const Char * isolate,const Char * serotype,const Char * serovar,const Char * specimen_voucher,const Char * strain,const Char * sub_species,const Char * sub_strain,const Char * variety,const Char * ecotype)496 static void CollectSubNames(SourceFeatBlkPtr sfbp, Int4 use_what, const Char* name,
497                             const Char* cultivar, const Char* isolate,
498                             const Char* serotype, const Char* serovar,
499                             const Char* specimen_voucher, const Char* strain,
500                             const Char* sub_species, const Char* sub_strain,
501                             const Char* variety, const Char* ecotype)
502 {
503     if(sfbp == NULL)
504        return;
505 
506     if(sfbp->namstr != NULL)
507         MemFree(sfbp->namstr);
508     sfbp->namstr = NULL;
509 
510     if (sfbp->orgname.NotEmpty())
511         sfbp->orgname.Reset();
512 
513     if(name == NULL)
514        return;
515 
516     size_t i = StringLen(name) + 1;
517     size_t j = i;
518     if((use_what & USE_CULTIVAR) == USE_CULTIVAR && cultivar != NULL)
519         i += (StringLen(cultivar) + StringLen("cultivar") + 5);
520     if((use_what & USE_ISOLATE) == USE_ISOLATE && isolate != NULL)
521         i += (StringLen(isolate) + StringLen("isolate") + 5);
522     if((use_what & USE_SEROTYPE) == USE_SEROTYPE && serotype != NULL)
523         i += (StringLen(serotype) + StringLen("serotype") + 5);
524     if((use_what & USE_SEROVAR) == USE_SEROVAR && serovar != NULL)
525         i += (StringLen(serovar) + StringLen("serovar") + 5);
526     if((use_what & USE_SPECIMEN_VOUCHER) == USE_SPECIMEN_VOUCHER &&
527        specimen_voucher != NULL)
528         i += (StringLen(specimen_voucher) + StringLen("specimen_voucher") + 5);
529     if((use_what & USE_STRAIN) == USE_STRAIN && strain != NULL)
530         i += (StringLen(strain) + StringLen("strain") + 5);
531     if((use_what & USE_SUB_SPECIES) == USE_SUB_SPECIES && sub_species != NULL)
532         i += (StringLen(sub_species) + StringLen("sub_species") + 5);
533     if((use_what & USE_SUB_STRAIN) == USE_SUB_STRAIN && sub_strain != NULL)
534         i += (StringLen(sub_strain) + StringLen("sub_strain") + 5);
535     if((use_what & USE_VARIETY) == USE_VARIETY && variety != NULL)
536         i += (StringLen(variety) + StringLen("variety") + 5);
537     if((use_what & USE_ECOTYPE) == USE_ECOTYPE && ecotype != NULL)
538         i += (StringLen(ecotype) + StringLen("ecotype") + 5);
539     sfbp->namstr = (char*) MemNew(i);
540     StringCpy(sfbp->namstr, name);
541     if(i == j)
542         return;
543 
544     sfbp->orgname = new objects::COrgName;
545     TOrgModList& mods = sfbp->orgname->SetMod();
546 
547     if((use_what & USE_CULTIVAR) == USE_CULTIVAR && cultivar != NULL)
548         PopulateSubNames(sfbp->namstr, "  (cultivar ", cultivar, 10, mods);
549     if((use_what & USE_ISOLATE) == USE_ISOLATE && isolate != NULL)
550         PopulateSubNames(sfbp->namstr, "  (isolate ", isolate, 17, mods);
551     if((use_what & USE_SEROTYPE) == USE_SEROTYPE && serotype != NULL)
552         PopulateSubNames(sfbp->namstr, "  (serotype ", serotype, 7, mods);
553     if((use_what & USE_SEROVAR) == USE_SEROVAR && serovar != NULL)
554         PopulateSubNames(sfbp->namstr, "  (serovar ", serovar, 9, mods);
555     if((use_what & USE_SPECIMEN_VOUCHER) == USE_SPECIMEN_VOUCHER &&
556        specimen_voucher != NULL)
557         PopulateSubNames(sfbp->namstr, "  (specimen_voucher ", specimen_voucher, 23, mods);
558     if((use_what & USE_STRAIN) == USE_STRAIN && strain != NULL)
559         PopulateSubNames(sfbp->namstr, "  (strain ", strain, 2, mods);
560     if((use_what & USE_SUB_SPECIES) == USE_SUB_SPECIES && sub_species != NULL)
561         PopulateSubNames(sfbp->namstr, "  (sub_species ", sub_species, 22, mods);
562     if((use_what & USE_SUB_STRAIN) == USE_SUB_STRAIN && sub_strain != NULL)
563         PopulateSubNames(sfbp->namstr, "  (sub_strain ", sub_strain, 3, mods);
564     if((use_what & USE_VARIETY) == USE_VARIETY && variety != NULL)
565         PopulateSubNames(sfbp->namstr, "  (variety ", variety, 6, mods);
566     if((use_what & USE_ECOTYPE) == USE_ECOTYPE && ecotype != NULL)
567         PopulateSubNames(sfbp->namstr, "  (ecotype ", ecotype, 27, mods);
568 }
569 
570 /**********************************************************/
SourceFeatStructFillIn(IndexblkPtr ibp,SourceFeatBlkPtr sfbp,Int4 use_what)571 static bool SourceFeatStructFillIn(IndexblkPtr ibp, SourceFeatBlkPtr sfbp, Int4 use_what)
572 {
573     const Char **b;
574 
575     const Char*    name;
576     const Char*    cultivar;
577     const Char*    isolate;
578     const Char*    organelle;
579     const Char*    serotype;
580     const Char*    serovar;
581     const Char*    ecotype;
582     const Char*    specimen_voucher;
583     const Char*    strain;
584     const Char*    sub_species;
585     const Char*    sub_strain;
586     const Char*    variety;
587     char*    genomename;
588     const Char*    p;
589     char*    q;
590     bool       ret;
591     Int4       i;
592 
593     for(ret = true; sfbp != NULL; sfbp = sfbp->next)
594     {
595         name = NULL;
596         cultivar = NULL;
597         isolate = NULL;
598         organelle = NULL;
599         serotype = NULL;
600         serovar = NULL;
601         ecotype = NULL;
602         specimen_voucher = NULL;
603         strain = NULL;
604         sub_species = NULL;
605         sub_strain = NULL;
606         variety = NULL;
607         genomename = NULL;
608 
609         ITERATE(TQualVector, cur, sfbp->quals)
610         {
611             if (!(*cur)->IsSetQual())
612                 continue;
613 
614             const std::string& qual_str = (*cur)->GetQual();
615             const Char* val_ptr = (*cur)->IsSetVal() ? (*cur)->GetVal().c_str() : NULL;
616 
617             if (qual_str == "db_xref")
618             {
619                 q = StringChr(val_ptr, ':');
620                 if(q == NULL || q[1] == '\0')
621                     continue;
622                 *q = '\0';
623                 if (StringICmp(val_ptr, "taxon") == 0)
624                     if(ibp->taxid < 1)
625                         ibp->taxid = atoi(q + 1);
626                 *q = ':';
627                 continue;
628             }
629             if (qual_str == "focus")
630             {
631                 sfbp->focus = true;
632                 continue;
633             }
634             if (qual_str == "transgenic")
635             {
636                 sfbp->tg = true;
637                 continue;
638             }
639             if (qual_str == "cultivar")
640             {
641                 cultivar = val_ptr;
642                 continue;
643             }
644             if (qual_str == "isolate")
645             {
646                 if(isolate == NULL)
647                     isolate = val_ptr;
648                 continue;
649             }
650             if (qual_str == "mol_type")
651             {
652                 if(sfbp->moltype != NULL)
653                     ret = false;
654                 else if (val_ptr != NULL)
655                     sfbp->moltype = StringSave(val_ptr);
656                 continue;
657             }
658             if (qual_str == "organelle")
659             {
660                 if(organelle == NULL)
661                     organelle = val_ptr;
662                 continue;
663             }
664             if (qual_str == "serotype")
665             {
666                 serotype = val_ptr;
667                 continue;
668             }
669             if (qual_str == "serovar")
670             {
671                 serovar = val_ptr;
672                 continue;
673             }
674             if (qual_str == "ecotype")
675             {
676                 ecotype = val_ptr;
677                 continue;
678             }
679             if (qual_str == "specimen_voucher")
680             {
681                 specimen_voucher = val_ptr;
682                 continue;
683             }
684             if (qual_str == "strain")
685             {
686                 if(strain == NULL)
687                     strain = val_ptr;
688                 continue;
689             }
690             if (qual_str == "sub_species")
691             {
692                 sub_species = val_ptr;
693                 continue;
694             }
695             if (qual_str == "sub_strain")
696             {
697                 sub_strain = val_ptr;
698                 continue;
699             }
700             if (qual_str == "variety")
701             {
702                 variety = val_ptr;
703                 continue;
704             }
705             if(qual_str == "submitter_seqid")
706             {
707                 if(sfbp->submitter_seqid != NULL)
708                 {
709                     MemFree(sfbp->submitter_seqid);
710                     sfbp->submitter_seqid = StringSave("");
711                 }
712                 else
713                     sfbp->submitter_seqid = StringSave(val_ptr);
714                 if(ibp->submitter_seqid == NULL)
715                     ibp->submitter_seqid = StringSave(val_ptr);
716                 continue;
717             }
718 
719             if (qual_str != "organism" ||
720                 val_ptr == NULL || val_ptr[0] == '\0')
721                 continue;
722 
723             if(ibp->organism == NULL)
724                 ibp->organism = StringSave(val_ptr);
725 
726             p = StringChr(val_ptr, ' ');
727 
728             std::string str_to_find;
729             if (p != NULL)
730                 str_to_find.assign(val_ptr, p);
731             else
732                 str_to_find.assign(val_ptr);
733 
734             for(i = 0, b = source_genomes; *b != NULL; b++, i++)
735                 if (StringNICmp(str_to_find.c_str(), *b, StringLen(*b)) == 0)
736                     break;
737             if(*b != NULL && i != 8)
738             {
739                 if(genomename != NULL)
740                     MemFree(genomename);
741                 genomename = StringSave(str_to_find.c_str());
742             }
743 
744             if(p != NULL)
745                 ++p;
746 
747             if(*b == NULL)
748                 p = val_ptr;
749             else
750             {
751                 if(i == 0)
752                     sfbp->genome = 5;   /* Mitochondrion */
753                 else if(i == 1)
754                     sfbp->genome = 2;   /* Chloroplast */
755                 else if(i == 2)
756                     sfbp->genome = 4;   /* Kinetoplast */
757                 else if(i == 3)
758                     sfbp->genome = 12;  /* Cyanelle */
759                 else if(i == 4)
760                     sfbp->genome = 6;   /* Plastid */
761                 else if(i == 5)
762                     sfbp->genome = 3;   /* Chromoplast */
763                 else if(i == 6)
764                     sfbp->genome = 7;   /* Macronuclear */
765                 else if(i == 7)
766                     sfbp->genome = 8;   /* Extrachrom */
767                 else if(i == 8)
768                 {
769                     p = val_ptr;
770                     sfbp->genome = 9;   /* Plasmid */
771                 }
772             }
773             name = p;
774         }
775 
776         if(sfbp->name != NULL)
777             MemFree(sfbp->name);
778         sfbp->name = (name == NULL) ? NULL : StringSave(name);
779 
780         if(sfbp->genomename != NULL)
781             MemFree(sfbp->genomename);
782         sfbp->genomename = genomename;
783 
784         if(strain != NULL && sfbp->strain == NULL)
785             sfbp->strain = StringSave(strain);
786         if(isolate != NULL && sfbp->isolate == NULL)
787             sfbp->isolate = StringSave(isolate);
788         if(organelle != NULL && sfbp->organelle == NULL)
789             sfbp->organelle = StringSave(organelle);
790 
791         CollectSubNames(sfbp, use_what, name, cultivar, isolate, serotype,
792                         serovar, specimen_voucher, strain, sub_species,
793                         sub_strain, variety, ecotype);
794     }
795     return(ret);
796 }
797 
798 /**********************************************************/
CheckSourceFeatFocusAndTransposon(SourceFeatBlkPtr sfbp)799 static char* CheckSourceFeatFocusAndTransposon(SourceFeatBlkPtr sfbp)
800 {
801     for (; sfbp != NULL; sfbp = sfbp->next)
802     {
803         if (sfbp->focus && sfbp->skip)
804             break;
805     }
806 
807     if(sfbp != NULL)
808         return(sfbp->location);
809     return(NULL);
810 }
811 
812 /**********************************************************/
CheckSourceFeatOrgs(SourceFeatBlkPtr sfbp,int * status)813 static char* CheckSourceFeatOrgs(SourceFeatBlkPtr sfbp, int* status)
814 {
815     *status = 0;
816     for(; sfbp != NULL; sfbp = sfbp->next)
817     {
818 /**        if(sfbp->namstr != NULL)*/
819         if(sfbp->name != NULL)
820             continue;
821 
822         *status = (sfbp->genome == 0) ? 1 : 2;
823         break;
824     }
825     if(sfbp != NULL)
826         return(sfbp->location);
827     return(NULL);
828 }
829 
830 /**********************************************************/
CheckSourceFeatLocFuzz(SourceFeatBlkPtr sfbp)831 static bool CheckSourceFeatLocFuzz(SourceFeatBlkPtr sfbp)
832 {
833     const char **b;
834     char*    p;
835     char*    q;
836     Int4       count;
837     bool    partial;
838     bool    invalid;
839     bool    ret;
840 
841     ret = true;
842     for(; sfbp != NULL; sfbp = sfbp->next)
843     {
844         if(sfbp->location == NULL || sfbp->location[0] == '\0')
845             break;
846         if(sfbp->skip)
847             continue;
848 
849         ITERATE(TQualVector, cur, sfbp->quals)
850         {
851             if ((*cur)->GetQual() != "partial")
852                 continue;
853 
854             ErrPostEx(SEV_ERROR, ERR_SOURCE_PartialQualifier,
855                       "Source feature location has /partial qualifier. Qualifier has been ignored: \"%s\".",
856                       (sfbp->location == NULL) ? "?empty?" : sfbp->location);
857             break;
858         }
859 
860         for(b = unusual_toks; *b != NULL; b++)
861         {
862             p = StringStr(sfbp->location, *b);
863             if(p == NULL)
864                 continue;
865             q = p + StringLen(*b);
866             if(p > sfbp->location)
867                 p--;
868             if((p == sfbp->location || *p == '(' || *p == ')' ||
869                 *p == ':' || *p == ',' || *p == '.') &&
870                (*q == '\0' || *q == '(' || *q == ')' || *q == ',' ||
871                 *q == ':' || *q == '.'))
872             {
873                 ErrPostEx(SEV_ERROR, ERR_SOURCE_UnusualLocation,
874                           "Source feature has an unusual location: \"%s\".",
875                           (sfbp->location == NULL) ? "?empty?" : sfbp->location);
876                 break;
877             }
878         }
879 
880         partial = false;
881         invalid = false;
882         for(count = 0, p = sfbp->location; *p != '\0'; p++)
883         {
884             if(*p == '^')
885                 invalid = true;
886             else if(*p == '>' || *p == '<')
887                 partial = true;
888             else if(*p == '(')
889                 count++;
890             else if(*p == ')')
891                 count--;
892             else if(*p == '.' && p[1] == '.')
893                 p++;
894             else if(*p == '.' && p[1] != '.')
895             {
896                 for(q = p + 1; *q >= '0' && *q <= '9';)
897                     q++;
898                 if(q == p || *q != ':')
899                     invalid = true;
900             }
901         }
902         if(partial)
903         {
904             ErrPostEx(SEV_ERROR, ERR_SOURCE_PartialLocation,
905                       "Source feature location is partial; partiality flags have been ignored: \"%s\".",
906                       (sfbp->location == NULL) ? "?empty?" : sfbp->location);
907         }
908         if(invalid || count != 0)
909         {
910             ErrPostEx(SEV_REJECT, ERR_SOURCE_InvalidLocation,
911                       "Invalid location for source feature at \"%s\". Entry dropped.",
912                       (sfbp->location == NULL) ? "?empty?" : sfbp->location);
913             ret = false;
914         }
915     }
916     return(ret);
917 }
918 
919 /**********************************************************/
CheckSourceFeatLocAccs(SourceFeatBlkPtr sfbp,char * acc)920 static char* CheckSourceFeatLocAccs(SourceFeatBlkPtr sfbp, char* acc)
921 {
922     char* p;
923     char* q;
924     char* r;
925     Int4    i;
926 
927     for(; sfbp != NULL; sfbp = sfbp->next)
928     {
929         if(sfbp->location == NULL || sfbp->location[0] == '\0')
930             continue;
931         for(p = sfbp->location + 1; *p != '\0'; p++)
932         {
933             if(*p != ':')
934                 continue;
935             for(r = NULL, q = p - 1;; q--)
936             {
937                 if(q == sfbp->location)
938                 {
939                     if(*q != '_' && (*q < '0' || *q > '9') &&
940                        (*q < 'a' || *q > 'z') && (*q < 'A' || *q > 'Z'))
941                         q++;
942                     break;
943                 }
944                 if(*q == '.')
945                 {
946                     if(r == NULL)
947                     {
948                         r = q;
949                         continue;
950                     }
951                     q++;
952                     break;
953                 }
954                 if(*q != '_' && (*q < '0' || *q > '9') &&
955                    (*q < 'a' || *q > 'z') && (*q < 'A' || *q > 'Z'))
956                 {
957                     q++;
958                     break;
959                 }
960             }
961             if(q == p)
962                 continue;
963             if(r != NULL)
964                 *r = '\0';
965             else
966                 *p = '\0';
967             i = StringICmp(q, acc);
968             if(r != NULL)
969                 *r = '.';
970             else
971                 *p = ':';
972             if(i != 0)
973                 break;
974         }
975         if(*p != '\0')
976             break;
977     }
978     if(sfbp == NULL)
979         return(NULL);
980     return(sfbp->location);
981 }
982 
983 /**********************************************************/
MinMaxFree(MinMaxPtr mmp)984 static void MinMaxFree(MinMaxPtr mmp)
985 {
986     MinMaxPtr tmmp;
987 
988     for(; mmp != NULL; mmp = tmmp)
989     {
990         tmmp = mmp->next;
991         MemFree(mmp);
992     }
993 }
994 
995 /**********************************************************/
fta_if_special_org(const Char * name)996 bool fta_if_special_org(const Char* name)
997 {
998     const char **b;
999 
1000     if(name == NULL || *name == '\0')
1001         return false;
1002 
1003     for(b = special_orgs; *b != NULL; b++)
1004         if(StringICmp(*b, name) == 0)
1005             break;
1006     if(*b != NULL || StringIStr(name, "vector") != NULL)
1007         return true;
1008     return false;
1009 }
1010 
1011 /**********************************************************/
CheckSourceFeatCoverage(SourceFeatBlkPtr sfbp,MinMaxPtr mmp,size_t len)1012 static Int4 CheckSourceFeatCoverage(SourceFeatBlkPtr sfbp, MinMaxPtr mmp,
1013                                     size_t len)
1014 {
1015     SourceFeatBlkPtr tsfbp;
1016     MinMaxPtr        tmmp;
1017     MinMaxPtr        mmpnext;
1018     char*          p;
1019     char*          q;
1020     char*          r;
1021     char*          loc;
1022     Int4             count;
1023     Int4             min;
1024     Int4             max;
1025     Int4             i;
1026     Int4             tgs;
1027     Int4             sporg;
1028 
1029     loc = NULL;
1030     tmmp = mmp;
1031     for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1032     {
1033         if(tsfbp->location == NULL || tsfbp->location[0] == '\0' ||
1034            tsfbp->name == NULL || tsfbp->name[0] == '\0')
1035             continue;
1036         if(loc != NULL)
1037             MemFree(loc);
1038         loc = StringSave(tsfbp->location);
1039         for(p = loc; *p != '\0'; p++)
1040             if(*p == ',' || *p == '(' || *p == ')' || *p == ':' ||
1041                *p == ';' || *p == '^')
1042                 *p = ' ';
1043         for(p = loc, q = loc; *p != '\0';)
1044         {
1045             if(*p == '>' || *p == '<')
1046             {
1047                 p++;
1048                 continue;
1049             }
1050             *q++ = *p;
1051             if(*p == ' ')
1052                 while(*p == ' ')
1053                     p++;
1054             else
1055                 p++;
1056         }
1057         if(q > loc && *(q - 1) == ' ')
1058             q--;
1059         *q = '\0';
1060 
1061         q = (*loc == ' ') ? (loc + 1) : loc;
1062         for(p = q;;)
1063         {
1064             min = 0;
1065             max = 0;
1066             p = StringChr(p, ' ');
1067             if(p != NULL)
1068                 *p++ = '\0';
1069             for(r = q; *r >= '0' && *r <= '9';)
1070                 r++;
1071             if(*r == '\0')
1072             {
1073                 i = atoi(q);
1074                 if(i > 0)
1075                 {
1076                     min = i;
1077                     max = i;
1078                 }
1079             }
1080             else if(*r == '.' && r[1] == '.')
1081             {
1082                 *r++ = '\0';
1083                 min = atoi(q);
1084                 if(min > 0)
1085                 {
1086                     for(q = ++r; *r >= '0' && *r <= '9';)
1087                         r++;
1088                     if(*r == '\0')
1089                         max = atoi(q);
1090                 }
1091             }
1092             if(min > 0 && max > 0)
1093             {
1094                 if(min == 1 && (size_t) max == len)
1095                     tsfbp->full = true;
1096                 for(tmmp = mmp;; tmmp = tmmp->next)
1097                 {
1098                     if(min < tmmp->min)
1099                     {
1100                         mmpnext = tmmp->next;
1101                         tmmp->next = (MinMaxPtr) MemNew(sizeof(MinMax));
1102                         tmmp->next->orgname = tmmp->orgname;
1103                         tmmp->next->min = tmmp->min;
1104                         tmmp->next->max = tmmp->max;
1105                         tmmp->next->skip = tmmp->skip;
1106                         tmmp->next->next = mmpnext;
1107                         tmmp->orgname = tsfbp->name;
1108                         tmmp->min = min;
1109                         tmmp->max = max;
1110                         tmmp->skip = tsfbp->skip;
1111                         break;
1112                     }
1113                     if(tmmp->next == NULL)
1114                     {
1115                         tmmp->next = (MinMaxPtr) MemNew(sizeof(MinMax));
1116                         tmmp->next->orgname = tsfbp->name;
1117                         tmmp->next->min = min;
1118                         tmmp->next->max = max;
1119                         tmmp->next->skip = tsfbp->skip;
1120                         break;
1121                     }
1122                 }
1123             }
1124 
1125             if(p == NULL)
1126                 break;
1127             q = p;
1128         }
1129     }
1130     if(loc != NULL)
1131         MemFree(loc);
1132 
1133     mmp = mmp->next;
1134     if(mmp == NULL || mmp->min != 1)
1135         return(1);
1136 
1137     for(max = mmp->max; mmp != NULL; mmp = mmp->next)
1138         if(mmp->max > max && mmp->min <= max + 1)
1139             max = mmp->max;
1140 
1141     if((size_t) max < len)
1142         return(1);
1143 
1144     tgs = 0;
1145     count = 0;
1146     sporg = 0;
1147     for(tsfbp = sfbp, i = 0; tsfbp != NULL; tsfbp = tsfbp->next, i++)
1148     {
1149         if(!tsfbp->full)
1150             continue;
1151 
1152         if(fta_if_special_org(tsfbp->name))
1153             sporg++;
1154 
1155         count++;
1156         if(tsfbp->tg)
1157             tgs++;
1158     }
1159 
1160     if(count < 2)
1161         return(0);
1162     if(count > 2 || i > count || (tgs != 1 && sporg != 1))
1163         return(2);
1164     return(0);
1165 }
1166 
1167 /**********************************************************/
CheckWholeSourcesVersusFocused(SourceFeatBlkPtr sfbp)1168 static char* CheckWholeSourcesVersusFocused(SourceFeatBlkPtr sfbp)
1169 {
1170     char* p = NULL;
1171     bool whole = false;
1172 
1173     for(; sfbp != NULL; sfbp = sfbp->next)
1174     {
1175         if(sfbp->full)
1176             whole = true;
1177         else if(sfbp->focus)
1178             p = sfbp->location;
1179     }
1180 
1181     if(whole)
1182         return(p);
1183     return(NULL);
1184 }
1185 
1186 /**********************************************************/
CheckSYNTGNDivision(SourceFeatBlkPtr sfbp,char * div)1187 static bool CheckSYNTGNDivision(SourceFeatBlkPtr sfbp, char* div)
1188 {
1189     char* p;
1190     bool got;
1191     bool ret;
1192     Int4    syntgndiv;
1193     Char    ch;
1194 
1195     syntgndiv = 0;
1196     if(div != NULL && *div != '\0')
1197     {
1198         if(StringCmp(div, "SYN") == 0)
1199             syntgndiv = 1;
1200         else if(StringCmp(div, "TGN") == 0)
1201             syntgndiv = 2;
1202     }
1203 
1204     for(ret = true, got = false; sfbp != NULL; sfbp = sfbp->next)
1205     {
1206         if(!sfbp->tg)
1207             continue;
1208 
1209         if(syntgndiv == 0)
1210         {
1211             p = sfbp->location;
1212             if(p != NULL && StringLen(p) > 50)
1213             {
1214                 ch = p[50];
1215                 p[50] = '\0';
1216             }
1217             else
1218                 ch = '\0';
1219             ErrPostEx(SEV_REJECT, ERR_DIVISION_TransgenicNotSYN_TGN,
1220                       "Source feature located at \"%s\" has a /transgenic qualifier, but this record is not in the SYN or TGN division.",
1221                       (p == NULL) ? "unknown" : p);
1222             if(ch != '\0')
1223                 p[50] = ch;
1224             ret = false;
1225         }
1226 
1227         if(sfbp->full)
1228             got = true;
1229     }
1230 
1231     if(syntgndiv == 2 && !got)
1232         ErrPostEx(SEV_ERROR, ERR_DIVISION_TGNnotTransgenic,
1233                   "This record uses the TGN division code, but there is no full-length /transgenic source feature.");
1234     return(ret);
1235 }
1236 
1237 /**********************************************************/
CheckTransgenicSourceFeats(SourceFeatBlkPtr sfbp)1238 static Int4 CheckTransgenicSourceFeats(SourceFeatBlkPtr sfbp)
1239 {
1240     SourceFeatBlkPtr tsfbp;
1241     char*          taxname;
1242     bool             same;
1243     bool             tgfull;
1244 
1245     if(sfbp == NULL)
1246         return(0);
1247 
1248     Int4 ret = 0;
1249     bool tgs = false;
1250     bool focus = false;
1251     for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1252     {
1253         if(tsfbp->tg)
1254         {
1255             if(!tsfbp->full)
1256                 ret = 1;                /* /transgenic on not full-length */
1257             else if(tgs)
1258                 ret = 3;                /* multiple /transgenics */
1259             if(ret != 0)
1260                 break;
1261             tgs = true;
1262         }
1263         if(tsfbp->focus)
1264             focus = true;
1265         if(tgs && focus)
1266         {
1267             ret = 2;                    /* /focus and /transgenic */
1268             break;
1269         }
1270     }
1271 
1272     if(ret != 0)
1273         return(ret);
1274 
1275     same = true;
1276     tgfull = false;
1277     taxname = NULL;
1278     for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1279     {
1280         if(tsfbp->skip)
1281             continue;
1282         if(taxname == NULL)
1283             taxname = tsfbp->name;
1284         else if(same && !fta_strings_same(taxname, tsfbp->name))
1285             same = false;
1286         if(tsfbp->tg && tsfbp->full)
1287             tgfull = true;
1288         if(tsfbp->focus)
1289             focus = true;
1290     }
1291 
1292     if(same == false && tgfull == false && focus == false)
1293         return(4);
1294 
1295     if(sfbp->next == NULL || !tgs)
1296         return(0);
1297 
1298     for(tsfbp = sfbp->next; tsfbp != NULL; tsfbp = tsfbp->next)
1299         if(fta_strings_same(sfbp->name, tsfbp->name) == false ||
1300            fta_strings_same(sfbp->strain, tsfbp->strain) == false ||
1301            fta_strings_same(sfbp->isolate, tsfbp->isolate) == false ||
1302            fta_strings_same(sfbp->organelle, tsfbp->organelle) == false)
1303             break;
1304 
1305     if(tsfbp == NULL)
1306         return(5);                      /* all source features have the same
1307                                            /organism, /strain, /isolate and
1308                                            /organelle qualifiers */
1309     return(0);
1310 }
1311 
1312 /**********************************************************/
CheckFocusInOrgs(SourceFeatBlkPtr sfbp,size_t len,int * status)1313 static Int4 CheckFocusInOrgs(SourceFeatBlkPtr sfbp, size_t len, int* status)
1314 {
1315     SourceFeatBlkPtr tsfbp;
1316     const char       **b;
1317     char*          name;
1318     Char             pat[100];
1319     Int4             count;
1320     bool             same;
1321 
1322     count = 0;
1323     name = NULL;
1324     same = true;
1325     for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1326     {
1327         if(tsfbp->name == NULL)
1328             continue;
1329         if(tsfbp->focus)
1330             count++;
1331         if(name == NULL)
1332             name = tsfbp->name;
1333         else if(StringICmp(name, tsfbp->name) != 0)
1334             same = false;
1335     }
1336     if(same && count > 0)
1337         (*status)++;
1338 
1339     name = NULL;
1340     for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1341     {
1342         if(!tsfbp->focus || tsfbp->name == NULL)
1343             continue;
1344         if(name == NULL)
1345             name = tsfbp->name;
1346         else if(StringICmp(name, tsfbp->name) != 0)
1347             break;
1348     }
1349     if(tsfbp != NULL)
1350         return(2);
1351 
1352     if(same || count != 0)
1353         return(0);
1354 
1355     name = NULL;
1356     sprintf(pat, "1..%ld", len);
1357     for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1358     {
1359         if(tsfbp->name == NULL || tsfbp->location == NULL ||
1360            tsfbp->skip)
1361             continue;
1362 
1363         for (b = special_orgs; *b != NULL; b++)
1364         {
1365             if (StringICmp(*b, tsfbp->name) == 0 &&
1366                 StringCmp(tsfbp->location, pat) == 0)
1367                 break;
1368         }
1369         if(*b != NULL)
1370             continue;
1371 
1372         if(name == NULL)
1373 /**            name = tsfbp->namstr;*/
1374             name = tsfbp->name;
1375 /**        else if(StringICmp(name, tsfbp->namstr) != 0)*/
1376         else if(StringICmp(name, tsfbp->name) != 0)
1377             break;
1378     }
1379 
1380     if(tsfbp == NULL)
1381         return(0);
1382 
1383     for (tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1384     {
1385         if (tsfbp->full && tsfbp->tg && !tsfbp->skip)
1386             break;
1387     }
1388 
1389     if(tsfbp != NULL)
1390         return(0);
1391     return(3);
1392 }
1393 
1394 /**********************************************************/
IfSpecialFeat(MinMaxPtr mmp,size_t len)1395 static bool IfSpecialFeat(MinMaxPtr mmp, size_t len)
1396 {
1397     if((mmp->min == 1 && (size_t) mmp->max == len) || mmp->skip)
1398         return true;
1399     return false;
1400 }
1401 
1402 /**********************************************************/
CheckSourceOverlap(MinMaxPtr mmp,size_t len)1403 static char* CheckSourceOverlap(MinMaxPtr mmp, size_t len)
1404 {
1405     MinMaxPtr tmmp;
1406     char*   res;
1407 
1408     for(; mmp != NULL; mmp = mmp->next)
1409     {
1410         if(IfSpecialFeat(mmp, len))
1411             continue;
1412         for(tmmp = mmp->next; tmmp != NULL; tmmp = tmmp->next)
1413         {
1414             if(IfSpecialFeat(tmmp, len))
1415                 continue;
1416             if(StringICmp(mmp->orgname, tmmp->orgname) == 0)
1417                 continue;
1418             if(tmmp->min <= mmp->max && tmmp->max >= mmp->min)
1419                 break;
1420         }
1421         if(tmmp != NULL)
1422             break;
1423     }
1424     if(mmp == NULL)
1425         return(NULL);
1426 
1427     res = (char*) MemNew(1024);
1428     sprintf(res, "\"%s\" at %d..%d vs \"%s\" at %d..%d", mmp->orgname,
1429             mmp->min, mmp->max, tmmp->orgname, tmmp->min, tmmp->max);
1430     return(res);
1431 }
1432 
1433 /**********************************************************/
CheckForUnusualFullLengthOrgs(SourceFeatBlkPtr sfbp)1434 static char* CheckForUnusualFullLengthOrgs(SourceFeatBlkPtr sfbp)
1435 {
1436     SourceFeatBlkPtr tsfbp;
1437     const char       **b;
1438 
1439     if(sfbp == NULL || sfbp->next == NULL)
1440         return(NULL);
1441 
1442     for(tsfbp = sfbp->next; tsfbp != NULL; tsfbp = tsfbp->next)
1443         if(StringICmp(sfbp->name, tsfbp->name) != 0)
1444             break;
1445 
1446     if(tsfbp == NULL)
1447         return(NULL);
1448 
1449     for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1450         if(tsfbp->full && tsfbp->tg)
1451             break;
1452 
1453     if(tsfbp != NULL)
1454         return(NULL);
1455 
1456     for(; sfbp != NULL; sfbp = sfbp->next)
1457     {
1458         if(!sfbp->full || sfbp->tg)
1459             continue;
1460 
1461         for(b = special_orgs; *b != NULL; b++)
1462             if(StringICmp(*b, sfbp->name) == 0)
1463                 break;
1464 
1465         if(*b != NULL)
1466             continue;
1467 
1468         if(StringIStr(sfbp->name, "vector") == NULL)
1469             break;
1470     }
1471     if(sfbp == NULL)
1472         return(NULL);
1473     return(sfbp->name);
1474 }
1475 
1476 /**********************************************************/
CreateRawBioSources(ParserPtr pp,SourceFeatBlkPtr sfbp,Int4 use_what)1477 static void CreateRawBioSources(ParserPtr pp, SourceFeatBlkPtr sfbp,
1478                                 Int4 use_what)
1479 {
1480     SourceFeatBlkPtr tsfbp;
1481     char*          namstr;
1482     const Char*      cultivar;
1483     const Char*      isolate;
1484     const Char*      serotype;
1485     const Char*      serovar;
1486     const Char*      ecotype;
1487     const Char*      specimen_voucher;
1488     const Char*      strain;
1489     const Char*      sub_species;
1490     const Char*      sub_strain;
1491     const Char*      variety;
1492 
1493     for(; sfbp != NULL; sfbp = sfbp->next)
1494     {
1495         if (sfbp->bio_src.NotEmpty())
1496             continue;
1497 
1498         namstr = StringSave(sfbp->namstr);
1499         CRef<objects::COrg_ref> org_ref(new objects::COrg_ref);
1500         org_ref->SetTaxname(sfbp->name);
1501 
1502         if (sfbp->orgname.NotEmpty())
1503         {
1504             org_ref->SetOrgname(*sfbp->orgname);
1505         }
1506 
1507         CRef<objects::COrg_ref> t_org_ref(new objects::COrg_ref);
1508         t_org_ref->Assign(*org_ref);
1509         fta_fix_orgref(pp, *org_ref, &pp->entrylist[pp->curindx]->drop, sfbp->genomename);
1510 
1511         if (t_org_ref->Equals(*org_ref))
1512             sfbp->lookup = false;
1513         else
1514         {
1515             sfbp->lookup = true;
1516             MemFree(sfbp->name);
1517             sfbp->name = StringSave(org_ref->GetTaxname().c_str());
1518 
1519             sfbp->orgname.Reset();
1520 
1521             cultivar = NULL;
1522             isolate = NULL;
1523             serotype = NULL;
1524             serovar = NULL;
1525             ecotype = NULL;
1526             specimen_voucher = NULL;
1527             strain = NULL;
1528             sub_species = NULL;
1529             sub_strain = NULL;
1530             variety = NULL;
1531             if (org_ref->IsSetOrgname() && org_ref->IsSetOrgMod())
1532             {
1533                 ITERATE(objects::COrgName::TMod, mod, org_ref->GetOrgname().GetMod())
1534                 {
1535                     switch ((*mod)->GetSubtype())
1536                     {
1537                     case 10:
1538                         cultivar = (*mod)->GetSubname().c_str();
1539                         break;
1540                     case 17:
1541                         isolate = (*mod)->GetSubname().c_str();
1542                         break;
1543                     case 7:
1544                         serotype = (*mod)->GetSubname().c_str();
1545                         break;
1546                     case 9:
1547                         serovar = (*mod)->GetSubname().c_str();
1548                         break;
1549                     case 27:
1550                         ecotype = (*mod)->GetSubname().c_str();
1551                         break;
1552                     case 23:
1553                         specimen_voucher = (*mod)->GetSubname().c_str();
1554                         break;
1555                     case 2:
1556                         strain = (*mod)->GetSubname().c_str();
1557                         break;
1558                     case 22:
1559                         sub_species = (*mod)->GetSubname().c_str();
1560                         break;
1561                     case 3:
1562                         sub_strain = (*mod)->GetSubname().c_str();
1563                         break;
1564                     case 6:
1565                         variety = (*mod)->GetSubname().c_str();
1566                         break;
1567                     }
1568                 }
1569             }
1570             CollectSubNames(sfbp, use_what, sfbp->name, cultivar, isolate,
1571                             serotype, serovar, specimen_voucher, strain,
1572                             sub_species, sub_strain, variety, ecotype);
1573         }
1574 
1575         sfbp->bio_src.Reset(new objects::CBioSource);
1576         sfbp->bio_src->SetOrg(*org_ref);
1577 
1578         for(tsfbp = sfbp->next; tsfbp != NULL; tsfbp = tsfbp->next)
1579         {
1580             if(tsfbp->bio_src.NotEmpty() || StringICmp(namstr, tsfbp->namstr) != 0)
1581                 continue;
1582 
1583             tsfbp->lookup = sfbp->lookup;
1584 
1585             tsfbp->bio_src.Reset(new objects::CBioSource);
1586             tsfbp->bio_src->Assign(*sfbp->bio_src);
1587 
1588             if(!sfbp->lookup)
1589                 continue;
1590 
1591             MemFree(tsfbp->name);
1592             tsfbp->name = StringSave(sfbp->name);
1593 
1594             MemFree(tsfbp->namstr);
1595             tsfbp->namstr = StringSave(sfbp->namstr);
1596         }
1597         MemFree(namstr);
1598     }
1599 }
1600 
1601 /**********************************************************/
SourceFeatMoveOneUp(SourceFeatBlkPtr where,SourceFeatBlkPtr what)1602 static SourceFeatBlkPtr SourceFeatMoveOneUp(SourceFeatBlkPtr where,
1603                                             SourceFeatBlkPtr what)
1604 {
1605     SourceFeatBlkPtr prev;
1606     SourceFeatBlkPtr tsfbp;
1607 
1608     if(what == where)
1609         return(where);
1610 
1611     prev = where;
1612     for(tsfbp = where->next; tsfbp != NULL; tsfbp = tsfbp->next)
1613     {
1614         if(tsfbp == what)
1615             break;
1616         prev = tsfbp;
1617     }
1618     if(tsfbp == NULL)
1619         return(where);
1620 
1621     prev->next = what->next;
1622     what->next = where;
1623     return(what);
1624 }
1625 
1626 /**********************************************************/
SourceFeatRemoveDups(SourceFeatBlkPtr sfbp)1627 static SourceFeatBlkPtr SourceFeatRemoveDups(SourceFeatBlkPtr sfbp)
1628 {
1629     SourceFeatBlkPtr tsfbp;
1630     SourceFeatBlkPtr prev;
1631     SourceFeatBlkPtr next;
1632 
1633     for(prev = sfbp, tsfbp = sfbp->next; tsfbp != NULL; tsfbp = next)
1634     {
1635         next = tsfbp->next;
1636         if(!tsfbp->useit)
1637         {
1638             prev = tsfbp;
1639             continue;
1640         }
1641 
1642         bool different = false;
1643         ITERATE(TQualVector, cur, tsfbp->quals)
1644         {
1645             const std::string& cur_qual = (*cur)->GetQual();
1646             if (cur_qual == "focus")
1647                 continue;
1648 
1649             bool found = false;
1650             ITERATE(TQualVector, next, sfbp->quals)
1651             {
1652                 const std::string& next_qual = (*next)->GetQual();
1653 
1654                 if (next_qual == "focus" || next_qual != cur_qual)
1655                     continue;
1656 
1657                 if (!(*cur)->IsSetVal() && !(*next)->IsSetVal())
1658                 {
1659                     found = true;
1660                     break;
1661                 }
1662 
1663                 if ((*cur)->IsSetVal() && (*next)->IsSetVal() &&
1664                     (*cur)->GetVal() == (*next)->GetVal())
1665                 {
1666                     found = true;
1667                     break;
1668                 }
1669             }
1670 
1671             if (!found)              /* Different, leave as is */
1672             {
1673                 different = true;
1674                 break;
1675             }
1676         }
1677 
1678         if (different)                /* Different, leave as is */
1679         {
1680             prev = tsfbp;
1681             continue;
1682         }
1683         prev->next = tsfbp->next;
1684         tsfbp->next = NULL;
1685         SourceFeatBlkFree(tsfbp);
1686     }
1687     return(sfbp);
1688 }
1689 
1690 /**********************************************************/
SourceFeatDerive(SourceFeatBlkPtr sfbp,SourceFeatBlkPtr res)1691 static SourceFeatBlkPtr SourceFeatDerive(SourceFeatBlkPtr sfbp,
1692                                          SourceFeatBlkPtr res)
1693 {
1694     SourceFeatBlkPtr tsfbp;
1695 
1696     if(res == NULL)
1697         return(sfbp);
1698 
1699     tsfbp = SourceFeatBlkNew();
1700     tsfbp->name = (res->name == NULL) ? NULL : StringSave(res->name);
1701     tsfbp->namstr = (res->namstr == NULL) ? NULL : StringSave(res->namstr);
1702     tsfbp->location = (res->location == NULL) ? NULL : StringSave(res->location);
1703     tsfbp->full = res->full;
1704     tsfbp->focus = res->focus;
1705     tsfbp->lookup = res->lookup;
1706     tsfbp->genome = res->genome;
1707     tsfbp->next = NULL;
1708 
1709     tsfbp->bio_src.Reset(new objects::CBioSource);
1710     tsfbp->bio_src->Assign(*res->bio_src);
1711 
1712     tsfbp->orgname.Reset(new objects::COrgName);
1713     if (res->orgname.NotEmpty())
1714         tsfbp->orgname->Assign(*res->orgname);
1715 
1716     tsfbp->quals = res->quals;
1717     tsfbp->next = sfbp;
1718     sfbp = tsfbp;
1719 
1720     for (TQualVector::iterator cur = sfbp->quals.begin(); cur != sfbp->quals.end(); )
1721     {
1722         const std::string& cur_qual = (*cur)->GetQual();
1723         if (cur_qual == "focus")
1724         {
1725             ++cur;
1726             continue;
1727         }
1728 
1729         for(tsfbp = sfbp->next; tsfbp != NULL; tsfbp = tsfbp->next)
1730         {
1731             if(tsfbp == res || !tsfbp->useit)
1732                 continue;
1733 
1734             bool found = false;
1735             ITERATE(TQualVector, next, tsfbp->quals)
1736             {
1737                 const std::string& next_qual = (*next)->GetQual();
1738 
1739                 if (next_qual == "focus" || next_qual != cur_qual)
1740                     continue;
1741 
1742                 if (!(*cur)->IsSetVal() && !(*next)->IsSetVal())
1743                 {
1744                     found = true;
1745                     break;
1746                 }
1747 
1748                 if ((*cur)->IsSetVal() && (*next)->IsSetVal() &&
1749                     (*cur)->GetVal() == (*next)->GetVal())
1750                 {
1751                     found = true;
1752                     break;
1753                 }
1754             }
1755 
1756             if (!found)            /* Not found */
1757                 break;
1758         }
1759 
1760         if (tsfbp == NULL)               /* Got the match */
1761         {
1762             ++cur;
1763             continue;
1764         }
1765 
1766         cur = sfbp->quals.erase(cur);
1767     }
1768 
1769     return(SourceFeatRemoveDups(sfbp));
1770 }
1771 
1772 /**********************************************************/
PickTheDescrSource(SourceFeatBlkPtr sfbp)1773 static SourceFeatBlkPtr PickTheDescrSource(SourceFeatBlkPtr sfbp)
1774 {
1775     SourceFeatBlkPtr res;
1776     SourceFeatBlkPtr tsfbp;
1777 
1778     if(sfbp->next == NULL)
1779     {
1780         if(!sfbp->full)
1781         {
1782             ErrPostEx(SEV_WARNING, ERR_SOURCE_SingleSourceTooShort,
1783                       "Source feature does not span the entire length of the sequence.");
1784         }
1785         return(sfbp);
1786     }
1787 
1788     Int4 count_skip = 0;
1789     Int4 count_noskip = 0;
1790     bool same = true;
1791     for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1792     {
1793         if(StringICmp(tsfbp->name, sfbp->name) != 0)
1794         {
1795             same = false;
1796             break;
1797         }
1798 
1799         if(!tsfbp->skip)
1800         {
1801             res = tsfbp;
1802             count_noskip++;
1803         }
1804         else
1805             count_skip++;
1806     }
1807 
1808     if(same)
1809     {
1810         if(count_noskip == 1)
1811         {
1812             sfbp = SourceFeatMoveOneUp(sfbp, res);
1813             return(SourceFeatRemoveDups(sfbp));
1814         }
1815         for(res = NULL, tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1816         {
1817             if(count_noskip != 0 && tsfbp->skip)
1818                 continue;
1819             tsfbp->useit = true;
1820             if(res == NULL)
1821                 res = tsfbp;
1822         }
1823         return(SourceFeatDerive(sfbp, res));
1824     }
1825 
1826     for (tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1827     {
1828         if (tsfbp->tg)
1829             break;
1830     }
1831     if(tsfbp != NULL)
1832         return(SourceFeatMoveOneUp(sfbp, tsfbp));
1833 
1834     for(res = NULL, tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1835     {
1836         if(!tsfbp->focus)
1837             continue;
1838         res = tsfbp;
1839         if(!tsfbp->skip)
1840             break;
1841     }
1842 
1843     if(res != NULL)
1844     {
1845         count_skip = 0;
1846         count_noskip = 0;
1847         for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1848         {
1849             if(StringICmp(res->name, tsfbp->name) != 0)
1850                 continue;
1851             tsfbp->useit = true;
1852             if(tsfbp->skip)
1853                 count_skip++;
1854             else
1855                 count_noskip++;
1856         }
1857         if(count_noskip > 0)
1858         {
1859             for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1860             {
1861                 if(StringICmp(res->name, tsfbp->name) != 0)
1862                     continue;
1863                 if(res != tsfbp && tsfbp->skip)
1864                     tsfbp->useit = false;
1865             }
1866         }
1867         return(SourceFeatDerive(sfbp, res));
1868     }
1869 
1870     for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1871     {
1872         if(!tsfbp->full)
1873             continue;
1874         res = tsfbp;
1875         break;
1876     }
1877     if(res != NULL)
1878     {
1879         sfbp = SourceFeatMoveOneUp(sfbp, res);
1880         return(SourceFeatRemoveDups(sfbp));
1881     }
1882 
1883     SourceFeatBlkSetFree(sfbp);
1884     ErrPostEx(SEV_ERROR, ERR_SOURCE_MissingSourceFeatureForDescr,
1885               "Could not select the right source feature among different organisms to create descriptor: no /focus and 1..N one. Entry dropped.");
1886     return(NULL);
1887 }
1888 
1889 /**********************************************************/
AddOrgMod(objects::COrg_ref & org_ref,const Char * val,Uint1 type)1890 static void AddOrgMod(objects::COrg_ref& org_ref, const Char* val, Uint1 type)
1891 {
1892     objects::COrgName& orgname = org_ref.SetOrgname();
1893 
1894     CRef<objects::COrgMod> mod(new objects::COrgMod);
1895     mod->SetSubtype(type);
1896     mod->SetSubname((val == NULL) ? "" : val);
1897 
1898     orgname.SetMod().push_back(mod);
1899 }
1900 
1901 /**********************************************************/
FTASubSourceAdd(objects::CBioSource & bio,const Char * val,Uint1 type)1902 static void FTASubSourceAdd(objects::CBioSource& bio, const Char* val, Uint1 type)
1903 {
1904     if (type != 12)                      /* dev-stage */
1905     {
1906         bool found = false;
1907         ITERATE(objects::CBioSource::TSubtype, subtype, bio.GetSubtype())
1908         {
1909             if ((*subtype)->GetSubtype() == type)
1910             {
1911                 found = true;
1912                 break;
1913             }
1914         }
1915 
1916         if (found)
1917             return;
1918     }
1919 
1920     CRef<objects::CSubSource> sub(new objects::CSubSource);
1921     sub->SetSubtype(type);
1922     sub->SetName((val == NULL) ? "" : val);
1923     bio.SetSubtype().push_back(sub);
1924 }
1925 
1926 /**********************************************************/
CheckQualsInSourceFeat(objects::CBioSource & bio,TQualVector & quals,Uint1 taxserver)1927 static void CheckQualsInSourceFeat(objects::CBioSource& bio, TQualVector& quals,
1928                                    Uint1 taxserver)
1929 {
1930     const Char **b;
1931 
1932     char*    p;
1933 
1934     if (!bio.CanGetOrg())
1935         return;
1936 
1937     std::vector<std::string> modnames;
1938 
1939     if (bio.GetOrg().CanGetOrgname() && bio.GetOrg().GetOrgname().CanGetMod())
1940     {
1941         ITERATE(objects::COrgName::TMod, mod, bio.GetOrg().GetOrgname().GetMod())
1942         {
1943             for (size_t i = 0; SourceOrgMods[i].name != NULL; ++i)
1944             {
1945                 if(SourceOrgMods[i].num != (*mod)->GetSubtype())
1946                     continue;
1947 
1948                 modnames.push_back(SourceOrgMods[i].name);
1949                 break;
1950             }
1951         }
1952     }
1953 
1954     ITERATE(TQualVector, cur, quals)
1955     {
1956         if (!(*cur)->IsSetQual() || (*cur)->GetQual() == "organism")
1957             continue;
1958 
1959         const std::string& cur_qual = (*cur)->GetQual();
1960         const Char* val_ptr = (*cur)->IsSetVal() ? (*cur)->GetVal().c_str() : NULL;
1961 
1962         if (cur_qual == "note")
1963         {
1964             FTASubSourceAdd(bio, val_ptr, 255);
1965             continue;
1966         }
1967 
1968         for(b = SourceBadQuals; *b != NULL; b++)
1969         {
1970             if (cur_qual != *b)
1971                 continue;
1972 
1973             if (val_ptr == NULL || val_ptr[0] == '\0')
1974                 p = StringSave("???");
1975             else
1976                 p = StringSave(val_ptr);
1977             if(StringLen(p) > 50)
1978                 p[50] = '\0';
1979             ErrPostEx(SEV_WARNING, ERR_SOURCE_UnwantedQualifiers,
1980                       "Unwanted qualifier on source feature: %s=%s",
1981                       cur_qual.c_str(), p);
1982             MemFree(p);
1983         }
1984 
1985         b = SourceSubSources;
1986         for (size_t i = 1; *b != NULL; i++, b++)
1987         {
1988             if (**b != '\0' && cur_qual == *b)
1989             {
1990                 FTASubSourceAdd(bio, val_ptr, (Uint1)i);
1991                 break;
1992             }
1993         }
1994 
1995         if (cur_qual == "organism" ||
1996            (taxserver != 0 && cur_qual == "type_material"))
1997             continue;
1998 
1999         if (find(modnames.begin(), modnames.end(), cur_qual) != modnames.end())
2000             continue;
2001 
2002         for (size_t i = 0; SourceOrgMods[i].name != NULL; i++)
2003         {
2004             if (cur_qual == SourceOrgMods[i].name)
2005             {
2006                 AddOrgMod(bio.SetOrg(), val_ptr, SourceOrgMods[i].num);
2007                 break;
2008             }
2009         }
2010     }
2011 }
2012 
2013 /**********************************************************/
GetSourceDbtag(CRef<objects::CGb_qual> & qual,Parser::ESource source)2014 static CRef<objects::CDbtag> GetSourceDbtag(CRef<objects::CGb_qual>& qual, Parser::ESource source)
2015 {
2016     const char **b;
2017     const char *q;
2018     char*    line;
2019     char*    p;
2020 
2021     CRef<objects::CDbtag> tag;
2022 
2023     if (qual->GetQual() != "db_xref")
2024         return tag;
2025 
2026     std::vector<Char> val_buf(qual->GetVal().begin(), qual->GetVal().end());
2027     val_buf.push_back(0);
2028 
2029     p = StringChr(&val_buf[0], ':');
2030     if(p == NULL || p[1] == '\0')
2031         return tag;
2032 
2033     *p = '\0';
2034     if (StringICmp(&val_buf[0], "taxon") == 0)
2035     {
2036         *p = ':';
2037         return tag;
2038     }
2039 
2040     if(source == Parser::ESource::NCBI)
2041         q = "NCBI";
2042     else if(source == Parser::ESource::EMBL)
2043         q = "EMBL";
2044     else if(source == Parser::ESource::DDBJ)
2045         q = "DDBJ";
2046     else if(source == Parser::ESource::SPROT)
2047         q = "SwissProt";
2048     else if(source == Parser::ESource::PIR)
2049         q = "PIR";
2050     else if(source == Parser::ESource::LANL)
2051         q = "LANL";
2052     else if(source == Parser::ESource::Refseq)
2053         q = "RefSeq";
2054     else
2055         q = "Unknown";
2056 
2057     if(source != Parser::ESource::NCBI && source != Parser::ESource::DDBJ &&
2058        source != Parser::ESource::EMBL && source != Parser::ESource::LANL &&
2059        source != Parser::ESource::Refseq)
2060     {
2061         *p = ':';
2062         ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidDbXref,
2063                   "Cannot process source feature's \"/db_xref=%s\" for source \"%s\".",
2064                   &val_buf[0], q);
2065         return tag;
2066     }
2067 
2068     for (b = ObsoleteSourceDbxrefTag; *b != NULL; b++)
2069     {
2070         if (StringICmp(*b, &val_buf[0]) == 0)
2071             break;
2072     }
2073 
2074     if(*b != NULL)
2075     {
2076         ErrPostEx(SEV_WARNING, ERR_SOURCE_ObsoleteDbXref,
2077                   "/db_xref type \"%s\" is obsolete.", &val_buf[0]);
2078         if (StringICmp(&val_buf[0], "IFO") == 0)
2079         {
2080             line = (char*) MemNew(25 + StringLen(p + 1));
2081             StringCpy(line, "NBRC:");
2082             StringCat(line, p + 1);
2083             qual->SetVal(line);
2084             MemFree(line);
2085 
2086             val_buf.assign(line, line + StringLen(line));
2087             val_buf.push_back(0);
2088 
2089             p = &val_buf[0] + 4;
2090             *p = '\0';
2091         }
2092     }
2093 
2094     for (b = DENLRSourceDbxrefTag; *b != NULL; b++)
2095     {
2096         if (StringICmp(*b, &val_buf[0]) == 0)
2097             break;
2098     }
2099 
2100     if(*b == NULL && (source == Parser::ESource::DDBJ || source == Parser::ESource::EMBL))
2101     {
2102         for(b = DESourceDbxrefTag; *b != NULL; b++)
2103             if (StringICmp(*b, &val_buf[0]) == 0)
2104                 break;
2105     }
2106     if(*b == NULL && source == Parser::ESource::EMBL)
2107     {
2108         for(b = ESourceDbxrefTag; *b != NULL; b++)
2109             if (StringICmp(*b, &val_buf[0]) == 0)
2110                 break;
2111     }
2112     if(*b == NULL && (source == Parser::ESource::NCBI || source == Parser::ESource::LANL ||
2113                       source == Parser::ESource::Refseq))
2114     {
2115         for (b = NLRSourceDbxrefTag; *b != NULL; b++)
2116         {
2117             if (StringICmp(*b, &val_buf[0]) == 0)
2118                 break;
2119         }
2120     }
2121 
2122     if(*b == NULL)
2123     {
2124         *p = ':';
2125         ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidDbXref,
2126                   "Invalid database name in source feature's \"/db_xref=%s\" for source \"%s\".",
2127                   &val_buf[0], q);
2128         return tag;
2129     }
2130 
2131     tag.Reset(new objects::CDbtag);
2132     tag->SetDb(&val_buf[0]);
2133 
2134     *p++ = ':';
2135     for(q = p; *p >= '0' && *p <= '9';)
2136          p++;
2137 
2138     if(*p == '\0' && *q != '0')
2139         tag->SetTag().SetId(atoi(q));
2140     else
2141         tag->SetTag().SetStr(q);
2142 
2143     return tag;
2144 }
2145 
2146 /**********************************************************/
UpdateRawBioSource(SourceFeatBlkPtr sfbp,Parser::ESource source,IndexblkPtr ibp,Uint1 taxserver)2147 static bool UpdateRawBioSource(SourceFeatBlkPtr sfbp, Parser::ESource source, IndexblkPtr ibp, Uint1 taxserver)
2148 {
2149     char*      div;
2150     char*      tco;
2151     char*      p;
2152     char*      q;
2153 
2154     Int4         newgen;
2155     Int4         oldgen;
2156     Int2         i;
2157 
2158     bool is_syn = false;
2159     bool is_pat = false;
2160 
2161     div = ibp->division;
2162     if(div != NULL)
2163     {
2164         if(StringCmp(div, "SYN") == 0)
2165             is_syn = true;
2166         else if(StringCmp(div, "PAT") == 0)
2167             is_pat = true;
2168     }
2169     for(; sfbp != NULL; sfbp = sfbp->next)
2170     {
2171         if (sfbp->bio_src.Empty())
2172             continue;
2173 
2174         objects::CBioSource& bio = *sfbp->bio_src;
2175 
2176         if(!sfbp->lookup)
2177         {
2178             if(is_syn && !sfbp->tg)
2179                 bio.SetOrigin(4);        /* artificial */
2180         }
2181         else
2182         {
2183             if (bio.CanGetOrg() && bio.GetOrg().CanGetOrgname() &&
2184                 bio.GetOrg().GetOrgname().CanGetDiv() &&
2185                 bio.GetOrg().GetOrgname().GetDiv() == "SYN")
2186             {
2187                 bio.SetOrigin(4);        /* artificial */
2188                 if (is_syn == false && is_pat == false)
2189                 {
2190                     const Char* taxname = NULL;
2191                     if (bio.GetOrg().CanGetTaxname() &&
2192                         !bio.GetOrg().GetTaxname().empty())
2193                         taxname = bio.GetOrg().GetTaxname().c_str();
2194                     ErrPostEx(SEV_ERROR, ERR_ORGANISM_SynOrgNameNotSYNdivision,
2195                               "The NCBI Taxonomy DB believes that organism name \"%s\" is reserved for synthetic sequences, but this record is not in the SYN division.",
2196                               (taxname == NULL) ? "not_specified" : taxname);
2197                 }
2198             }
2199         }
2200 
2201         newgen = -1;
2202         oldgen = -1;
2203 
2204         bool dropped = false;
2205         NON_CONST_ITERATE(TQualVector, cur, sfbp->quals)
2206         {
2207             if (!(*cur)->IsSetQual() || (*cur)->GetQual().empty())
2208                 continue;
2209 
2210             const std::string& cur_qual = (*cur)->GetQual();
2211             if (cur_qual == "db_xref")
2212             {
2213                 CRef<objects::CDbtag> dbtag = GetSourceDbtag(*cur, source);
2214                 if (dbtag.Empty())
2215                     continue;
2216 
2217                 bio.SetOrg().SetDb().push_back(dbtag);
2218                 continue;
2219             }
2220 
2221             const Char* val_ptr = (*cur)->IsSetVal() ? (*cur)->GetVal().c_str() : NULL;
2222             if (cur_qual == "organelle")
2223             {
2224                 if (val_ptr == NULL || val_ptr[0] == '\0')
2225                     continue;
2226 
2227                 p = StringChr(val_ptr, ':');
2228                 if (p != NULL)
2229                 {
2230                     if (StringChr(p + 1, ':') != NULL)
2231                     {
2232                         ErrPostEx(SEV_ERROR, ERR_SOURCE_OrganelleQualMultToks,
2233                                   "More than 2 tokens found in /organelle qualifier: \"%s\". Entry dropped.",
2234                                   val_ptr);
2235                         dropped = true;
2236                         break;
2237                     }
2238 
2239                     std::string val_str(val_ptr, static_cast<const Char*>(p));
2240                     i = StringMatchIcase(OrganelleFirstToken, val_str.c_str());
2241                     if(i < 0)
2242                     {
2243                         ErrPostEx(SEV_ERROR, ERR_SOURCE_OrganelleIllegalClass,
2244                                   "Illegal class in /organelle qualifier: \"%s\". Entry dropped.",
2245                                   val_ptr);
2246                         dropped = true;
2247                         break;
2248                     }
2249                     if(i == 4)
2250                         ibp->got_plastid = true;
2251                     if(newgen < 0)
2252                         newgen = StringMatchIcase(GenomicSourceFeatQual,
2253                                                   p + 1);
2254                 }
2255                 else
2256                 {
2257                     i = StringMatchIcase(OrganelleFirstToken, val_ptr);
2258                     if(i < 0)
2259                     {
2260                         ErrPostEx(SEV_ERROR, ERR_SOURCE_OrganelleIllegalClass,
2261                                   "Illegal class in /organelle qualifier: \"%s\". Entry dropped.",
2262                                   val_ptr);
2263                         dropped = true;
2264                         break;
2265                     }
2266                     if(i == 4)
2267                         ibp->got_plastid = true;
2268                     if(newgen < 0)
2269                         newgen = StringMatchIcase(GenomicSourceFeatQual,
2270                                                   val_ptr);
2271                 }
2272                 continue;
2273             }
2274 
2275             if(oldgen < 0)
2276                 oldgen = StringMatchIcase(GenomicSourceFeatQual, cur_qual.c_str());
2277 
2278             if (cur_qual != "country" ||
2279                 val_ptr == NULL || val_ptr[0] == '\0')
2280                 continue;
2281 
2282             tco = StringSave(val_ptr);
2283             p = StringChr(tco, ':');
2284             if(p != NULL)
2285                 *p = '\0';
2286             for(p = tco; *p == ' ' || *p == '\t';)
2287                 p++;
2288             if(*p == '\0')
2289             {
2290                 ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidCountry,
2291                           "Empty country name in /country qualifier : \"%s\".",
2292                           val_ptr);
2293             }
2294             else
2295             {
2296                 for(q = p + 1; *q != '\0';)
2297                     q++;
2298                 for(q--; *q == ' ' || *q == '\t';)
2299                     q--;
2300                 *++q = '\0';
2301 
2302                 bool valid_country = objects::CCountries::IsValid(p);
2303                 if (!valid_country)
2304                 {
2305                     valid_country = objects::CCountries::WasValid(p);
2306 
2307                     if (!valid_country)
2308                         ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidCountry,
2309                         "Country \"%s\" from /country qualifier \"%s\" is not a valid country name.",
2310                         tco, val_ptr);
2311                     else
2312                         ErrPostEx(SEV_WARNING, ERR_SOURCE_FormerCountry,
2313                         "Country \"%s\" from /country qualifier \"%s\" is a former country name which is no longer valid.",
2314                         tco, val_ptr);
2315                 }
2316             }
2317 
2318             MemFree(tco);
2319             FTASubSourceAdd(bio, val_ptr, 23);
2320         }
2321 
2322         if (dropped)
2323             break;
2324 
2325         if (newgen > -1)
2326             bio.SetGenome(newgen);
2327         else if (oldgen > -1)
2328             bio.SetGenome(oldgen);
2329         else if (sfbp->genome != 0)
2330             bio.SetGenome(sfbp->genome);
2331 
2332         CheckQualsInSourceFeat(bio, sfbp->quals, taxserver);
2333         fta_sort_biosource(bio);
2334     }
2335 
2336     if(sfbp != NULL)
2337         return false;
2338 
2339     return true;
2340 }
2341 
2342 
2343 /**********************************************************/
is_a_space_char(Char c)2344 static bool is_a_space_char(Char c)
2345 {
2346     return c == ' ' || c == '\t';
2347 }
2348 
2349 /**********************************************************/
CompareDescrFeatSources(SourceFeatBlkPtr sfbp,const objects::CBioseq & bioseq)2350 static void CompareDescrFeatSources(SourceFeatBlkPtr sfbp, const objects::CBioseq& bioseq)
2351 {
2352     SourceFeatBlkPtr tsfbp;
2353 
2354     if(sfbp == NULL || !bioseq.IsSetDescr())
2355         return;
2356 
2357     ITERATE(objects::CSeq_descr::Tdata, descr, bioseq.GetDescr().Get())
2358     {
2359         if (!(*descr)->IsSource())
2360             continue;
2361 
2362         const objects::CBioSource& bio_src = (*descr)->GetSource();
2363 
2364         if (!bio_src.IsSetOrg() || !bio_src.GetOrg().IsSetTaxname() ||
2365             bio_src.GetOrg().GetTaxname().empty())
2366             continue;
2367 
2368         const std::string& taxname = bio_src.GetOrg().GetTaxname();
2369         std::string orgdescr;
2370         std::remove_copy_if(taxname.begin(), taxname.end(), std::back_inserter(orgdescr), is_a_space_char);
2371 
2372         std::string commdescr;
2373         if (bio_src.GetOrg().IsSetCommon())
2374         {
2375             const std::string& common = bio_src.GetOrg().GetCommon();
2376             std::remove_copy_if(common.begin(), common.end(), std::back_inserter(commdescr), is_a_space_char);
2377         }
2378 
2379         for (tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
2380         {
2381             if (tsfbp->name == NULL || tsfbp->name[0] == '\0')
2382                 continue;
2383 
2384             size_t name_len = strlen(tsfbp->name);
2385             std::string orgfeat;
2386             std::remove_copy_if(tsfbp->name, tsfbp->name + name_len, std::back_inserter(orgfeat), is_a_space_char);
2387 
2388             if(StringICmp(orgdescr.c_str(), "unknown") == 0)
2389             {
2390                 if(StringICmp(orgdescr.c_str(), orgfeat.c_str()) == 0 ||
2391                    (!commdescr.empty() && StringICmp(commdescr.c_str(), orgfeat.c_str()) == 0))
2392                 {
2393                     break;
2394                 }
2395             }
2396             else
2397             {
2398                 if (orgdescr == orgfeat || commdescr == orgfeat)
2399                 {
2400                     break;
2401                 }
2402             }
2403         }
2404 
2405         if(tsfbp == NULL)
2406         {
2407             ErrPostEx(SEV_ERROR, ERR_ORGANISM_NoSourceFeatMatch,
2408                       "Organism name \"%s\" from OS/ORGANISM line does not exist in this record's source features.",
2409                       taxname.c_str());
2410         }
2411     }
2412 }
2413 
2414 /**********************************************************/
CheckSourceLineage(SourceFeatBlkPtr sfbp,Parser::ESource source,bool is_pat)2415 static bool CheckSourceLineage(SourceFeatBlkPtr sfbp, Parser::ESource source, bool is_pat)
2416 {
2417     const Char* p;
2418     ErrSev  sev;
2419 
2420     for(; sfbp != NULL; sfbp = sfbp->next)
2421     {
2422         if(!sfbp->lookup || sfbp->bio_src.Empty() || !sfbp->bio_src->IsSetOrg())
2423             continue;
2424 
2425         p = NULL;
2426         if (sfbp->bio_src->GetOrg().IsSetOrgname() &&
2427             sfbp->bio_src->GetOrg().GetOrgname().IsSetLineage())
2428             p = sfbp->bio_src->GetOrg().GetOrgname().GetLineage().c_str();
2429 
2430         if (p == NULL || *p == '\0')
2431         {
2432             if ((source == Parser::ESource::DDBJ || source == Parser::ESource::EMBL) && is_pat)
2433                 sev = SEV_WARNING;
2434             else
2435                 sev = SEV_REJECT;
2436             ErrPostEx(sev, ERR_SERVER_NoLineageFromTaxon,
2437                       "Taxonomy lookup for organism name \"%s\" yielded an Org-ref that has no lineage.",
2438                       sfbp->name);
2439             if(sev == SEV_REJECT)
2440                 break;
2441         }
2442     }
2443     if(sfbp == NULL)
2444         return true;
2445     return false;
2446 }
2447 
2448 /**********************************************************/
PropogateSuppliedLineage(objects::CBioseq & bioseq,SourceFeatBlkPtr sfbp,Uint1 taxserver)2449 static void PropogateSuppliedLineage(objects::CBioseq& bioseq,
2450                                      SourceFeatBlkPtr sfbp, Uint1 taxserver)
2451 {
2452     SourceFeatBlkPtr tsfbp;
2453 
2454     const Char       *p;
2455 
2456     if (!bioseq.IsSetDescr() || sfbp == NULL)
2457         return;
2458 
2459     for(; sfbp != NULL; sfbp = sfbp->next)
2460     {
2461         if(sfbp->lookup || sfbp->bio_src.Empty() ||
2462            !sfbp->bio_src->IsSetOrg() || !sfbp->bio_src->GetOrg().IsSetTaxname() ||
2463            sfbp->name == NULL || *sfbp->name == '\0' ||
2464            sfbp->bio_src->GetOrg().GetTaxname().empty())
2465             continue;
2466 
2467         objects::COrgName& orgname = sfbp->bio_src->SetOrg().SetOrgname();
2468 
2469         if (orgname.IsSetLineage())
2470         {
2471             if (!orgname.GetLineage().empty())
2472                 continue;
2473 
2474             orgname.ResetLineage();
2475         }
2476 
2477         const std::string& taxname = sfbp->bio_src->GetOrg().GetTaxname();
2478         std::string lineage;
2479 
2480         bool found = false;
2481         ITERATE(objects::CSeq_descr::Tdata, descr, bioseq.GetDescr().Get())
2482         {
2483             if (!(*descr)->IsSource())
2484                 continue;
2485 
2486             const objects::CBioSource& bio_src = (*descr)->GetSource();
2487 
2488             if (!bio_src.IsSetOrg() || !bio_src.GetOrg().IsSetOrgname() ||
2489                 !bio_src.GetOrg().IsSetTaxname() || bio_src.GetOrg().GetTaxname().empty() ||
2490                 !bio_src.GetOrg().GetOrgname().IsSetLineage())
2491                 continue;
2492 
2493             lineage = bio_src.GetOrg().GetOrgname().GetLineage();
2494             const std::string& cur_taxname = bio_src.GetOrg().GetTaxname();
2495 
2496             if (StringICmp(cur_taxname.c_str(), taxname.c_str()) == 0)
2497             {
2498                 found = true;
2499                 break;
2500             }
2501         }
2502 
2503         if (!found)
2504         {
2505             ErrPostEx((taxserver == 0) ? SEV_INFO : SEV_WARNING,
2506                       ERR_ORGANISM_UnclassifiedLineage,
2507                       "Taxonomy lookup for organism name \"%s\" failed, and no matching organism exists in OS/ORGANISM lines, so lineage has been set to \"Unclassified\".",
2508                       taxname.c_str());
2509             p = "Unclassified";
2510         }
2511         else
2512         {
2513             if (lineage.empty())
2514             {
2515                 ErrPostEx((taxserver == 0) ? SEV_INFO : SEV_WARNING,
2516                           ERR_ORGANISM_UnclassifiedLineage,
2517                           "Taxonomy lookup for organism name \"%s\" failed, and the matching organism from OS/ORGANISM lines has no lineage, so lineage has been set to \"Unclassified\".",
2518                           taxname.c_str());
2519                 p = "Unclassified";
2520             }
2521             else
2522                 p = lineage.c_str();
2523         }
2524 
2525         orgname.SetLineage(p);
2526         for(tsfbp = sfbp->next; tsfbp != NULL; tsfbp = tsfbp->next)
2527         {
2528             if (tsfbp->lookup || tsfbp->bio_src.Empty() ||
2529                 !tsfbp->bio_src->IsSetOrg() || !tsfbp->bio_src->GetOrg().IsSetTaxname() ||
2530                 tsfbp->name == NULL || *tsfbp->name == '\0' ||
2531                 tsfbp->bio_src->GetOrg().GetTaxname().empty() ||
2532                 StringICmp(sfbp->name, tsfbp->name) != 0)
2533 
2534                 continue;
2535 
2536             objects::COrgName& torgname = tsfbp->bio_src->SetOrg().SetOrgname();
2537 
2538             if (torgname.IsSetLineage())
2539             {
2540                 if (!torgname.GetLineage().empty())
2541                     continue;
2542             }
2543             torgname.SetLineage(p);
2544         }
2545     }
2546 }
2547 
2548 /**********************************************************/
CheckMoltypeConsistency(SourceFeatBlkPtr sfbp,char ** moltype)2549 static bool CheckMoltypeConsistency(SourceFeatBlkPtr sfbp, char** moltype)
2550 {
2551     SourceFeatBlkPtr tsfbp;
2552     char*          name;
2553     char*          p;
2554     bool             ret;
2555     Char             ch;
2556 
2557     if(sfbp == NULL)
2558         return true;
2559 
2560     for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
2561         if(tsfbp->moltype != NULL)
2562             break;
2563 
2564     if(tsfbp == NULL)
2565         return true;
2566 
2567     name = tsfbp->moltype;
2568     for(ret = true, tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
2569     {
2570         if(tsfbp->moltype == NULL)
2571         {
2572             ch = '\0';
2573             p = tsfbp->location;
2574             if(p != NULL && StringLen(p) > 50)
2575             {
2576                 ch = p[50];
2577                 p[50] = '\0';
2578             }
2579             ErrPostEx(SEV_ERROR, ERR_SOURCE_MissingMolType,
2580                       "Source feature at \"%s\" lacks a /mol_type qualifier.",
2581                       (p == NULL) ? "<empty>" : p);
2582             if(ch != '\0')
2583                 p[50] = ch;
2584         }
2585         else if(ret && StringCmp(name, tsfbp->moltype) != 0)
2586             ret = false;
2587     }
2588 
2589     if(ret)
2590         *moltype = StringSave(name);
2591 
2592     return(ret);
2593 }
2594 
2595 /**********************************************************/
CheckForENV(SourceFeatBlkPtr sfbp,IndexblkPtr ibp,Parser::ESource source)2596 static bool CheckForENV(SourceFeatBlkPtr sfbp, IndexblkPtr ibp, Parser::ESource source)
2597 {
2598     const char **b;
2599 
2600     char*    location;
2601     Int4       sources;
2602     Int4       envs;
2603     Char       ch;
2604 
2605     if(sfbp == NULL || ibp == NULL)
2606         return true;
2607 
2608     bool skip = false;
2609     location = NULL;
2610     ibp->env_sample_qual = false;
2611     for(envs = 0, sources = 0; sfbp != NULL; sfbp = sfbp->next, sources++)
2612     {
2613         bool env_found = false;
2614         ITERATE(TQualVector, cur, sfbp->quals)
2615         {
2616             if ((*cur)->IsSetQual() && (*cur)->GetQual() == "environmental_sample")
2617             {
2618                 env_found = true;
2619                 break;
2620             }
2621         }
2622         if (env_found)
2623             envs++;
2624         else
2625             location = sfbp->location;
2626 
2627         if(!sfbp->full || sfbp->name == NULL || sfbp->name[0] == '\0')
2628             continue;
2629 
2630         for (b = special_orgs; *b != NULL; b++)
2631         {
2632             if (StringICmp(*b, sfbp->name) == 0)
2633                 break;
2634         }
2635         if(*b != NULL)
2636             skip = true;
2637     }
2638 
2639     if(envs > 0)
2640     {
2641         ibp->env_sample_qual = true;
2642         if(!skip && envs != sources)
2643         {
2644             if(location != NULL && StringLen(location) > 50)
2645             {
2646                 ch = location[50];
2647                 location[50] = '\0';
2648             }
2649             else
2650                 ch = '\0';
2651             ErrPostEx(SEV_REJECT, ERR_SOURCE_InconsistentEnvSampQual,
2652                       "Inconsistent /environmental_sample qualifier usage. Source feature at location \"%s\" lacks the qualifier.",
2653                       (location == NULL) ? "unknown" : location);
2654             if(ch != '\0')
2655                 location[50] = ch;
2656             return false;
2657         }
2658     }
2659     else if(StringICmp(ibp->division, "ENV") == 0)
2660     {
2661         if(source == Parser::ESource::EMBL)
2662             ErrPostEx(SEV_ERROR, ERR_SOURCE_MissingEnvSampQual,
2663                       "This ENV division record has source features that lack the /environmental_sample qualifier. It will not be placed in the ENV division until the qualifier is added.");
2664         else
2665         {
2666             ErrPostEx(SEV_REJECT, ERR_SOURCE_MissingEnvSampQual,
2667                       "This ENV division record has source features that lack the /environmental_sample qualifier.");
2668             return false;
2669         }
2670     }
2671     return true;
2672 }
2673 
2674 /**********************************************************/
CheckPcrPrimersTag(char * str)2675 static char* CheckPcrPrimersTag(char* str)
2676 {
2677     if(StringNCmp(str, "fwd_name", 8) == 0 ||
2678        StringNCmp(str, "rev_name", 8) == 0)
2679         str += 8;
2680     else if(StringNCmp(str, "fwd_seq", 7) == 0 ||
2681             StringNCmp(str, "rev_seq", 7) == 0)
2682         str += 7;
2683     else
2684         return(NULL);
2685 
2686     if(*str == ' ')
2687         str++;
2688     if(*str == ':')
2689         return(str + 1);
2690     return(NULL);
2691 }
2692 
2693 /**********************************************************/
PopulatePcrPrimers(objects::CBioSource & bio,PcrPrimersPtr ppp,Int4 count)2694 static void PopulatePcrPrimers(objects::CBioSource& bio, PcrPrimersPtr ppp, Int4 count)
2695 {
2696     PcrPrimersPtr tppp;
2697 
2698     char*       str_fs;
2699     char*       str_rs;
2700     char*       str_fn;
2701     char*       str_rn;
2702     Int4          num_fn;
2703     Int4          num_rn;
2704 
2705     if (ppp == NULL || count < 1)
2706         return;
2707 
2708     objects::CBioSource::TSubtype& subs = bio.SetSubtype();
2709     CRef<objects::CSubSource> sub;
2710 
2711     if (count == 1)
2712     {
2713         sub.Reset(new objects::CSubSource);
2714         sub->SetSubtype(33);
2715         sub->SetName(ppp->fwd_seq);
2716         subs.push_back(sub);
2717 
2718         sub.Reset(new objects::CSubSource);
2719         sub->SetSubtype(34);
2720         sub->SetName(ppp->rev_seq);
2721         subs.push_back(sub);
2722 
2723         if(ppp->fwd_name != NULL && ppp->fwd_name[0] != '\0')
2724         {
2725             sub.Reset(new objects::CSubSource);
2726             sub->SetSubtype(35);
2727             sub->SetName(ppp->fwd_name);
2728             subs.push_back(sub);
2729         }
2730 
2731         if(ppp->rev_name != NULL && ppp->rev_name[0] != '\0')
2732         {
2733             sub.Reset(new objects::CSubSource);
2734             sub->SetSubtype(36);
2735             sub->SetName(ppp->rev_name);
2736             subs.push_back(sub);
2737         }
2738         return;
2739     }
2740 
2741     size_t len_fs = 2,
2742            len_rs = 2,
2743            len_fn = 0,
2744            len_rn = 0;
2745     num_fn = 0;
2746     num_rn = 0;
2747     for(tppp = ppp; tppp != NULL; tppp = tppp->next)
2748     {
2749         len_fs += (StringLen(tppp->fwd_seq) + 1);
2750         len_rs += (StringLen(tppp->rev_seq) + 1);
2751         if(tppp->fwd_name != NULL && tppp->fwd_name[0] != '\0')
2752         {
2753             len_fn += (StringLen(tppp->fwd_name) + 1);
2754             num_fn++;
2755         }
2756         if(tppp->rev_name != NULL && tppp->rev_name[0] != '\0')
2757         {
2758             len_rn += (StringLen(tppp->rev_name) + 1);
2759             num_rn++;
2760         }
2761     }
2762 
2763     str_fs = (char*) MemNew(len_fs);
2764     str_rs = (char*) MemNew(len_rs);
2765     str_fn = (len_fn == 0) ? NULL : (char*) MemNew(len_fn + count -
2766                                                      num_fn + 2);
2767     str_rn = (len_rn == 0) ? NULL : (char*) MemNew(len_rn + count -
2768                                                      num_rn + 2);
2769 
2770     for(tppp = ppp; tppp != NULL; tppp = tppp->next)
2771     {
2772         StringCat(str_fs, ",");
2773         StringCat(str_fs, tppp->fwd_seq);
2774         StringCat(str_rs, ",");
2775         StringCat(str_rs, tppp->rev_seq);
2776         if(str_fn != NULL)
2777         {
2778             StringCat(str_fn, ",");
2779             if(tppp->fwd_name != NULL && tppp->fwd_name[0] != '\0')
2780                 StringCat(str_fn, tppp->fwd_name);
2781         }
2782         if(str_rn != NULL)
2783         {
2784             StringCat(str_rn, ",");
2785             if(tppp->rev_name != NULL && tppp->rev_name[0] != '\0')
2786                 StringCat(str_rn, tppp->rev_name);
2787         }
2788     }
2789 
2790     str_fs[0] = '(';
2791     StringCat(str_fs, ")");
2792 
2793     sub.Reset(new objects::CSubSource);
2794     sub->SetSubtype(33);
2795     sub->SetName(str_fs);
2796     subs.push_back(sub);
2797 
2798     str_rs[0] = '(';
2799     StringCat(str_rs, ")");
2800 
2801     sub.Reset(new objects::CSubSource);
2802     sub->SetSubtype(34);
2803     sub->SetName(str_rs);
2804     subs.push_back(sub);
2805 
2806     if(str_fn != NULL)
2807     {
2808         str_fn[0] = '(';
2809         StringCat(str_fn, ")");
2810 
2811         sub.Reset(new objects::CSubSource);
2812         sub->SetSubtype(35);
2813         sub->SetName(str_fn);
2814         subs.push_back(sub);
2815     }
2816 
2817     if(str_rn != NULL)
2818     {
2819         str_rn[0] = '(';
2820         StringCat(str_rn, ")");
2821 
2822         sub.Reset(new objects::CSubSource);
2823         sub->SetSubtype(36);
2824         sub->SetName(str_rn);
2825         subs.push_back(sub);
2826     }
2827 }
2828 
2829 /**********************************************************/
PcrPrimersFree(PcrPrimersPtr ppp)2830 static void PcrPrimersFree(PcrPrimersPtr ppp)
2831 {
2832     PcrPrimersPtr next;
2833 
2834     for(; ppp != NULL; ppp = next)
2835     {
2836         next = ppp->next;
2837         if(ppp->fwd_name != NULL)
2838             MemFree(ppp->fwd_name);
2839         if(ppp->fwd_seq != NULL)
2840             MemFree(ppp->fwd_seq);
2841         if(ppp->rev_name != NULL)
2842             MemFree(ppp->rev_name);
2843         if(ppp->rev_seq != NULL)
2844             MemFree(ppp->rev_seq);
2845         MemFree(ppp);
2846     }
2847 }
2848 
2849 /**********************************************************/
/*
 * Parses the /PCR_primers qualifiers of every source feature in the list
 * and stores the result on the feature's BioSource via PopulatePcrPrimers().
 *
 * Each non-empty /PCR_primers value yields one PcrPrimers node.  The value
 * is scanned as a sequence of "tag: value" fields (tags as recognized by
 * CheckPcrPrimersTag), separated by commas.  A repeated field within one
 * qualifier is joined to the earlier value with ':'.
 *
 * A SEV_REJECT error is posted and parsing stops when a value:
 *   - does not start with a recognized tag,
 *   - has a comma embedded in a field value,
 *   - has a forward field after a reverse field,
 *   - has a name that is not followed by the matching sequence,
 *   - has an empty field value, or
 *   - lacks fwd_seq or rev_seq.
 *
 * Returns true if the whole list was processed, false if parsing stopped
 * on a malformed qualifier.
 */
static bool ParsePcrPrimers(SourceFeatBlkPtr sfbp)
{
    PcrPrimersPtr ppp;
    PcrPrimersPtr tppp;

    char*       p;
    char*       q;
    char*       r;
    char*       s;
    bool          comma;
    bool          bad_start;
    bool          empty;
    Char          ch;
    Int4          count;
    Int4          prev;                 /* 1 = fwd_name, 2 = fwd_seq,
                                           3 = rev_name, 4 = rev_seq;
                                           -1 = forward field found after
                                                a reverse one,
                                           -2 = a name without its
                                                sequence */

    bool got_problem = false;
    for(ppp = NULL; sfbp != NULL; sfbp = sfbp->next)
    {
        /* Nothing to do without qualifiers or without a BioSource. */
        if (sfbp->quals.empty() || sfbp->bio_src.Empty())
            continue;

        /* count = number of /PCR_primers qualifiers on this feature */
        count = 0;
        ITERATE(TQualVector, cur, sfbp->quals)
        {
            if((*cur)->GetQual() != "PCR_primers" ||
               !(*cur)->IsSetVal() || (*cur)->GetVal().empty())
                continue;

            count++;
            /* Append a fresh node for this qualifier; tppp is the tail. */
            if(ppp == NULL)
            {
                ppp = (PcrPrimersPtr) MemNew(sizeof(PcrPrimers));
                tppp = ppp;
            }
            else
            {
                tppp->next = (PcrPrimersPtr) MemNew(sizeof(PcrPrimers));
                tppp = tppp->next;
            }

            prev = 0;
            /* Work on a NUL-terminated copy: the scan below temporarily
               writes '\0' into the text to cut out each field value. */
            std::vector<Char> val_buf((*cur)->GetVal().begin(), (*cur)->GetVal().end());
            val_buf.push_back(0);

            /* p = start of the current field (its tag) */
            for(comma = false, bad_start = false, p = &val_buf[0]; *p != '\0';)
            {
                /* q = text right after "tag:", or NULL if p is not at a tag */
                q = CheckPcrPrimersTag(p);
                if(q == NULL)
                {
                    /* Not at the very beginning: skip one character and
                       try again. */
                    if (p != &val_buf[0])
                    {
                        p++;
                        continue;
                    }
                    /* The qualifier does not start with a known tag. */
                    bad_start = true;
                    break;
                }

                if(*q == ' ')
                    q++;
                /* Find the end of this field's value: the first comma that
                   is followed (after an optional blank) by another tag.
                   r ends up at that next tag, or NULL for the last field. */
                for(r = q;;)
                {
                    r = StringChr(r, ',');
                    if(r == NULL)
                        break;
                    if(*++r == ' ')
                        r++;
                    if(CheckPcrPrimersTag(r) != NULL)
                        break;
                }
                if(r != NULL)
                {
                    /* Step back from the next tag to the separating comma
                       (and over one blank before it, if any), remember the
                       character found there and cut the value at it. */
                    r--;
                    if(*r == ' ')
                        r--;
                    if(r > q && *(r - 1) == ' ')
                        r--;
                    ch = *r;
                    *r = '\0';
                }

                /* Any comma still inside the value is an embedded comma. */
                if(StringChr(q, ',') != NULL)
                    comma = true;

                empty = false;
                if(q == NULL || *q == '\0')
                    empty = true;
                else if(StringNCmp(p, "fwd_name", 8) == 0)
                {
                    if(prev == 1)
                        prev = -2;      /* two fwd_name in a row */
                    else if(prev > 2 && prev < 5)
                        prev = -1;      /* forward field after a reverse one */
                    else
                    {
                        /* First value is stored as is; later ones are
                           appended after a ':' separator. */
                        if(tppp->fwd_name == NULL)
                            tppp->fwd_name = StringSave(q);
                        else
                        {
                            s = (char*) MemNew(StringLen(tppp->fwd_name) +
                                                 StringLen(q) + 2);
                            StringCpy(s, tppp->fwd_name);
                            StringCat(s, ":");
                            StringCat(s, q);
                            MemFree(tppp->fwd_name);
                            tppp->fwd_name = s;
                        }
                        prev = 1;
                    }
                }
                else if(StringNCmp(p, "fwd_seq", 7) == 0)
                {
                    if(prev > 2 && prev < 5)
                        prev = -1;      /* forward field after a reverse one */
                    else
                    {
                        if(tppp->fwd_seq == NULL)
                            tppp->fwd_seq = StringSave(q);
                        else
                        {
                            s = (char*) MemNew(StringLen(tppp->fwd_seq) +
                                                 StringLen(q) + 2);
                            StringCpy(s, tppp->fwd_seq);
                            StringCat(s, ":");
                            StringCat(s, q);
                            MemFree(tppp->fwd_seq);
                            tppp->fwd_seq = s;
                            /* An additional sequence that was not preceded
                               by its own fwd_name: append a bare ':' to the
                               name (presumably to keep names positionally
                               aligned with sequences - confirm). */
                            if(prev != 1)
                            {
                                if(tppp->fwd_name == NULL)
                                    tppp->fwd_name = StringSave(":");
                                else
                                {
                                    s = (char*) MemNew(StringLen(tppp->fwd_name) + 2);
                                    StringCpy(s, tppp->fwd_name);
                                    StringCat(s, ":");
                                    MemFree(tppp->fwd_name);
                                    tppp->fwd_name = s;
                                }
                            }
                        }
                        prev = 2;
                    }
                }
                else if(StringNCmp(p, "rev_name", 8) == 0)
                {
                    if(prev == 3 || prev == 1)
                        prev = -2;      /* previous name has no sequence */
                    else
                    {
                        if(tppp->rev_name == NULL)
                            tppp->rev_name = StringSave(q);
                        else
                        {
                            s = (char*) MemNew(StringLen(tppp->rev_name) +
                                                 StringLen(q) + 2);
                            StringCpy(s, tppp->rev_name);
                            StringCat(s, ":");
                            StringCat(s, q);
                            MemFree(tppp->rev_name);
                            tppp->rev_name = s;
                        }
                        prev = 3;
                    }
                }
                else
                {
                    /* Remaining tag is rev_seq. */
                    if(prev == 1)
                        prev = -2;      /* fwd_name has no fwd_seq */
                    else
                    {
                        if(tppp->rev_seq == NULL)
                            tppp->rev_seq = StringSave(q);
                        else
                        {
                            s = (char*) MemNew(StringLen(tppp->rev_seq) +
                                                 StringLen(q) + 2);
                            StringCpy(s, tppp->rev_seq);
                            StringCat(s, ":");
                            StringCat(s, q);
                            MemFree(tppp->rev_seq);
                            tppp->rev_seq = s;
                            /* Same bare ':' placeholder as for fwd_name,
                               applied to rev_name. */
                            if(prev != 3)
                            {
                                if(tppp->rev_name == NULL)
                                    tppp->rev_name = StringSave(":");
                                else
                                {
                                    s = (char*) MemNew(StringLen(tppp->rev_name) + 2);
                                    StringCpy(s, tppp->rev_name);
                                    StringCat(s, ":");
                                    MemFree(tppp->rev_name);
                                    tppp->rev_name = s;
                                }
                            }
                        }
                        prev = 4;
                    }
                }

                /* That was the last field of the qualifier. */
                if(r == NULL)
                    break;

                /* Restore the character overwritten with '\0' above. */
                *r++ = ch;

                /* Stop scanning this qualifier on the first problem. */
                if(comma || prev < 0 || empty)
                    break;

                /* Skip the separator (and surrounding blanks) so that p
                   lands on the next tag. */
                if(ch == ' ')
                    r++;
                if(*r == ' ')
                    r++;
                p = r;
            }

            /* The qualifier ended on a name that has no sequence. */
            if(prev == 1 || prev == 3)
                prev = -2;

            if(bad_start)
            {
                ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidPCRprimer,
                          "Unknown text found at the beginning of /PCR_primers qualifier: \"%s\". Entry dropped.",
                          &val_buf[0]);
                got_problem = true;
                break;
            }

            if(comma)
            {
                ErrPostEx(SEV_REJECT, ERR_QUALIFIER_PCRprimerEmbeddedComma,
                          "Encountered embedded comma within /PCR_primers qualifier's field value: \"%s\". Entry dropped.",
                          &val_buf[0]);
                got_problem = true;
                break;
            }

            if(prev == -1)
            {
                ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidPCRprimer,
                          "Encountered incorrect order of \"forward\" and \"reversed\" sequences within /PCR_primers qualifier: \"%s\". Entry dropped.",
                          &val_buf[0]);
                got_problem = true;
                break;
            }

            if(prev == -2)
            {
                ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MissingPCRprimerSeq,
                          "/PCR_primers qualifier \"%s\" is missing or has an empty required fwd_seq or rev_seq fields (or both). Entry dropped.",
                          &val_buf[0]);
                got_problem = true;
                break;
            }

            if(empty)
            {
                ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidPCRprimer,
                          "/PCR_primers qualifier \"%s\" has an empty field value. Entry dropped.",
                          &val_buf[0]);
                got_problem = true;
                break;
            }

            /* Both sequences are mandatory for every qualifier. */
            if(tppp->fwd_seq == NULL || tppp->fwd_seq[0] == '\0' ||
               tppp->rev_seq == NULL || tppp->rev_seq[0] == '\0')
            {
                ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MissingPCRprimerSeq,
                          "/PCR_primers qualifier \"%s\" is missing or has an empty required fwd_seq or rev_seq fields (or both). Entry dropped.",
                          &val_buf[0]);
                got_problem = true;
                break;
            }
        }

        /* Leaving the loop here keeps sfbp non-NULL, which is what makes
           the function return false below. */
        if (got_problem)
        {
            PcrPrimersFree(ppp);
            break;
        }

        /* Hand the parsed sets to the BioSource, then start a new list
           for the next source feature. */
        PopulatePcrPrimers(*sfbp->bio_src, ppp, count);
        PcrPrimersFree(ppp);
        ppp = NULL;
    }

    /* sfbp is NULL only if every feature was processed without errors. */
    if(sfbp == NULL)
        return true;
    return false;
}
3141 
3142 /**********************************************************/
CheckCollectionDate(SourceFeatBlkPtr sfbp,Parser::ESource source)3143 static void CheckCollectionDate(SourceFeatBlkPtr sfbp, Parser::ESource source)
3144 {
3145     const char *Mmm[] = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul",
3146                          "Aug", "Sep", "Oct", "Nov", "Dec", NULL};
3147     const char **b;
3148     const char *q;
3149 
3150     char*    p;
3151     char*    r;
3152     char*    val;
3153     Int4       year;
3154     Int4       month;
3155     Int4       day;
3156     Int4       bad;
3157     Int4       num_slash;
3158     Int4       num_T;
3159     Int4       num_colon;
3160     Int4       num_Z;
3161     Int4       len;
3162 
3163     CTime time(CTime::eCurrent);
3164     objects::CDate_std date(time);
3165 
3166     for(; sfbp != NULL; sfbp = sfbp->next)
3167     {
3168         if (sfbp->quals.empty() || sfbp->bio_src.Empty())
3169             continue;
3170 
3171         ITERATE(TQualVector, cur, sfbp->quals)
3172         {
3173             bad = 0;
3174             if ((*cur)->GetQual() != "collection_date" ||
3175                 !(*cur)->IsSetVal() || (*cur)->GetVal().empty())
3176                 continue;
3177 
3178             val = (char *) (*cur)->GetVal().c_str();
3179             for(num_slash = 0, p = val; *p != '\0'; p++)
3180                 if(*p == '/')
3181                     num_slash++;
3182 
3183             if(num_slash > 1)
3184             {
3185                 p = StringSave(sfbp->location);
3186                 if(p != NULL && StringLen(p) > 50)
3187                     p[50] = '\0';
3188                 ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidCollectionDate,
3189                           "/collection_date \"%s\" for source feature at \"%s\" has too many components.",
3190                           val, (p == NULL) ? "unknown location" : p);
3191                 if(p != NULL)
3192                     MemFree(p);
3193                 continue;
3194             }
3195 
3196             for(val = (char *) (*cur)->GetVal().c_str();;)
3197             {
3198                 r = StringChr(val, '/');
3199                 if(r != NULL)
3200                     *r = '\0';
3201 
3202                 len = StringLen(val);
3203 
3204                 if(len == 4)
3205                 {
3206                     for(q = val; *q == '0';)
3207                         q++;
3208                     for(p = (char*) q; *p != '\0'; p++)
3209                         if(*p < '0' || *p > '9')
3210                             break;
3211                     if(*p != '\0')
3212                         bad = 1;
3213                     else if (atoi(q) > date.GetYear())
3214                         bad = 3;
3215                 }
3216                 else if(len == 8)
3217                 {
3218                     if(val[3] != '-')
3219                         bad = 1;
3220                     else
3221                     {
3222                         p = val;
3223                         p[3] = '\0';
3224                         if(source == Parser::ESource::DDBJ)
3225                         {
3226                             if(p[0] >= 'a' && p[0] <= 'z')
3227                                 p[0] &= ~040;
3228                             if(p[1] >= 'A' && p[1] <= 'Z')
3229                                 p[1] |= 040;
3230                             if(p[2] >= 'A' && p[2] <= 'Z')
3231                                 p[2] |= 040;
3232                         }
3233                         for(b = Mmm, month = 1; *b != NULL; b++, month++)
3234                             if(StringCmp(*b, p) == 0)
3235                                 break;
3236                         if(*b == NULL)
3237                             bad = 1;
3238                         p[3] = '-';
3239                     }
3240                     if(bad == 0)
3241                     {
3242                         for(q = val + 4; *q == '0';)
3243                             q++;
3244                         for(p = (char*) q; *p != '\0'; p++)
3245                             if(*p < '0' || *p > '9')
3246                                 break;
3247                         if(*p != '\0')
3248                             bad = 1;
3249                         else
3250                         {
3251                             year = atoi(q);
3252                             if(year > date.GetYear() ||
3253                                (year == date.GetYear() && month > date.GetMonth()))
3254                                 bad = 3;
3255                         }
3256                     }
3257                 }
3258                 else if(len == 11)
3259                 {
3260                     if(val[2] != '-' || val[6] != '-')
3261                         bad = 1;
3262                     else
3263                     {
3264                         p = val;
3265                         val[2] = '\0';
3266                         val[6] = '\0';
3267                         if(p[0] < '0' || p[0] > '3' || p[1] < '0' || p[1] > '9')
3268                             bad = 1;
3269                         else
3270                         {
3271                             if(*p == '0')
3272                                 p++;
3273                             day = atoi(p);
3274                             p = val + 3;
3275                             if(source == Parser::ESource::DDBJ)
3276                             {
3277                                 if(p[0] >= 'a' && p[0] <= 'z')
3278                                     p[0] &= ~040;
3279                                 if(p[1] >= 'A' && p[1] <= 'Z')
3280                                     p[1] |= 040;
3281                                 if(p[2] >= 'A' && p[2] <= 'Z')
3282                                     p[2] |= 040;
3283                             }
3284                             for(b = Mmm, month = 1; *b != NULL; b++, month++)
3285                                 if(StringCmp(*b, p) == 0)
3286                                     break;
3287                             if(*b == NULL)
3288                                 bad = 1;
3289                             else
3290                             {
3291                                 if(day < 1 || day > 31)
3292                                     bad = 2;
3293                                 else if(month == 2 && day > 29)
3294                                     bad = 2;
3295                                 else if((month == 4 || month == 6 || month == 9 || month == 11) && day > 30)
3296                                     bad = 2;
3297                             }
3298                         }
3299                         if(bad == 0)
3300                         {
3301                             for(q = val + 7; *q == '0';)
3302                                 q++;
3303                             for(p = (char*) q; *p != '\0'; p++)
3304                                 if(*p < '0' || *p > '9')
3305                                     break;
3306                             if(*p != '\0')
3307                                 bad = 1;
3308                             else
3309                             {
3310                                 year = atoi(q) - 1900;
3311                                 if(year > date.GetYear() ||
3312                                    (year == date.GetYear() && month > date.GetMonth()) ||
3313                                    (year == date.GetYear() && month == date.GetMonth() && day > date.GetDay()))
3314                                     bad = 3;
3315                             }
3316                         }
3317                         val[2] = '-';
3318                         val[6] = '-';
3319                     }
3320                 }
3321                 else if(len == 7 || len == 10 || len == 14 || len == 17 ||
3322                         len == 20)
3323                 {
3324                     num_T = 0;
3325                     num_Z = 0;
3326                     num_colon = 0;
3327                     for(p = val; *p != '\0'; p++)
3328                     {
3329                         if((*p < 'a' || *p > 'z') && (*p < 'A' || *p > 'Z') &&
3330                            (*p < '0' || *p > '9') && *p != '-' && *p != '/' &&
3331                            *p != ':')
3332                         {
3333                             bad = 3;
3334                             break;
3335                         }
3336                         if(*p == ':')
3337                             num_colon++;
3338                         else if(*p == 'T')
3339                             num_T++;
3340                         else if(*p == 'Z')
3341                             num_Z++;
3342                     }
3343                     if(len == 7 || len == 10)
3344                     {
3345                         if(num_T > 0)
3346                             bad = 4;
3347                         if(num_Z > 0)
3348                             bad = 5;
3349                         if(num_colon > 0)
3350                             bad = 6;
3351                     }
3352                     else
3353                     {
3354                         if(num_Z > 1)
3355                             bad = 5;
3356                         if(num_T > 1)
3357                             bad = 4;
3358                         if((len == 14 && num_colon > 0) ||
3359                            (len == 17 && num_colon > 1) ||
3360                            (len == 20 && num_colon > 2))
3361                             bad = 6;
3362                     }
3363                 }
3364                 else
3365                     bad = 8;
3366 
3367                 if(bad == 0)
3368                 {
3369                     if(r == NULL)
3370                         break;
3371 
3372                     *r = '/';
3373                     val = r + 1;
3374                     continue;
3375                 }
3376 
3377                 p = StringSave(sfbp->location);
3378                 if(p != NULL && StringLen(p) > 50)
3379                     p[50] = '\0';
3380                 if(bad == 1)
3381                     q = "is not of the format DD-Mmm-YYYY, Mmm-YYYY, or YYYY";
3382                 else if(bad == 2)
3383                     q = "has an illegal day value for the stated month";
3384                 else if(bad == 3)
3385                     q = "has invalid characters";
3386                 else if(bad == 4)
3387                     q = "has too many time values";
3388                 else if(bad == 5)
3389                     q = "has too many Zulu indicators";
3390                 else if(bad == 6)
3391                     q = "has too many hour and minute delimiters";
3392                 else
3393                     q = "has not yet occured";
3394                 ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidCollectionDate,
3395                           "/collection_date \"%s\" for source feature at \"%s\" %s.",
3396                           val, (p == NULL) ? "unknown location" : p, q);
3397                 if(p != NULL)
3398                     MemFree(p);
3399 
3400                 if(r == NULL)
3401                     break;
3402 
3403                 *r = '/';
3404                 val = r + 1;
3405             }
3406         }
3407     }
3408 }
3409 
3410 /**********************************************************/
CheckNeedSYNFocus(SourceFeatBlkPtr sfbp)3411 static bool CheckNeedSYNFocus(SourceFeatBlkPtr sfbp)
3412 {
3413     const char **b;
3414 
3415     if(sfbp == NULL || sfbp->next == NULL)
3416         return false;
3417 
3418     for(; sfbp != NULL; sfbp = sfbp->next)
3419     {
3420         if(!sfbp->full)
3421             continue;
3422 
3423         for(b = special_orgs; *b != NULL; b++)
3424             if(StringICmp(*b, sfbp->name) == 0)
3425                 break;
3426 
3427         if(*b != NULL)
3428             break;
3429     }
3430 
3431     if(sfbp != NULL)
3432         return false;
3433     return true;
3434 }
3435 
3436 /**********************************************************/
CheckMetagenome(objects::CBioSource & bio)3437 static void CheckMetagenome(objects::CBioSource& bio)
3438 {
3439     if (!bio.IsSetOrg())
3440         return;
3441 
3442     bool metatax = false;
3443     bool metalin = false;
3444 
3445     if (bio.IsSetOrgname() && bio.GetOrgname().IsSetLineage() &&
3446         StringStr(bio.GetOrgname().GetLineage().c_str(), "metagenomes") != NULL)
3447         metalin = true;
3448 
3449     if (bio.GetOrg().IsSetTaxname() &&
3450         StringStr(bio.GetOrg().GetTaxname().c_str(), "metagenome") != NULL)
3451         metatax = true;
3452 
3453     if(!metalin && !metatax)
3454         return;
3455 
3456     const Char* taxname = bio.GetOrg().IsSetTaxname() ? bio.GetOrg().GetTaxname().c_str() : NULL;
3457     if (taxname == NULL || taxname[0] == 0)
3458         taxname = "unknown";
3459 
3460     if (metalin && metatax)
3461     {
3462         CRef<objects::CSubSource> sub(new objects::CSubSource);
3463         sub->SetSubtype(37);
3464         sub->SetName("");
3465         bio.SetSubtype().push_back(sub);
3466     }
3467     else if(!metalin)
3468         ErrPostEx(SEV_ERROR, ERR_ORGANISM_LineageLacksMetagenome,
3469                   "Organism name \"%s\" contains \"metagenome\" but the lineage lacks the \"metagenomes\" classification.",
3470                   taxname);
3471     else
3472         ErrPostEx(SEV_ERROR, ERR_ORGANISM_OrgNameLacksMetagenome,
3473                   "Lineage includes the \"metagenomes\" classification but organism name \"%s\" lacks \"metagenome\".",
3474                   taxname);
3475 }
3476 
3477 /**********************************************************/
CheckSubmitterSeqidQuals(SourceFeatBlkPtr sfbp,char * acc)3478 static bool CheckSubmitterSeqidQuals(SourceFeatBlkPtr sfbp, char* acc)
3479 {
3480     SourceFeatBlkPtr tsfbp;
3481     char*          ssid;
3482     Int4             count_feat;
3483     Int4             count_qual;
3484 
3485     if(sfbp == NULL)
3486         return(true);
3487 
3488     count_feat = 0;
3489     count_qual = 0;
3490     for(ssid = NULL, tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
3491     {
3492         count_feat++;
3493         if(tsfbp->submitter_seqid == NULL)
3494             continue;
3495 
3496         count_qual++;
3497         if(tsfbp->submitter_seqid[0] == '\0')
3498         {
3499             ErrPostEx(SEV_REJECT, ERR_SOURCE_MultipleSubmitterSeqids,
3500                       "Multiple /submitter_seqid qualifiers were encountered within source feature at location \"%s\". Entry dropped.",
3501                       (tsfbp->location == NULL) ? "?empty?" : tsfbp->location);
3502             break;
3503         }
3504 
3505         if(ssid == NULL)
3506             ssid = tsfbp->submitter_seqid;
3507         else if(StringCmp(ssid, tsfbp->submitter_seqid) != 0)
3508         {
3509             ErrPostEx(SEV_REJECT, ERR_SOURCE_DifferentSubmitterSeqids,
3510                       "Different /submitter_seqid qualifiers were encountered amongst source features: \"%s\" and \"%s\" at least. Entry dropped.",
3511                       ssid, tsfbp->submitter_seqid);
3512             break;
3513         }
3514     }
3515 
3516     if(tsfbp != NULL)
3517         return(false);
3518 
3519     if(count_feat == count_qual)
3520         return(true);
3521 
3522     ErrPostEx(SEV_REJECT, ERR_SOURCE_LackingSubmitterSeqids,
3523               "One ore more source features are lacking /submitter_seqid qualifiers provided in others. Entry dropped.");
3524     return(false);
3525 }
3526 
3527 /**********************************************************/
ParseSourceFeat(ParserPtr pp,DataBlkPtr dbp,TSeqIdList & seqids,Int2 type,objects::CBioseq & bioseq,TSeqFeatList & seq_feats)3528 void ParseSourceFeat(ParserPtr pp, DataBlkPtr dbp, TSeqIdList& seqids,
3529                      Int2 type, objects::CBioseq& bioseq, TSeqFeatList& seq_feats)
3530 {
3531     SourceFeatBlkPtr sfbp;
3532     SourceFeatBlkPtr tsfbp;
3533 
3534     MinMaxPtr        mmp;
3535     IndexblkPtr      ibp;
3536     char*          res;
3537     char*          acc;
3538     char*          p;
3539     Int4             i;
3540     Int4             use_what = USE_ALL;
3541     bool             err;
3542     ErrSev           sev;
3543     bool             need_focus;
3544     bool             already;
3545 
3546     ibp = pp->entrylist[pp->curindx];
3547     acc = ibp->acnum;
3548     size_t len = ibp->bases;
3549 
3550     if(ibp->segnum < 2)
3551         pp->errstat = 0;
3552 
3553     sfbp = CollectSourceFeats(dbp, type);
3554     if(sfbp == NULL)
3555     {
3556         ErrPostEx(SEV_REJECT, ERR_SOURCE_FeatureMissing,
3557                   "Required source feature is missing. Entry dropped.");
3558         return;
3559     }
3560 
3561     RemoveSourceFeatSpaces(sfbp);
3562     CheckForExemption(sfbp);
3563 
3564     if(!CheckSourceFeatLocFuzz(sfbp))
3565     {
3566         SourceFeatBlkSetFree(sfbp);
3567         return;
3568     }
3569 
3570     res = CheckSourceFeatLocAccs(sfbp, acc);
3571     if(res != NULL)
3572     {
3573         ErrPostEx(SEV_REJECT, ERR_SOURCE_BadLocation,
3574                   "Source feature location points to another record: \"%s\". Entry dropped.",
3575                   res);
3576         SourceFeatBlkSetFree(sfbp);
3577         return;
3578     }
3579 
3580     if(!SourceFeatStructFillIn(ibp, sfbp, use_what))
3581     {
3582         ErrPostEx(SEV_REJECT, ERR_SOURCE_MultipleMolTypes,
3583                   "Multiple /mol_type qualifiers were encountered within source feature. Entry dropped.");
3584         SourceFeatBlkSetFree(sfbp);
3585         return;
3586     }
3587 
3588     if(ibp->submitter_seqid && !CheckSubmitterSeqidQuals(sfbp, acc))
3589     {
3590         MemFree(ibp->submitter_seqid);
3591         ibp->submitter_seqid = NULL;
3592         SourceFeatBlkSetFree(sfbp);
3593         return;
3594     }
3595 
3596     if(!CheckMoltypeConsistency(sfbp, &ibp->moltype))
3597     {
3598         ErrPostEx(SEV_REJECT, ERR_SOURCE_InconsistentMolType,
3599                   "Inconsistent /mol_type qualifiers were encountered. Entry dropped.");
3600         SourceFeatBlkSetFree(sfbp);
3601         return;
3602     }
3603 
3604     res = CheckSourceFeatFocusAndTransposon(sfbp);
3605     if(res != NULL)
3606     {
3607         ErrPostEx(SEV_REJECT, ERR_SOURCE_FocusAndTransposonNotAllowed,
3608                   "/transposon (or /insertion_seq) qualifiers should not be used in conjunction with /focus. Source feature at \"%s\". Entry dropped.",
3609                   res);
3610         SourceFeatBlkSetFree(sfbp);
3611         return;
3612     }
3613 
3614     res = CheckSourceFeatOrgs(sfbp, &i);
3615     if(res != NULL)
3616     {
3617         if(i == 1)
3618         {
3619             ErrPostEx(SEV_REJECT, ERR_SOURCE_NoOrganismQual,
3620                       "/organism qualifier contains only organell/genome name. No genus/species present. Source feature at \"%s\". Entry dropped.",
3621                       res);
3622         }
3623         else
3624         {
3625             ErrPostEx(SEV_REJECT, ERR_SOURCE_OrganismIncomplete,
3626                       "Required /organism qualifier is containing genome info only at \"%s\". Entry dropped.",
3627                       res);
3628         }
3629         SourceFeatBlkSetFree(sfbp);
3630         return;
3631     }
3632 
3633     CompareDescrFeatSources(sfbp, bioseq);
3634 
3635     CreateRawBioSources(pp, sfbp, use_what);
3636 
3637     if(!CheckSourceLineage(sfbp, pp->source, ibp->is_pat))
3638     {
3639         SourceFeatBlkSetFree(sfbp);
3640         return;
3641     }
3642 
3643     PropogateSuppliedLineage(bioseq, sfbp, pp->taxserver);
3644 
3645     mmp = (MinMaxPtr) MemNew(sizeof(MinMax));
3646     mmp->orgname = NULL;
3647     mmp->min = 0;
3648     mmp->max = 0;
3649     mmp->skip = false;
3650     i = CheckSourceFeatCoverage(sfbp, mmp, len);
3651     if(i != 0)
3652     {
3653         if(i == 1)
3654         {
3655             ErrPostEx(SEV_REJECT, ERR_SOURCE_IncompleteCoverage,
3656                       "Supplied source features do not span every base of the sequence. Entry dropped.");
3657         }
3658         else
3659         {
3660             ErrPostEx(SEV_REJECT, ERR_SOURCE_ExcessCoverage,
3661                       "Sequence is spanned by too many source features. Entry dropped.");
3662         }
3663         SourceFeatBlkSetFree(sfbp);
3664         MinMaxFree(mmp);
3665         return;
3666     }
3667 
3668     if(!CheckForENV(sfbp, ibp, pp->source))
3669     {
3670         SourceFeatBlkSetFree(sfbp);
3671         MinMaxFree(mmp);
3672         return;
3673     }
3674 
3675     if(!CheckSYNTGNDivision(sfbp, ibp->division))
3676     {
3677         SourceFeatBlkSetFree(sfbp);
3678         MinMaxFree(mmp);
3679         return;
3680     }
3681 
3682     if(pp->source == Parser::ESource::EMBL)
3683         need_focus = CheckNeedSYNFocus(sfbp);
3684     else
3685         need_focus = true;
3686 
3687     already = false;
3688     i = CheckTransgenicSourceFeats(sfbp);
3689     if(i == 5)
3690     {
3691         if(pp->source == Parser::ESource::DDBJ || pp->source == Parser::ESource::EMBL)
3692             sev = SEV_WARNING;
3693         else
3694             sev = SEV_ERROR;
3695         ErrPostEx(sev, ERR_SOURCE_TransSingleOrgName,
3696                   "Use of /transgenic requires at least two source features with differences among /organism, /strain, /organelle, and /isolate, between the host and foreign organisms.");
3697     }
3698     else if(i > 0)
3699     {
3700         sev = SEV_REJECT;
3701         if(i == 1)
3702         {
3703             ErrPostEx(sev, ERR_SOURCE_TransgenicTooShort,
3704                       "Source feature with /transgenic qualifier does not span the entire sequence. Entry dropped.");
3705         }
3706         else if(i == 2)
3707         {
3708             ErrPostEx(sev, ERR_SOURCE_FocusAndTransgenicQuals,
3709                       "Both /focus and /transgenic qualifiers exist; these quals are mutually exclusive. Entry dropped.");
3710         }
3711         else if(i == 3)
3712         {
3713             ErrPostEx(sev, ERR_SOURCE_MultipleTransgenicQuals,
3714                       "Multiple source features have /transgenic qualifiers. Entry dropped.");
3715         }
3716         else
3717         {
3718             already = true;
3719             if(!need_focus)
3720                 sev = SEV_ERROR;
3721             ErrPostEx(sev, ERR_SOURCE_FocusQualMissing,
3722                       "Multiple organism names exist, but no source feature has a /focus qualifier.%s",
3723                       (sev == SEV_ERROR) ? "" : " Entry dropped.");
3724         }
3725 
3726         if(sev == SEV_REJECT)
3727         {
3728             SourceFeatBlkSetFree(sfbp);
3729             MinMaxFree(mmp);
3730             return;
3731         }
3732     }
3733 
3734     res = CheckWholeSourcesVersusFocused(sfbp);
3735     if(res != NULL)
3736     {
3737         ErrPostEx(SEV_REJECT, ERR_SOURCE_FocusQualNotFullLength,
3738                   "/focus qualifier should be used for the full-length source feature, not on source feature at \"%s\".",
3739                   res);
3740         SourceFeatBlkSetFree(sfbp);
3741         MinMaxFree(mmp);
3742         return;
3743     }
3744     i = CheckFocusInOrgs(sfbp, len, &pp->errstat);
3745     if(pp->errstat != 0 && (ibp->segnum == 0 || pp->errstat == ibp->segtotal))
3746         i = 1;
3747     if(i > 0)
3748     {
3749         sev = SEV_REJECT;
3750         if(i == 1)
3751         {
3752             ErrPostEx(sev, ERR_SOURCE_FocusQualNotNeeded,
3753                       "/focus qualifier present, but only one organism name exists. Entry dropped.");
3754         }
3755         else if(i == 2)
3756         {
3757             ErrPostEx(sev, ERR_SOURCE_MultipleOrganismWithFocus,
3758                       "/focus qualifiers exist on source features with differing organism names. Entry dropped.");
3759         }
3760         else
3761         {
3762             if(!need_focus)
3763                 sev = SEV_ERROR;
3764             if(!already)
3765                 ErrPostEx(sev, ERR_SOURCE_FocusQualMissing,
3766                           "Multiple organism names exist, but no source feature has a /focus qualifier.%s",
3767                           (sev == SEV_ERROR) ? "" : " Entry dropped.");
3768         }
3769 
3770         if(sev == SEV_REJECT)
3771         {
3772             SourceFeatBlkSetFree(sfbp);
3773             MinMaxFree(mmp);
3774             return;
3775         }
3776     }
3777     res = CheckSourceOverlap(mmp->next, len);
3778     MinMaxFree(mmp);
3779     if(res != NULL)
3780     {
3781         ErrPostEx(SEV_REJECT, ERR_SOURCE_MultiOrgOverlap,
3782                   "Overlapping source features have different organism names %s. Entry dropped.",
3783                   res);
3784         SourceFeatBlkSetFree(sfbp);
3785         MemFree(res);
3786         return;
3787     }
3788 
3789     res = CheckForUnusualFullLengthOrgs(sfbp);
3790     if(res != NULL)
3791     {
3792         ErrPostEx(SEV_WARNING, ERR_SOURCE_UnusualOrgName,
3793                   "Unusual organism name \"%s\" encountered for full-length source feature.",
3794                   res);
3795     }
3796 
3797     for(tsfbp = sfbp, i = 0; tsfbp != NULL; tsfbp = tsfbp->next)
3798         i++;
3799     if(i > BIOSOURCES_THRESHOLD)
3800     {
3801         ErrPostEx(SEV_WARNING, ERR_SOURCE_ManySourceFeats,
3802                   "This record has more than %d source features.",
3803                   BIOSOURCES_THRESHOLD);
3804     }
3805 
3806     if(!ParsePcrPrimers(sfbp))
3807     {
3808         SourceFeatBlkSetFree(sfbp);
3809         return;
3810     }
3811 
3812     CheckCollectionDate(sfbp, pp->source);
3813 
3814     sfbp = PickTheDescrSource(sfbp);
3815     if(sfbp == NULL || !UpdateRawBioSource(sfbp, pp->source, ibp, pp->taxserver))
3816     {
3817         SourceFeatBlkSetFree(sfbp);
3818         return;
3819     }
3820 
3821     if (sfbp->focus)
3822         sfbp->bio_src->SetIs_focus();
3823     else
3824         sfbp->bio_src->ResetIs_focus();
3825 
3826 
3827     for (tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
3828     {
3829         CheckMetagenome(*tsfbp->bio_src);
3830 
3831         CRef<objects::CSeq_feat> feat(new objects::CSeq_feat);
3832         feat->SetData().SetBiosrc(*tsfbp->bio_src);
3833 
3834         if(pp->buf != NULL)
3835             MemFree(pp->buf);
3836         pp->buf = NULL;
3837 
3838         GetSeqLocation(*feat, tsfbp->location, seqids, &err,
3839                        pp, (char*) "source");
3840 
3841         if(err)
3842         {
3843             ErrPostEx(SEV_ERROR, ERR_FEATURE_Dropped,
3844                       "/source|%s| range check detects problems. Entry dropped.",
3845                       tsfbp->location);
3846             break;
3847         }
3848 
3849         if (!tsfbp->quals.empty())
3850         {
3851             p = GetTheQualValue(tsfbp->quals, "evidence");
3852             if(p != NULL)
3853             {
3854                 if(StringICmp(p, "experimental") == 0)
3855                     feat->SetExp_ev(objects::CSeq_feat::eExp_ev_experimental);
3856                 else if(StringICmp(p, "not_experimental") == 0)
3857                     feat->SetExp_ev(objects::CSeq_feat::eExp_ev_not_experimental);
3858                 MemFree(p);
3859             }
3860         }
3861 
3862         seq_feats.push_back(feat);
3863     }
3864 
3865     SourceFeatBlkSetFree(sfbp);
3866 
3867     if(tsfbp != NULL)
3868         seq_feats.clear();
3869 }
3870 
3871 END_NCBI_SCOPE
3872