1 /* fta_src.c
2 *
3 * ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================
26 *
27 * File Name: fta_src.c
28 *
29 * Author: Sergey Bazhin
30 *
31 * File Description:
32 * -----------------
33 * Messes about source features.
34 */
35 #include <ncbi_pch.hpp>
36
37 #include "ftacpp.hpp"
38
39 #include <objects/seqfeat/Gb_qual.hpp>
40 #include <objects/seq/Seq_annot.hpp>
41 #include <objects/seq/Seq_annot_.hpp>
42 #include <objects/seqfeat/Org_ref.hpp>
43 #include <objects/seqfeat/OrgName.hpp>
44 #include <objects/seqfeat/OrgMod.hpp>
45 #include <objects/general/Dbtag.hpp>
46 #include <objects/general/Object_id.hpp>
47 #include <objects/seqfeat/SubSource.hpp>
48 #include <objects/seqfeat/BioSource.hpp>
49 #include <objects/seq/Seq_descr.hpp>
50
51 #include "index.h"
52
53 #include <objtools/flatfile/flatdefn.h>
54 #include "ftanet.h"
55
56 #include "ftaerr.hpp"
57 #include "asci_blk.h"
58 #include "loadfeat.h"
59 #include "utilfeat.h"
60 #include "add.h"
61 #include "utilfun.h"
62
63 #ifdef THIS_FILE
64 # undef THIS_FILE
65 #endif
66 #define THIS_FILE "fta_src.cpp"
67
68 BEGIN_NCBI_SCOPE
69
70
71 typedef struct {
72 const char *name;
73 Uint1 num;
74 } CharUInt1;
75
/*
 * Bit flags for the "use_what" argument of CollectSubNames(): each flag
 * enables appending the matching qualifier value to the organism name.
 * USE_ALL is the union of all ten flags.
 */
#define USE_CULTIVAR          0x001
#define USE_ISOLATE           0x002
#define USE_SEROTYPE          0x004
#define USE_SEROVAR           0x008
#define USE_SPECIMEN_VOUCHER  0x010
#define USE_STRAIN            0x020
#define USE_SUB_SPECIES       0x040
#define USE_SUB_STRAIN        0x080
#define USE_VARIETY           0x100
#define USE_ECOTYPE           0x200
#define USE_ALL               0x3FF

/* NOTE(review): not referenced in the visible part of this file;
 * presumably a limit on the number of source features - confirm at use. */
#define BIOSOURCES_THRESHOLD  20
89
/* One node of a singly linked list of PCR primer name/sequence pairs. */
typedef struct _pcr_primers {
    char*                fwd_name;   /* forward primer name */
    char*                fwd_seq;    /* forward primer sequence */
    char*                rev_name;   /* reverse primer name */
    char*                rev_seq;    /* reverse primer sequence */
    struct _pcr_primers* next;       /* next node, or NULL */
} PcrPrimers, *PcrPrimersPtr;
97
98 typedef struct _source_feat_blk {
99 char* name;
100 char* strain;
101 char* organelle;
102 char* isolate;
103 char* namstr;
104 char* location;
105 char* moltype;
106 char* genomename;
107 char* submitter_seqid;
108
109 TQualVector quals;
110 CRef<objects::CBioSource> bio_src;
111 CRef<objects::COrgName> orgname;
112
113 bool full;
114 bool focus;
115 bool tg;
116 bool lookup;
117 bool skip;
118 bool useit;
119
120 Uint1 genome;
121 struct _source_feat_blk* next;
122
_source_feat_blk_source_feat_blk123 _source_feat_blk() :
124 name(NULL),
125 strain(NULL),
126 organelle(NULL),
127 isolate(NULL),
128 namstr(NULL),
129 location(NULL),
130 moltype(NULL),
131 genomename(NULL),
132 submitter_seqid(NULL),
133 full(false),
134 focus(false),
135 tg(false),
136 lookup(false),
137 skip(false),
138 useit(false),
139 genome(0),
140 next(NULL)
141 {}
142
143 } SourceFeatBlk, *SourceFeatBlkPtr;
144
145 typedef struct _min_max {
146 char* orgname; /* Do not free! It's just a pointer */
147 Int4 min;
148 Int4 max;
149 bool skip;
150 struct _min_max* next;
151 } MinMax, *MinMaxPtr;
152
/*
 * NULL-terminated lists of /db_xref database tags for source features.
 * The lists are grouped by the data sources named in each comment.
 */

/* Tags considered obsolete. */
static const char *ObsoleteSourceDbxrefTag[] = {
    "IFO",
    NULL
};

/* DENLR = DDBJ + EMBL + NCBI + LANL + RefSeq */
static const char *DENLRSourceDbxrefTag[] = {
    "AFTOL",
    "ANTWEB",
    "ATCC",
    "ATCC(DNA)",
    "ATCC(IN HOST)",
    "BEI",
    "BOLD",
    "FBOL",
    "FUNGORUM",
    "GREENGENES",
    "GRIN",
    "HMP",
    "HOMD",
    "IKMC",
    "ISHAM-ITS",
    "JCM",
    "NBRC",
    "RBGE_GARDEN",
    "RBGE_HERBARIUM",
    "RZPD",
    "UNILIB",
    NULL
};

/* DE = DDBJ + EMBL */
static const char *DESourceDbxrefTag[] = {
    "FANTOM_DB",
    "IMGT/HLA",
    "IMGT/LIGM",
    "MGD",
    "MGI",
    NULL
};

/* E = EMBL */
static const char *ESourceDbxrefTag[] = {
    "UNITE",
    NULL
};

/* NLR = NCBI + LANL + RefSeq */
static const char *NLRSourceDbxrefTag[] = {
    "FLYBASE",
    NULL
};
202
/* Qualifiers whose presence sets SourceFeatBlk::skip (CheckForExemption). */
static const char *exempt_quals[] = {
    "transposon",
    "insertion_seq",
    NULL
};

/* Organism names treated as "special" by fta_if_special_org(). */
static const char *special_orgs[] = {
    "synthetic construct",
    "artificial sequence",
    "eukaryotic synthetic construct",
    NULL
};

/* Location operators reported as unusual by CheckSourceFeatLocFuzz(). */
static const char *unusual_toks[] = {
    "complement",
    NULL
};

/*
 * Prefixes matched (case-insensitively) against the first word of /organism
 * in SourceFeatStructFillIn().  The ORDER MATTERS: that function maps the
 * matching index to a genome code and treats index 8 ("plasmid") specially.
 */
static const char *source_genomes[] = {
    "mitochondr",       /* 0 */
    "chloroplast",      /* 1 */
    "kinetoplas",       /* 2 */
    "cyanelle",         /* 3 */
    "plastid",          /* 4 */
    "chromoplast",      /* 5 */
    "macronuclear",     /* 6 */
    "extrachrom",       /* 7 */
    "plasmid",          /* 8 */
    NULL
};

/* Qualifier names flagged as bad on source features. */
static const char *SourceBadQuals[] = {
    "label",
    "usedin",
    "citation",
    NULL
};
240
/*
 * Source qualifier names, NULL-terminated.  The number in each comment is
 * the entry's 1-based position; empty strings hold unused positions so the
 * numbering stays aligned.
 */
static const char *SourceSubSources[] = {
    "chromosome",               /*  1 */
    "map",                      /*  2 */
    "clone",                    /*  3 */
    "sub_clone",                /*  4 */
    "haplotype",                /*  5 */
    "genotype",                 /*  6 */
    "sex",                      /*  7 */
    "cell_line",                /*  8 */
    "cell_type",                /*  9 */
    "tissue_type",              /* 10 */
    "clone_lib",                /* 11 */
    "dev_stage",                /* 12 */
    "frequency",                /* 13 */
    "germline",                 /* 14 */
    "rearranged",               /* 15 */
    "lab_host",                 /* 16 */
    "pop_variant",              /* 17 */
    "tissue_lib",               /* 18 */
    "plasmid",                  /* 19 */
    "transposon",               /* 20 */
    "insertion_seq",            /* 21 */
    "plastid",                  /* 22 */
    "",                         /* 23 */
    "segment",                  /* 24 */
    "",                         /* 25 */
    "transgenic",               /* 26 */
    "environmental_sample",     /* 27 */
    "isolation_source",         /* 28 */
    "lat_lon",                  /* 29 */
    "collection_date",          /* 30 */
    "collected_by",             /* 31 */
    "identified_by",            /* 32 */
    "",                         /* 33 */
    "",                         /* 34 */
    "",                         /* 35 */
    "",                         /* 36 */
    "metagenomic",              /* 37 */
    "mating_type",              /* 38 */
    NULL
};
282
283 static CharUInt1 SourceOrgMods[] = {
284 {"strain", 2},
285 {"sub_strain", 3},
286 {"variety", 6},
287 {"serotype", 7},
288 {"serovar", 9},
289 {"cultivar", 10},
290 {"isolate", 17},
291 {"specific_host", 21},
292 {"host", 21},
293 {"sub_species", 22},
294 {"specimen_voucher", 23},
295 {"ecotype", 27},
296 {"culture_collection", 35},
297 {"bio_material", 36},
298 {"metagenome_source", 37},
299 {"type_material", 38},
300 {NULL, 0}
301 };
302
/*
 * Genome names indexed by numeric genome code, NULL-terminated.  The indices
 * line up with the codes SourceFeatStructFillIn() stores in
 * SourceFeatBlk::genome (e.g. 2 chloroplast, 5 mitochondrion, 9 plasmid,
 * 12 cyanelle).  Empty strings hold unused codes.
 */
static const char *GenomicSourceFeatQual[] = {
    "unknown",          /*  0 */
    "unknown",          /*  1 */
    "chloroplast",      /*  2 */
    "chromoplast",      /*  3 */
    "kinetoplast",      /*  4 */
    "mitochondrion",    /*  5 */
    "plastid",          /*  6 */
    "macronuclear",     /*  7 */
    "extrachrom",       /*  8 */
    "plasmid",          /*  9 */
    "transposon",       /* 10 */
    "insertion-seq",    /* 11 */
    "cyanelle",         /* 12 */
    "proviral",         /* 13 */
    "virion",           /* 14 */
    "nucleomorph",      /* 15 */
    "apicoplast",       /* 16 */
    "leucoplast",       /* 17 */
    "proplastid",       /* 18 */
    "",                 /* 19 */
    "",                 /* 20 */
    "",                 /* 21 */
    "chromatophore",    /* 22 */
    NULL
};
329
/* Organelle names, NULL-terminated (by its name, the allowed first tokens
 * of an /organelle value - confirm at the point of use). */
static const char *OrganelleFirstToken[] = {
    "chromatophore",
    "hydrogenosome",
    "mitochondrion",
    "nucleomorph",
    "plastid",
    NULL
};
338
339 /**********************************************************/
SourceFeatBlkNew(void)340 static SourceFeatBlkPtr SourceFeatBlkNew(void)
341 {
342 return new SourceFeatBlk;
343 }
344
345 /**********************************************************/
SourceFeatBlkFree(SourceFeatBlkPtr sfbp)346 static void SourceFeatBlkFree(SourceFeatBlkPtr sfbp)
347 {
348 if (sfbp->name != NULL)
349 MemFree(sfbp->name);
350 if(sfbp->strain != NULL)
351 MemFree(sfbp->strain);
352 if(sfbp->organelle != NULL)
353 MemFree(sfbp->organelle);
354 if(sfbp->isolate != NULL)
355 MemFree(sfbp->isolate);
356 if(sfbp->namstr != NULL)
357 MemFree(sfbp->namstr);
358 if(sfbp->location != NULL)
359 MemFree(sfbp->location);
360 if(sfbp->moltype != NULL)
361 MemFree(sfbp->moltype);
362 if(sfbp->genomename != NULL)
363 MemFree(sfbp->genomename);
364
365 delete sfbp;
366 }
367
368 /**********************************************************/
SourceFeatBlkSetFree(SourceFeatBlkPtr sfbp)369 static void SourceFeatBlkSetFree(SourceFeatBlkPtr sfbp)
370 {
371 SourceFeatBlkPtr tsfbp;
372
373 for(tsfbp = sfbp; tsfbp != NULL; tsfbp = sfbp)
374 {
375 sfbp = tsfbp->next;
376 SourceFeatBlkFree(tsfbp);
377 }
378 }
379
380 /**********************************************************/
CollectSourceFeats(DataBlkPtr dbp,Int2 type)381 static SourceFeatBlkPtr CollectSourceFeats(DataBlkPtr dbp, Int2 type)
382 {
383 SourceFeatBlkPtr sfbp;
384 SourceFeatBlkPtr tsfbp;
385 DataBlkPtr tdbp;
386 FeatBlkPtr fbp;
387
388 sfbp = SourceFeatBlkNew();
389 tsfbp = sfbp;
390
391 for(; dbp != NULL; dbp = dbp->next)
392 {
393 if(dbp->type != type)
394 continue;
395 for(tdbp = (DataBlkPtr) dbp->data; tdbp != NULL; tdbp = tdbp->next)
396 {
397 fbp = (FeatBlkPtr) tdbp->data;
398 if(fbp == NULL || fbp->key == NULL ||
399 StringCmp(fbp->key, "source") != 0)
400 continue;
401 tsfbp->next = SourceFeatBlkNew();
402 tsfbp = tsfbp->next;
403 if(fbp->location != NULL)
404 tsfbp->location = StringSave(fbp->location);
405 tsfbp->quals = fbp->quals;
406 }
407 }
408 tsfbp = sfbp->next;
409 delete sfbp;
410 //MemFree(sfbp);
411 return(tsfbp);
412 }
413
414 /**********************************************************/
/*
 * Deletes every blank and tab from `line`, in place.  NULL and empty
 * strings are left untouched.
 */
static void RemoveStringSpaces(char* line)
{
    if (line == NULL || *line == '\0')
        return;

    char* dst = line;
    for (const char* src = line; *src != '\0'; ++src)
    {
        if (*src == ' ' || *src == '\t')
            continue;
        *dst = *src;
        ++dst;
    }
    *dst = '\0';
}
428
429 /**********************************************************/
RemoveSourceFeatSpaces(SourceFeatBlkPtr sfbp)430 static void RemoveSourceFeatSpaces(SourceFeatBlkPtr sfbp)
431 {
432 for(; sfbp != NULL; sfbp = sfbp->next)
433 {
434 RemoveStringSpaces(sfbp->location);
435 NON_CONST_ITERATE(TQualVector, cur, sfbp->quals)
436 {
437 if ((*cur)->IsSetQual())
438 {
439 std::vector<char> buf((*cur)->GetQual().begin(), (*cur)->GetQual().end());
440 buf.push_back(0);
441 ShrinkSpaces(&buf[0]);
442 (*cur)->SetQual(&buf[0]);
443 }
444
445 if ((*cur)->IsSetVal())
446 {
447 std::vector<char> buf((*cur)->GetVal().begin(), (*cur)->GetVal().end());
448 buf.push_back(0);
449 ShrinkSpaces(&buf[0]);
450 (*cur)->SetVal(&buf[0]);
451 }
452 }
453 }
454 }
455
456 /**********************************************************/
CheckForExemption(SourceFeatBlkPtr sfbp)457 static void CheckForExemption(SourceFeatBlkPtr sfbp)
458 {
459 const char **b;
460
461 for(; sfbp != NULL; sfbp = sfbp->next)
462 {
463 ITERATE(TQualVector, cur, sfbp->quals)
464 {
465 for (b = exempt_quals; *b != NULL; b++)
466 {
467 if ((*cur)->GetQual() == *b)
468 break;
469 }
470 if(*b != NULL)
471 {
472 sfbp->skip = true;
473 break;
474 }
475 }
476 }
477 }
478
479 /**********************************************************/
PopulateSubNames(char * namstr,const Char * name,const Char * value,Uint1 subtype,TOrgModList & mods)480 static void PopulateSubNames(char* namstr, const Char *name,
481 const Char* value, Uint1 subtype, TOrgModList& mods)
482 {
483 CRef<objects::COrgMod> mod(new objects::COrgMod);
484
485 StringCat(namstr, name);
486 StringCat(namstr, value);
487 StringCat(namstr, ")");
488
489 mod->SetSubtype(subtype);
490 mod->SetSubname(value);
491
492 mods.push_front(mod);
493 }
494
495 /**********************************************************/
CollectSubNames(SourceFeatBlkPtr sfbp,Int4 use_what,const Char * name,const Char * cultivar,const Char * isolate,const Char * serotype,const Char * serovar,const Char * specimen_voucher,const Char * strain,const Char * sub_species,const Char * sub_strain,const Char * variety,const Char * ecotype)496 static void CollectSubNames(SourceFeatBlkPtr sfbp, Int4 use_what, const Char* name,
497 const Char* cultivar, const Char* isolate,
498 const Char* serotype, const Char* serovar,
499 const Char* specimen_voucher, const Char* strain,
500 const Char* sub_species, const Char* sub_strain,
501 const Char* variety, const Char* ecotype)
502 {
503 if(sfbp == NULL)
504 return;
505
506 if(sfbp->namstr != NULL)
507 MemFree(sfbp->namstr);
508 sfbp->namstr = NULL;
509
510 if (sfbp->orgname.NotEmpty())
511 sfbp->orgname.Reset();
512
513 if(name == NULL)
514 return;
515
516 size_t i = StringLen(name) + 1;
517 size_t j = i;
518 if((use_what & USE_CULTIVAR) == USE_CULTIVAR && cultivar != NULL)
519 i += (StringLen(cultivar) + StringLen("cultivar") + 5);
520 if((use_what & USE_ISOLATE) == USE_ISOLATE && isolate != NULL)
521 i += (StringLen(isolate) + StringLen("isolate") + 5);
522 if((use_what & USE_SEROTYPE) == USE_SEROTYPE && serotype != NULL)
523 i += (StringLen(serotype) + StringLen("serotype") + 5);
524 if((use_what & USE_SEROVAR) == USE_SEROVAR && serovar != NULL)
525 i += (StringLen(serovar) + StringLen("serovar") + 5);
526 if((use_what & USE_SPECIMEN_VOUCHER) == USE_SPECIMEN_VOUCHER &&
527 specimen_voucher != NULL)
528 i += (StringLen(specimen_voucher) + StringLen("specimen_voucher") + 5);
529 if((use_what & USE_STRAIN) == USE_STRAIN && strain != NULL)
530 i += (StringLen(strain) + StringLen("strain") + 5);
531 if((use_what & USE_SUB_SPECIES) == USE_SUB_SPECIES && sub_species != NULL)
532 i += (StringLen(sub_species) + StringLen("sub_species") + 5);
533 if((use_what & USE_SUB_STRAIN) == USE_SUB_STRAIN && sub_strain != NULL)
534 i += (StringLen(sub_strain) + StringLen("sub_strain") + 5);
535 if((use_what & USE_VARIETY) == USE_VARIETY && variety != NULL)
536 i += (StringLen(variety) + StringLen("variety") + 5);
537 if((use_what & USE_ECOTYPE) == USE_ECOTYPE && ecotype != NULL)
538 i += (StringLen(ecotype) + StringLen("ecotype") + 5);
539 sfbp->namstr = (char*) MemNew(i);
540 StringCpy(sfbp->namstr, name);
541 if(i == j)
542 return;
543
544 sfbp->orgname = new objects::COrgName;
545 TOrgModList& mods = sfbp->orgname->SetMod();
546
547 if((use_what & USE_CULTIVAR) == USE_CULTIVAR && cultivar != NULL)
548 PopulateSubNames(sfbp->namstr, " (cultivar ", cultivar, 10, mods);
549 if((use_what & USE_ISOLATE) == USE_ISOLATE && isolate != NULL)
550 PopulateSubNames(sfbp->namstr, " (isolate ", isolate, 17, mods);
551 if((use_what & USE_SEROTYPE) == USE_SEROTYPE && serotype != NULL)
552 PopulateSubNames(sfbp->namstr, " (serotype ", serotype, 7, mods);
553 if((use_what & USE_SEROVAR) == USE_SEROVAR && serovar != NULL)
554 PopulateSubNames(sfbp->namstr, " (serovar ", serovar, 9, mods);
555 if((use_what & USE_SPECIMEN_VOUCHER) == USE_SPECIMEN_VOUCHER &&
556 specimen_voucher != NULL)
557 PopulateSubNames(sfbp->namstr, " (specimen_voucher ", specimen_voucher, 23, mods);
558 if((use_what & USE_STRAIN) == USE_STRAIN && strain != NULL)
559 PopulateSubNames(sfbp->namstr, " (strain ", strain, 2, mods);
560 if((use_what & USE_SUB_SPECIES) == USE_SUB_SPECIES && sub_species != NULL)
561 PopulateSubNames(sfbp->namstr, " (sub_species ", sub_species, 22, mods);
562 if((use_what & USE_SUB_STRAIN) == USE_SUB_STRAIN && sub_strain != NULL)
563 PopulateSubNames(sfbp->namstr, " (sub_strain ", sub_strain, 3, mods);
564 if((use_what & USE_VARIETY) == USE_VARIETY && variety != NULL)
565 PopulateSubNames(sfbp->namstr, " (variety ", variety, 6, mods);
566 if((use_what & USE_ECOTYPE) == USE_ECOTYPE && ecotype != NULL)
567 PopulateSubNames(sfbp->namstr, " (ecotype ", ecotype, 27, mods);
568 }
569
570 /**********************************************************/
SourceFeatStructFillIn(IndexblkPtr ibp,SourceFeatBlkPtr sfbp,Int4 use_what)571 static bool SourceFeatStructFillIn(IndexblkPtr ibp, SourceFeatBlkPtr sfbp, Int4 use_what)
572 {
573 const Char **b;
574
575 const Char* name;
576 const Char* cultivar;
577 const Char* isolate;
578 const Char* organelle;
579 const Char* serotype;
580 const Char* serovar;
581 const Char* ecotype;
582 const Char* specimen_voucher;
583 const Char* strain;
584 const Char* sub_species;
585 const Char* sub_strain;
586 const Char* variety;
587 char* genomename;
588 const Char* p;
589 char* q;
590 bool ret;
591 Int4 i;
592
593 for(ret = true; sfbp != NULL; sfbp = sfbp->next)
594 {
595 name = NULL;
596 cultivar = NULL;
597 isolate = NULL;
598 organelle = NULL;
599 serotype = NULL;
600 serovar = NULL;
601 ecotype = NULL;
602 specimen_voucher = NULL;
603 strain = NULL;
604 sub_species = NULL;
605 sub_strain = NULL;
606 variety = NULL;
607 genomename = NULL;
608
609 ITERATE(TQualVector, cur, sfbp->quals)
610 {
611 if (!(*cur)->IsSetQual())
612 continue;
613
614 const std::string& qual_str = (*cur)->GetQual();
615 const Char* val_ptr = (*cur)->IsSetVal() ? (*cur)->GetVal().c_str() : NULL;
616
617 if (qual_str == "db_xref")
618 {
619 q = StringChr(val_ptr, ':');
620 if(q == NULL || q[1] == '\0')
621 continue;
622 *q = '\0';
623 if (StringICmp(val_ptr, "taxon") == 0)
624 if(ibp->taxid < 1)
625 ibp->taxid = atoi(q + 1);
626 *q = ':';
627 continue;
628 }
629 if (qual_str == "focus")
630 {
631 sfbp->focus = true;
632 continue;
633 }
634 if (qual_str == "transgenic")
635 {
636 sfbp->tg = true;
637 continue;
638 }
639 if (qual_str == "cultivar")
640 {
641 cultivar = val_ptr;
642 continue;
643 }
644 if (qual_str == "isolate")
645 {
646 if(isolate == NULL)
647 isolate = val_ptr;
648 continue;
649 }
650 if (qual_str == "mol_type")
651 {
652 if(sfbp->moltype != NULL)
653 ret = false;
654 else if (val_ptr != NULL)
655 sfbp->moltype = StringSave(val_ptr);
656 continue;
657 }
658 if (qual_str == "organelle")
659 {
660 if(organelle == NULL)
661 organelle = val_ptr;
662 continue;
663 }
664 if (qual_str == "serotype")
665 {
666 serotype = val_ptr;
667 continue;
668 }
669 if (qual_str == "serovar")
670 {
671 serovar = val_ptr;
672 continue;
673 }
674 if (qual_str == "ecotype")
675 {
676 ecotype = val_ptr;
677 continue;
678 }
679 if (qual_str == "specimen_voucher")
680 {
681 specimen_voucher = val_ptr;
682 continue;
683 }
684 if (qual_str == "strain")
685 {
686 if(strain == NULL)
687 strain = val_ptr;
688 continue;
689 }
690 if (qual_str == "sub_species")
691 {
692 sub_species = val_ptr;
693 continue;
694 }
695 if (qual_str == "sub_strain")
696 {
697 sub_strain = val_ptr;
698 continue;
699 }
700 if (qual_str == "variety")
701 {
702 variety = val_ptr;
703 continue;
704 }
705 if(qual_str == "submitter_seqid")
706 {
707 if(sfbp->submitter_seqid != NULL)
708 {
709 MemFree(sfbp->submitter_seqid);
710 sfbp->submitter_seqid = StringSave("");
711 }
712 else
713 sfbp->submitter_seqid = StringSave(val_ptr);
714 if(ibp->submitter_seqid == NULL)
715 ibp->submitter_seqid = StringSave(val_ptr);
716 continue;
717 }
718
719 if (qual_str != "organism" ||
720 val_ptr == NULL || val_ptr[0] == '\0')
721 continue;
722
723 if(ibp->organism == NULL)
724 ibp->organism = StringSave(val_ptr);
725
726 p = StringChr(val_ptr, ' ');
727
728 std::string str_to_find;
729 if (p != NULL)
730 str_to_find.assign(val_ptr, p);
731 else
732 str_to_find.assign(val_ptr);
733
734 for(i = 0, b = source_genomes; *b != NULL; b++, i++)
735 if (StringNICmp(str_to_find.c_str(), *b, StringLen(*b)) == 0)
736 break;
737 if(*b != NULL && i != 8)
738 {
739 if(genomename != NULL)
740 MemFree(genomename);
741 genomename = StringSave(str_to_find.c_str());
742 }
743
744 if(p != NULL)
745 ++p;
746
747 if(*b == NULL)
748 p = val_ptr;
749 else
750 {
751 if(i == 0)
752 sfbp->genome = 5; /* Mitochondrion */
753 else if(i == 1)
754 sfbp->genome = 2; /* Chloroplast */
755 else if(i == 2)
756 sfbp->genome = 4; /* Kinetoplast */
757 else if(i == 3)
758 sfbp->genome = 12; /* Cyanelle */
759 else if(i == 4)
760 sfbp->genome = 6; /* Plastid */
761 else if(i == 5)
762 sfbp->genome = 3; /* Chromoplast */
763 else if(i == 6)
764 sfbp->genome = 7; /* Macronuclear */
765 else if(i == 7)
766 sfbp->genome = 8; /* Extrachrom */
767 else if(i == 8)
768 {
769 p = val_ptr;
770 sfbp->genome = 9; /* Plasmid */
771 }
772 }
773 name = p;
774 }
775
776 if(sfbp->name != NULL)
777 MemFree(sfbp->name);
778 sfbp->name = (name == NULL) ? NULL : StringSave(name);
779
780 if(sfbp->genomename != NULL)
781 MemFree(sfbp->genomename);
782 sfbp->genomename = genomename;
783
784 if(strain != NULL && sfbp->strain == NULL)
785 sfbp->strain = StringSave(strain);
786 if(isolate != NULL && sfbp->isolate == NULL)
787 sfbp->isolate = StringSave(isolate);
788 if(organelle != NULL && sfbp->organelle == NULL)
789 sfbp->organelle = StringSave(organelle);
790
791 CollectSubNames(sfbp, use_what, name, cultivar, isolate, serotype,
792 serovar, specimen_voucher, strain, sub_species,
793 sub_strain, variety, ecotype);
794 }
795 return(ret);
796 }
797
798 /**********************************************************/
CheckSourceFeatFocusAndTransposon(SourceFeatBlkPtr sfbp)799 static char* CheckSourceFeatFocusAndTransposon(SourceFeatBlkPtr sfbp)
800 {
801 for (; sfbp != NULL; sfbp = sfbp->next)
802 {
803 if (sfbp->focus && sfbp->skip)
804 break;
805 }
806
807 if(sfbp != NULL)
808 return(sfbp->location);
809 return(NULL);
810 }
811
812 /**********************************************************/
CheckSourceFeatOrgs(SourceFeatBlkPtr sfbp,int * status)813 static char* CheckSourceFeatOrgs(SourceFeatBlkPtr sfbp, int* status)
814 {
815 *status = 0;
816 for(; sfbp != NULL; sfbp = sfbp->next)
817 {
818 /** if(sfbp->namstr != NULL)*/
819 if(sfbp->name != NULL)
820 continue;
821
822 *status = (sfbp->genome == 0) ? 1 : 2;
823 break;
824 }
825 if(sfbp != NULL)
826 return(sfbp->location);
827 return(NULL);
828 }
829
830 /**********************************************************/
CheckSourceFeatLocFuzz(SourceFeatBlkPtr sfbp)831 static bool CheckSourceFeatLocFuzz(SourceFeatBlkPtr sfbp)
832 {
833 const char **b;
834 char* p;
835 char* q;
836 Int4 count;
837 bool partial;
838 bool invalid;
839 bool ret;
840
841 ret = true;
842 for(; sfbp != NULL; sfbp = sfbp->next)
843 {
844 if(sfbp->location == NULL || sfbp->location[0] == '\0')
845 break;
846 if(sfbp->skip)
847 continue;
848
849 ITERATE(TQualVector, cur, sfbp->quals)
850 {
851 if ((*cur)->GetQual() != "partial")
852 continue;
853
854 ErrPostEx(SEV_ERROR, ERR_SOURCE_PartialQualifier,
855 "Source feature location has /partial qualifier. Qualifier has been ignored: \"%s\".",
856 (sfbp->location == NULL) ? "?empty?" : sfbp->location);
857 break;
858 }
859
860 for(b = unusual_toks; *b != NULL; b++)
861 {
862 p = StringStr(sfbp->location, *b);
863 if(p == NULL)
864 continue;
865 q = p + StringLen(*b);
866 if(p > sfbp->location)
867 p--;
868 if((p == sfbp->location || *p == '(' || *p == ')' ||
869 *p == ':' || *p == ',' || *p == '.') &&
870 (*q == '\0' || *q == '(' || *q == ')' || *q == ',' ||
871 *q == ':' || *q == '.'))
872 {
873 ErrPostEx(SEV_ERROR, ERR_SOURCE_UnusualLocation,
874 "Source feature has an unusual location: \"%s\".",
875 (sfbp->location == NULL) ? "?empty?" : sfbp->location);
876 break;
877 }
878 }
879
880 partial = false;
881 invalid = false;
882 for(count = 0, p = sfbp->location; *p != '\0'; p++)
883 {
884 if(*p == '^')
885 invalid = true;
886 else if(*p == '>' || *p == '<')
887 partial = true;
888 else if(*p == '(')
889 count++;
890 else if(*p == ')')
891 count--;
892 else if(*p == '.' && p[1] == '.')
893 p++;
894 else if(*p == '.' && p[1] != '.')
895 {
896 for(q = p + 1; *q >= '0' && *q <= '9';)
897 q++;
898 if(q == p || *q != ':')
899 invalid = true;
900 }
901 }
902 if(partial)
903 {
904 ErrPostEx(SEV_ERROR, ERR_SOURCE_PartialLocation,
905 "Source feature location is partial; partiality flags have been ignored: \"%s\".",
906 (sfbp->location == NULL) ? "?empty?" : sfbp->location);
907 }
908 if(invalid || count != 0)
909 {
910 ErrPostEx(SEV_REJECT, ERR_SOURCE_InvalidLocation,
911 "Invalid location for source feature at \"%s\". Entry dropped.",
912 (sfbp->location == NULL) ? "?empty?" : sfbp->location);
913 ret = false;
914 }
915 }
916 return(ret);
917 }
918
919 /**********************************************************/
CheckSourceFeatLocAccs(SourceFeatBlkPtr sfbp,char * acc)920 static char* CheckSourceFeatLocAccs(SourceFeatBlkPtr sfbp, char* acc)
921 {
922 char* p;
923 char* q;
924 char* r;
925 Int4 i;
926
927 for(; sfbp != NULL; sfbp = sfbp->next)
928 {
929 if(sfbp->location == NULL || sfbp->location[0] == '\0')
930 continue;
931 for(p = sfbp->location + 1; *p != '\0'; p++)
932 {
933 if(*p != ':')
934 continue;
935 for(r = NULL, q = p - 1;; q--)
936 {
937 if(q == sfbp->location)
938 {
939 if(*q != '_' && (*q < '0' || *q > '9') &&
940 (*q < 'a' || *q > 'z') && (*q < 'A' || *q > 'Z'))
941 q++;
942 break;
943 }
944 if(*q == '.')
945 {
946 if(r == NULL)
947 {
948 r = q;
949 continue;
950 }
951 q++;
952 break;
953 }
954 if(*q != '_' && (*q < '0' || *q > '9') &&
955 (*q < 'a' || *q > 'z') && (*q < 'A' || *q > 'Z'))
956 {
957 q++;
958 break;
959 }
960 }
961 if(q == p)
962 continue;
963 if(r != NULL)
964 *r = '\0';
965 else
966 *p = '\0';
967 i = StringICmp(q, acc);
968 if(r != NULL)
969 *r = '.';
970 else
971 *p = ':';
972 if(i != 0)
973 break;
974 }
975 if(*p != '\0')
976 break;
977 }
978 if(sfbp == NULL)
979 return(NULL);
980 return(sfbp->location);
981 }
982
983 /**********************************************************/
MinMaxFree(MinMaxPtr mmp)984 static void MinMaxFree(MinMaxPtr mmp)
985 {
986 MinMaxPtr tmmp;
987
988 for(; mmp != NULL; mmp = tmmp)
989 {
990 tmmp = mmp->next;
991 MemFree(mmp);
992 }
993 }
994
995 /**********************************************************/
fta_if_special_org(const Char * name)996 bool fta_if_special_org(const Char* name)
997 {
998 const char **b;
999
1000 if(name == NULL || *name == '\0')
1001 return false;
1002
1003 for(b = special_orgs; *b != NULL; b++)
1004 if(StringICmp(*b, name) == 0)
1005 break;
1006 if(*b != NULL || StringIStr(name, "vector") != NULL)
1007 return true;
1008 return false;
1009 }
1010
1011 /**********************************************************/
CheckSourceFeatCoverage(SourceFeatBlkPtr sfbp,MinMaxPtr mmp,size_t len)1012 static Int4 CheckSourceFeatCoverage(SourceFeatBlkPtr sfbp, MinMaxPtr mmp,
1013 size_t len)
1014 {
1015 SourceFeatBlkPtr tsfbp;
1016 MinMaxPtr tmmp;
1017 MinMaxPtr mmpnext;
1018 char* p;
1019 char* q;
1020 char* r;
1021 char* loc;
1022 Int4 count;
1023 Int4 min;
1024 Int4 max;
1025 Int4 i;
1026 Int4 tgs;
1027 Int4 sporg;
1028
1029 loc = NULL;
1030 tmmp = mmp;
1031 for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1032 {
1033 if(tsfbp->location == NULL || tsfbp->location[0] == '\0' ||
1034 tsfbp->name == NULL || tsfbp->name[0] == '\0')
1035 continue;
1036 if(loc != NULL)
1037 MemFree(loc);
1038 loc = StringSave(tsfbp->location);
1039 for(p = loc; *p != '\0'; p++)
1040 if(*p == ',' || *p == '(' || *p == ')' || *p == ':' ||
1041 *p == ';' || *p == '^')
1042 *p = ' ';
1043 for(p = loc, q = loc; *p != '\0';)
1044 {
1045 if(*p == '>' || *p == '<')
1046 {
1047 p++;
1048 continue;
1049 }
1050 *q++ = *p;
1051 if(*p == ' ')
1052 while(*p == ' ')
1053 p++;
1054 else
1055 p++;
1056 }
1057 if(q > loc && *(q - 1) == ' ')
1058 q--;
1059 *q = '\0';
1060
1061 q = (*loc == ' ') ? (loc + 1) : loc;
1062 for(p = q;;)
1063 {
1064 min = 0;
1065 max = 0;
1066 p = StringChr(p, ' ');
1067 if(p != NULL)
1068 *p++ = '\0';
1069 for(r = q; *r >= '0' && *r <= '9';)
1070 r++;
1071 if(*r == '\0')
1072 {
1073 i = atoi(q);
1074 if(i > 0)
1075 {
1076 min = i;
1077 max = i;
1078 }
1079 }
1080 else if(*r == '.' && r[1] == '.')
1081 {
1082 *r++ = '\0';
1083 min = atoi(q);
1084 if(min > 0)
1085 {
1086 for(q = ++r; *r >= '0' && *r <= '9';)
1087 r++;
1088 if(*r == '\0')
1089 max = atoi(q);
1090 }
1091 }
1092 if(min > 0 && max > 0)
1093 {
1094 if(min == 1 && (size_t) max == len)
1095 tsfbp->full = true;
1096 for(tmmp = mmp;; tmmp = tmmp->next)
1097 {
1098 if(min < tmmp->min)
1099 {
1100 mmpnext = tmmp->next;
1101 tmmp->next = (MinMaxPtr) MemNew(sizeof(MinMax));
1102 tmmp->next->orgname = tmmp->orgname;
1103 tmmp->next->min = tmmp->min;
1104 tmmp->next->max = tmmp->max;
1105 tmmp->next->skip = tmmp->skip;
1106 tmmp->next->next = mmpnext;
1107 tmmp->orgname = tsfbp->name;
1108 tmmp->min = min;
1109 tmmp->max = max;
1110 tmmp->skip = tsfbp->skip;
1111 break;
1112 }
1113 if(tmmp->next == NULL)
1114 {
1115 tmmp->next = (MinMaxPtr) MemNew(sizeof(MinMax));
1116 tmmp->next->orgname = tsfbp->name;
1117 tmmp->next->min = min;
1118 tmmp->next->max = max;
1119 tmmp->next->skip = tsfbp->skip;
1120 break;
1121 }
1122 }
1123 }
1124
1125 if(p == NULL)
1126 break;
1127 q = p;
1128 }
1129 }
1130 if(loc != NULL)
1131 MemFree(loc);
1132
1133 mmp = mmp->next;
1134 if(mmp == NULL || mmp->min != 1)
1135 return(1);
1136
1137 for(max = mmp->max; mmp != NULL; mmp = mmp->next)
1138 if(mmp->max > max && mmp->min <= max + 1)
1139 max = mmp->max;
1140
1141 if((size_t) max < len)
1142 return(1);
1143
1144 tgs = 0;
1145 count = 0;
1146 sporg = 0;
1147 for(tsfbp = sfbp, i = 0; tsfbp != NULL; tsfbp = tsfbp->next, i++)
1148 {
1149 if(!tsfbp->full)
1150 continue;
1151
1152 if(fta_if_special_org(tsfbp->name))
1153 sporg++;
1154
1155 count++;
1156 if(tsfbp->tg)
1157 tgs++;
1158 }
1159
1160 if(count < 2)
1161 return(0);
1162 if(count > 2 || i > count || (tgs != 1 && sporg != 1))
1163 return(2);
1164 return(0);
1165 }
1166
1167 /**********************************************************/
CheckWholeSourcesVersusFocused(SourceFeatBlkPtr sfbp)1168 static char* CheckWholeSourcesVersusFocused(SourceFeatBlkPtr sfbp)
1169 {
1170 char* p = NULL;
1171 bool whole = false;
1172
1173 for(; sfbp != NULL; sfbp = sfbp->next)
1174 {
1175 if(sfbp->full)
1176 whole = true;
1177 else if(sfbp->focus)
1178 p = sfbp->location;
1179 }
1180
1181 if(whole)
1182 return(p);
1183 return(NULL);
1184 }
1185
1186 /**********************************************************/
CheckSYNTGNDivision(SourceFeatBlkPtr sfbp,char * div)1187 static bool CheckSYNTGNDivision(SourceFeatBlkPtr sfbp, char* div)
1188 {
1189 char* p;
1190 bool got;
1191 bool ret;
1192 Int4 syntgndiv;
1193 Char ch;
1194
1195 syntgndiv = 0;
1196 if(div != NULL && *div != '\0')
1197 {
1198 if(StringCmp(div, "SYN") == 0)
1199 syntgndiv = 1;
1200 else if(StringCmp(div, "TGN") == 0)
1201 syntgndiv = 2;
1202 }
1203
1204 for(ret = true, got = false; sfbp != NULL; sfbp = sfbp->next)
1205 {
1206 if(!sfbp->tg)
1207 continue;
1208
1209 if(syntgndiv == 0)
1210 {
1211 p = sfbp->location;
1212 if(p != NULL && StringLen(p) > 50)
1213 {
1214 ch = p[50];
1215 p[50] = '\0';
1216 }
1217 else
1218 ch = '\0';
1219 ErrPostEx(SEV_REJECT, ERR_DIVISION_TransgenicNotSYN_TGN,
1220 "Source feature located at \"%s\" has a /transgenic qualifier, but this record is not in the SYN or TGN division.",
1221 (p == NULL) ? "unknown" : p);
1222 if(ch != '\0')
1223 p[50] = ch;
1224 ret = false;
1225 }
1226
1227 if(sfbp->full)
1228 got = true;
1229 }
1230
1231 if(syntgndiv == 2 && !got)
1232 ErrPostEx(SEV_ERROR, ERR_DIVISION_TGNnotTransgenic,
1233 "This record uses the TGN division code, but there is no full-length /transgenic source feature.");
1234 return(ret);
1235 }
1236
1237 /**********************************************************/
CheckTransgenicSourceFeats(SourceFeatBlkPtr sfbp)1238 static Int4 CheckTransgenicSourceFeats(SourceFeatBlkPtr sfbp)
1239 {
1240 SourceFeatBlkPtr tsfbp;
1241 char* taxname;
1242 bool same;
1243 bool tgfull;
1244
1245 if(sfbp == NULL)
1246 return(0);
1247
1248 Int4 ret = 0;
1249 bool tgs = false;
1250 bool focus = false;
1251 for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1252 {
1253 if(tsfbp->tg)
1254 {
1255 if(!tsfbp->full)
1256 ret = 1; /* /transgenic on not full-length */
1257 else if(tgs)
1258 ret = 3; /* multiple /transgenics */
1259 if(ret != 0)
1260 break;
1261 tgs = true;
1262 }
1263 if(tsfbp->focus)
1264 focus = true;
1265 if(tgs && focus)
1266 {
1267 ret = 2; /* /focus and /transgenic */
1268 break;
1269 }
1270 }
1271
1272 if(ret != 0)
1273 return(ret);
1274
1275 same = true;
1276 tgfull = false;
1277 taxname = NULL;
1278 for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1279 {
1280 if(tsfbp->skip)
1281 continue;
1282 if(taxname == NULL)
1283 taxname = tsfbp->name;
1284 else if(same && !fta_strings_same(taxname, tsfbp->name))
1285 same = false;
1286 if(tsfbp->tg && tsfbp->full)
1287 tgfull = true;
1288 if(tsfbp->focus)
1289 focus = true;
1290 }
1291
1292 if(same == false && tgfull == false && focus == false)
1293 return(4);
1294
1295 if(sfbp->next == NULL || !tgs)
1296 return(0);
1297
1298 for(tsfbp = sfbp->next; tsfbp != NULL; tsfbp = tsfbp->next)
1299 if(fta_strings_same(sfbp->name, tsfbp->name) == false ||
1300 fta_strings_same(sfbp->strain, tsfbp->strain) == false ||
1301 fta_strings_same(sfbp->isolate, tsfbp->isolate) == false ||
1302 fta_strings_same(sfbp->organelle, tsfbp->organelle) == false)
1303 break;
1304
1305 if(tsfbp == NULL)
1306 return(5); /* all source features have the same
1307 /organism, /strain, /isolate and
1308 /organelle qualifiers */
1309 return(0);
1310 }
1311
1312 /**********************************************************/
CheckFocusInOrgs(SourceFeatBlkPtr sfbp,size_t len,int * status)1313 static Int4 CheckFocusInOrgs(SourceFeatBlkPtr sfbp, size_t len, int* status)
1314 {
1315 SourceFeatBlkPtr tsfbp;
1316 const char **b;
1317 char* name;
1318 Char pat[100];
1319 Int4 count;
1320 bool same;
1321
1322 count = 0;
1323 name = NULL;
1324 same = true;
1325 for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1326 {
1327 if(tsfbp->name == NULL)
1328 continue;
1329 if(tsfbp->focus)
1330 count++;
1331 if(name == NULL)
1332 name = tsfbp->name;
1333 else if(StringICmp(name, tsfbp->name) != 0)
1334 same = false;
1335 }
1336 if(same && count > 0)
1337 (*status)++;
1338
1339 name = NULL;
1340 for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1341 {
1342 if(!tsfbp->focus || tsfbp->name == NULL)
1343 continue;
1344 if(name == NULL)
1345 name = tsfbp->name;
1346 else if(StringICmp(name, tsfbp->name) != 0)
1347 break;
1348 }
1349 if(tsfbp != NULL)
1350 return(2);
1351
1352 if(same || count != 0)
1353 return(0);
1354
1355 name = NULL;
1356 sprintf(pat, "1..%ld", len);
1357 for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1358 {
1359 if(tsfbp->name == NULL || tsfbp->location == NULL ||
1360 tsfbp->skip)
1361 continue;
1362
1363 for (b = special_orgs; *b != NULL; b++)
1364 {
1365 if (StringICmp(*b, tsfbp->name) == 0 &&
1366 StringCmp(tsfbp->location, pat) == 0)
1367 break;
1368 }
1369 if(*b != NULL)
1370 continue;
1371
1372 if(name == NULL)
1373 /** name = tsfbp->namstr;*/
1374 name = tsfbp->name;
1375 /** else if(StringICmp(name, tsfbp->namstr) != 0)*/
1376 else if(StringICmp(name, tsfbp->name) != 0)
1377 break;
1378 }
1379
1380 if(tsfbp == NULL)
1381 return(0);
1382
1383 for (tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1384 {
1385 if (tsfbp->full && tsfbp->tg && !tsfbp->skip)
1386 break;
1387 }
1388
1389 if(tsfbp != NULL)
1390 return(0);
1391 return(3);
1392 }
1393
1394 /**********************************************************/
IfSpecialFeat(MinMaxPtr mmp,size_t len)1395 static bool IfSpecialFeat(MinMaxPtr mmp, size_t len)
1396 {
1397 if((mmp->min == 1 && (size_t) mmp->max == len) || mmp->skip)
1398 return true;
1399 return false;
1400 }
1401
1402 /**********************************************************/
CheckSourceOverlap(MinMaxPtr mmp,size_t len)1403 static char* CheckSourceOverlap(MinMaxPtr mmp, size_t len)
1404 {
1405 MinMaxPtr tmmp;
1406 char* res;
1407
1408 for(; mmp != NULL; mmp = mmp->next)
1409 {
1410 if(IfSpecialFeat(mmp, len))
1411 continue;
1412 for(tmmp = mmp->next; tmmp != NULL; tmmp = tmmp->next)
1413 {
1414 if(IfSpecialFeat(tmmp, len))
1415 continue;
1416 if(StringICmp(mmp->orgname, tmmp->orgname) == 0)
1417 continue;
1418 if(tmmp->min <= mmp->max && tmmp->max >= mmp->min)
1419 break;
1420 }
1421 if(tmmp != NULL)
1422 break;
1423 }
1424 if(mmp == NULL)
1425 return(NULL);
1426
1427 res = (char*) MemNew(1024);
1428 sprintf(res, "\"%s\" at %d..%d vs \"%s\" at %d..%d", mmp->orgname,
1429 mmp->min, mmp->max, tmmp->orgname, tmmp->min, tmmp->max);
1430 return(res);
1431 }
1432
1433 /**********************************************************/
CheckForUnusualFullLengthOrgs(SourceFeatBlkPtr sfbp)1434 static char* CheckForUnusualFullLengthOrgs(SourceFeatBlkPtr sfbp)
1435 {
1436 SourceFeatBlkPtr tsfbp;
1437 const char **b;
1438
1439 if(sfbp == NULL || sfbp->next == NULL)
1440 return(NULL);
1441
1442 for(tsfbp = sfbp->next; tsfbp != NULL; tsfbp = tsfbp->next)
1443 if(StringICmp(sfbp->name, tsfbp->name) != 0)
1444 break;
1445
1446 if(tsfbp == NULL)
1447 return(NULL);
1448
1449 for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1450 if(tsfbp->full && tsfbp->tg)
1451 break;
1452
1453 if(tsfbp != NULL)
1454 return(NULL);
1455
1456 for(; sfbp != NULL; sfbp = sfbp->next)
1457 {
1458 if(!sfbp->full || sfbp->tg)
1459 continue;
1460
1461 for(b = special_orgs; *b != NULL; b++)
1462 if(StringICmp(*b, sfbp->name) == 0)
1463 break;
1464
1465 if(*b != NULL)
1466 continue;
1467
1468 if(StringIStr(sfbp->name, "vector") == NULL)
1469 break;
1470 }
1471 if(sfbp == NULL)
1472 return(NULL);
1473 return(sfbp->name);
1474 }
1475
1476 /**********************************************************/
CreateRawBioSources(ParserPtr pp,SourceFeatBlkPtr sfbp,Int4 use_what)1477 static void CreateRawBioSources(ParserPtr pp, SourceFeatBlkPtr sfbp,
1478 Int4 use_what)
1479 {
1480 SourceFeatBlkPtr tsfbp;
1481 char* namstr;
1482 const Char* cultivar;
1483 const Char* isolate;
1484 const Char* serotype;
1485 const Char* serovar;
1486 const Char* ecotype;
1487 const Char* specimen_voucher;
1488 const Char* strain;
1489 const Char* sub_species;
1490 const Char* sub_strain;
1491 const Char* variety;
1492
1493 for(; sfbp != NULL; sfbp = sfbp->next)
1494 {
1495 if (sfbp->bio_src.NotEmpty())
1496 continue;
1497
1498 namstr = StringSave(sfbp->namstr);
1499 CRef<objects::COrg_ref> org_ref(new objects::COrg_ref);
1500 org_ref->SetTaxname(sfbp->name);
1501
1502 if (sfbp->orgname.NotEmpty())
1503 {
1504 org_ref->SetOrgname(*sfbp->orgname);
1505 }
1506
1507 CRef<objects::COrg_ref> t_org_ref(new objects::COrg_ref);
1508 t_org_ref->Assign(*org_ref);
1509 fta_fix_orgref(pp, *org_ref, &pp->entrylist[pp->curindx]->drop, sfbp->genomename);
1510
1511 if (t_org_ref->Equals(*org_ref))
1512 sfbp->lookup = false;
1513 else
1514 {
1515 sfbp->lookup = true;
1516 MemFree(sfbp->name);
1517 sfbp->name = StringSave(org_ref->GetTaxname().c_str());
1518
1519 sfbp->orgname.Reset();
1520
1521 cultivar = NULL;
1522 isolate = NULL;
1523 serotype = NULL;
1524 serovar = NULL;
1525 ecotype = NULL;
1526 specimen_voucher = NULL;
1527 strain = NULL;
1528 sub_species = NULL;
1529 sub_strain = NULL;
1530 variety = NULL;
1531 if (org_ref->IsSetOrgname() && org_ref->IsSetOrgMod())
1532 {
1533 ITERATE(objects::COrgName::TMod, mod, org_ref->GetOrgname().GetMod())
1534 {
1535 switch ((*mod)->GetSubtype())
1536 {
1537 case 10:
1538 cultivar = (*mod)->GetSubname().c_str();
1539 break;
1540 case 17:
1541 isolate = (*mod)->GetSubname().c_str();
1542 break;
1543 case 7:
1544 serotype = (*mod)->GetSubname().c_str();
1545 break;
1546 case 9:
1547 serovar = (*mod)->GetSubname().c_str();
1548 break;
1549 case 27:
1550 ecotype = (*mod)->GetSubname().c_str();
1551 break;
1552 case 23:
1553 specimen_voucher = (*mod)->GetSubname().c_str();
1554 break;
1555 case 2:
1556 strain = (*mod)->GetSubname().c_str();
1557 break;
1558 case 22:
1559 sub_species = (*mod)->GetSubname().c_str();
1560 break;
1561 case 3:
1562 sub_strain = (*mod)->GetSubname().c_str();
1563 break;
1564 case 6:
1565 variety = (*mod)->GetSubname().c_str();
1566 break;
1567 }
1568 }
1569 }
1570 CollectSubNames(sfbp, use_what, sfbp->name, cultivar, isolate,
1571 serotype, serovar, specimen_voucher, strain,
1572 sub_species, sub_strain, variety, ecotype);
1573 }
1574
1575 sfbp->bio_src.Reset(new objects::CBioSource);
1576 sfbp->bio_src->SetOrg(*org_ref);
1577
1578 for(tsfbp = sfbp->next; tsfbp != NULL; tsfbp = tsfbp->next)
1579 {
1580 if(tsfbp->bio_src.NotEmpty() || StringICmp(namstr, tsfbp->namstr) != 0)
1581 continue;
1582
1583 tsfbp->lookup = sfbp->lookup;
1584
1585 tsfbp->bio_src.Reset(new objects::CBioSource);
1586 tsfbp->bio_src->Assign(*sfbp->bio_src);
1587
1588 if(!sfbp->lookup)
1589 continue;
1590
1591 MemFree(tsfbp->name);
1592 tsfbp->name = StringSave(sfbp->name);
1593
1594 MemFree(tsfbp->namstr);
1595 tsfbp->namstr = StringSave(sfbp->namstr);
1596 }
1597 MemFree(namstr);
1598 }
1599 }
1600
1601 /**********************************************************/
SourceFeatMoveOneUp(SourceFeatBlkPtr where,SourceFeatBlkPtr what)1602 static SourceFeatBlkPtr SourceFeatMoveOneUp(SourceFeatBlkPtr where,
1603 SourceFeatBlkPtr what)
1604 {
1605 SourceFeatBlkPtr prev;
1606 SourceFeatBlkPtr tsfbp;
1607
1608 if(what == where)
1609 return(where);
1610
1611 prev = where;
1612 for(tsfbp = where->next; tsfbp != NULL; tsfbp = tsfbp->next)
1613 {
1614 if(tsfbp == what)
1615 break;
1616 prev = tsfbp;
1617 }
1618 if(tsfbp == NULL)
1619 return(where);
1620
1621 prev->next = what->next;
1622 what->next = where;
1623 return(what);
1624 }
1625
1626 /**********************************************************/
SourceFeatRemoveDups(SourceFeatBlkPtr sfbp)1627 static SourceFeatBlkPtr SourceFeatRemoveDups(SourceFeatBlkPtr sfbp)
1628 {
1629 SourceFeatBlkPtr tsfbp;
1630 SourceFeatBlkPtr prev;
1631 SourceFeatBlkPtr next;
1632
1633 for(prev = sfbp, tsfbp = sfbp->next; tsfbp != NULL; tsfbp = next)
1634 {
1635 next = tsfbp->next;
1636 if(!tsfbp->useit)
1637 {
1638 prev = tsfbp;
1639 continue;
1640 }
1641
1642 bool different = false;
1643 ITERATE(TQualVector, cur, tsfbp->quals)
1644 {
1645 const std::string& cur_qual = (*cur)->GetQual();
1646 if (cur_qual == "focus")
1647 continue;
1648
1649 bool found = false;
1650 ITERATE(TQualVector, next, sfbp->quals)
1651 {
1652 const std::string& next_qual = (*next)->GetQual();
1653
1654 if (next_qual == "focus" || next_qual != cur_qual)
1655 continue;
1656
1657 if (!(*cur)->IsSetVal() && !(*next)->IsSetVal())
1658 {
1659 found = true;
1660 break;
1661 }
1662
1663 if ((*cur)->IsSetVal() && (*next)->IsSetVal() &&
1664 (*cur)->GetVal() == (*next)->GetVal())
1665 {
1666 found = true;
1667 break;
1668 }
1669 }
1670
1671 if (!found) /* Different, leave as is */
1672 {
1673 different = true;
1674 break;
1675 }
1676 }
1677
1678 if (different) /* Different, leave as is */
1679 {
1680 prev = tsfbp;
1681 continue;
1682 }
1683 prev->next = tsfbp->next;
1684 tsfbp->next = NULL;
1685 SourceFeatBlkFree(tsfbp);
1686 }
1687 return(sfbp);
1688 }
1689
1690 /**********************************************************/
SourceFeatDerive(SourceFeatBlkPtr sfbp,SourceFeatBlkPtr res)1691 static SourceFeatBlkPtr SourceFeatDerive(SourceFeatBlkPtr sfbp,
1692 SourceFeatBlkPtr res)
1693 {
1694 SourceFeatBlkPtr tsfbp;
1695
1696 if(res == NULL)
1697 return(sfbp);
1698
1699 tsfbp = SourceFeatBlkNew();
1700 tsfbp->name = (res->name == NULL) ? NULL : StringSave(res->name);
1701 tsfbp->namstr = (res->namstr == NULL) ? NULL : StringSave(res->namstr);
1702 tsfbp->location = (res->location == NULL) ? NULL : StringSave(res->location);
1703 tsfbp->full = res->full;
1704 tsfbp->focus = res->focus;
1705 tsfbp->lookup = res->lookup;
1706 tsfbp->genome = res->genome;
1707 tsfbp->next = NULL;
1708
1709 tsfbp->bio_src.Reset(new objects::CBioSource);
1710 tsfbp->bio_src->Assign(*res->bio_src);
1711
1712 tsfbp->orgname.Reset(new objects::COrgName);
1713 if (res->orgname.NotEmpty())
1714 tsfbp->orgname->Assign(*res->orgname);
1715
1716 tsfbp->quals = res->quals;
1717 tsfbp->next = sfbp;
1718 sfbp = tsfbp;
1719
1720 for (TQualVector::iterator cur = sfbp->quals.begin(); cur != sfbp->quals.end(); )
1721 {
1722 const std::string& cur_qual = (*cur)->GetQual();
1723 if (cur_qual == "focus")
1724 {
1725 ++cur;
1726 continue;
1727 }
1728
1729 for(tsfbp = sfbp->next; tsfbp != NULL; tsfbp = tsfbp->next)
1730 {
1731 if(tsfbp == res || !tsfbp->useit)
1732 continue;
1733
1734 bool found = false;
1735 ITERATE(TQualVector, next, tsfbp->quals)
1736 {
1737 const std::string& next_qual = (*next)->GetQual();
1738
1739 if (next_qual == "focus" || next_qual != cur_qual)
1740 continue;
1741
1742 if (!(*cur)->IsSetVal() && !(*next)->IsSetVal())
1743 {
1744 found = true;
1745 break;
1746 }
1747
1748 if ((*cur)->IsSetVal() && (*next)->IsSetVal() &&
1749 (*cur)->GetVal() == (*next)->GetVal())
1750 {
1751 found = true;
1752 break;
1753 }
1754 }
1755
1756 if (!found) /* Not found */
1757 break;
1758 }
1759
1760 if (tsfbp == NULL) /* Got the match */
1761 {
1762 ++cur;
1763 continue;
1764 }
1765
1766 cur = sfbp->quals.erase(cur);
1767 }
1768
1769 return(SourceFeatRemoveDups(sfbp));
1770 }
1771
1772 /**********************************************************/
PickTheDescrSource(SourceFeatBlkPtr sfbp)1773 static SourceFeatBlkPtr PickTheDescrSource(SourceFeatBlkPtr sfbp)
1774 {
1775 SourceFeatBlkPtr res;
1776 SourceFeatBlkPtr tsfbp;
1777
1778 if(sfbp->next == NULL)
1779 {
1780 if(!sfbp->full)
1781 {
1782 ErrPostEx(SEV_WARNING, ERR_SOURCE_SingleSourceTooShort,
1783 "Source feature does not span the entire length of the sequence.");
1784 }
1785 return(sfbp);
1786 }
1787
1788 Int4 count_skip = 0;
1789 Int4 count_noskip = 0;
1790 bool same = true;
1791 for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1792 {
1793 if(StringICmp(tsfbp->name, sfbp->name) != 0)
1794 {
1795 same = false;
1796 break;
1797 }
1798
1799 if(!tsfbp->skip)
1800 {
1801 res = tsfbp;
1802 count_noskip++;
1803 }
1804 else
1805 count_skip++;
1806 }
1807
1808 if(same)
1809 {
1810 if(count_noskip == 1)
1811 {
1812 sfbp = SourceFeatMoveOneUp(sfbp, res);
1813 return(SourceFeatRemoveDups(sfbp));
1814 }
1815 for(res = NULL, tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1816 {
1817 if(count_noskip != 0 && tsfbp->skip)
1818 continue;
1819 tsfbp->useit = true;
1820 if(res == NULL)
1821 res = tsfbp;
1822 }
1823 return(SourceFeatDerive(sfbp, res));
1824 }
1825
1826 for (tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1827 {
1828 if (tsfbp->tg)
1829 break;
1830 }
1831 if(tsfbp != NULL)
1832 return(SourceFeatMoveOneUp(sfbp, tsfbp));
1833
1834 for(res = NULL, tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1835 {
1836 if(!tsfbp->focus)
1837 continue;
1838 res = tsfbp;
1839 if(!tsfbp->skip)
1840 break;
1841 }
1842
1843 if(res != NULL)
1844 {
1845 count_skip = 0;
1846 count_noskip = 0;
1847 for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1848 {
1849 if(StringICmp(res->name, tsfbp->name) != 0)
1850 continue;
1851 tsfbp->useit = true;
1852 if(tsfbp->skip)
1853 count_skip++;
1854 else
1855 count_noskip++;
1856 }
1857 if(count_noskip > 0)
1858 {
1859 for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1860 {
1861 if(StringICmp(res->name, tsfbp->name) != 0)
1862 continue;
1863 if(res != tsfbp && tsfbp->skip)
1864 tsfbp->useit = false;
1865 }
1866 }
1867 return(SourceFeatDerive(sfbp, res));
1868 }
1869
1870 for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
1871 {
1872 if(!tsfbp->full)
1873 continue;
1874 res = tsfbp;
1875 break;
1876 }
1877 if(res != NULL)
1878 {
1879 sfbp = SourceFeatMoveOneUp(sfbp, res);
1880 return(SourceFeatRemoveDups(sfbp));
1881 }
1882
1883 SourceFeatBlkSetFree(sfbp);
1884 ErrPostEx(SEV_ERROR, ERR_SOURCE_MissingSourceFeatureForDescr,
1885 "Could not select the right source feature among different organisms to create descriptor: no /focus and 1..N one. Entry dropped.");
1886 return(NULL);
1887 }
1888
1889 /**********************************************************/
AddOrgMod(objects::COrg_ref & org_ref,const Char * val,Uint1 type)1890 static void AddOrgMod(objects::COrg_ref& org_ref, const Char* val, Uint1 type)
1891 {
1892 objects::COrgName& orgname = org_ref.SetOrgname();
1893
1894 CRef<objects::COrgMod> mod(new objects::COrgMod);
1895 mod->SetSubtype(type);
1896 mod->SetSubname((val == NULL) ? "" : val);
1897
1898 orgname.SetMod().push_back(mod);
1899 }
1900
1901 /**********************************************************/
FTASubSourceAdd(objects::CBioSource & bio,const Char * val,Uint1 type)1902 static void FTASubSourceAdd(objects::CBioSource& bio, const Char* val, Uint1 type)
1903 {
1904 if (type != 12) /* dev-stage */
1905 {
1906 bool found = false;
1907 ITERATE(objects::CBioSource::TSubtype, subtype, bio.GetSubtype())
1908 {
1909 if ((*subtype)->GetSubtype() == type)
1910 {
1911 found = true;
1912 break;
1913 }
1914 }
1915
1916 if (found)
1917 return;
1918 }
1919
1920 CRef<objects::CSubSource> sub(new objects::CSubSource);
1921 sub->SetSubtype(type);
1922 sub->SetName((val == NULL) ? "" : val);
1923 bio.SetSubtype().push_back(sub);
1924 }
1925
1926 /**********************************************************/
CheckQualsInSourceFeat(objects::CBioSource & bio,TQualVector & quals,Uint1 taxserver)1927 static void CheckQualsInSourceFeat(objects::CBioSource& bio, TQualVector& quals,
1928 Uint1 taxserver)
1929 {
1930 const Char **b;
1931
1932 char* p;
1933
1934 if (!bio.CanGetOrg())
1935 return;
1936
1937 std::vector<std::string> modnames;
1938
1939 if (bio.GetOrg().CanGetOrgname() && bio.GetOrg().GetOrgname().CanGetMod())
1940 {
1941 ITERATE(objects::COrgName::TMod, mod, bio.GetOrg().GetOrgname().GetMod())
1942 {
1943 for (size_t i = 0; SourceOrgMods[i].name != NULL; ++i)
1944 {
1945 if(SourceOrgMods[i].num != (*mod)->GetSubtype())
1946 continue;
1947
1948 modnames.push_back(SourceOrgMods[i].name);
1949 break;
1950 }
1951 }
1952 }
1953
1954 ITERATE(TQualVector, cur, quals)
1955 {
1956 if (!(*cur)->IsSetQual() || (*cur)->GetQual() == "organism")
1957 continue;
1958
1959 const std::string& cur_qual = (*cur)->GetQual();
1960 const Char* val_ptr = (*cur)->IsSetVal() ? (*cur)->GetVal().c_str() : NULL;
1961
1962 if (cur_qual == "note")
1963 {
1964 FTASubSourceAdd(bio, val_ptr, 255);
1965 continue;
1966 }
1967
1968 for(b = SourceBadQuals; *b != NULL; b++)
1969 {
1970 if (cur_qual != *b)
1971 continue;
1972
1973 if (val_ptr == NULL || val_ptr[0] == '\0')
1974 p = StringSave("???");
1975 else
1976 p = StringSave(val_ptr);
1977 if(StringLen(p) > 50)
1978 p[50] = '\0';
1979 ErrPostEx(SEV_WARNING, ERR_SOURCE_UnwantedQualifiers,
1980 "Unwanted qualifier on source feature: %s=%s",
1981 cur_qual.c_str(), p);
1982 MemFree(p);
1983 }
1984
1985 b = SourceSubSources;
1986 for (size_t i = 1; *b != NULL; i++, b++)
1987 {
1988 if (**b != '\0' && cur_qual == *b)
1989 {
1990 FTASubSourceAdd(bio, val_ptr, (Uint1)i);
1991 break;
1992 }
1993 }
1994
1995 if (cur_qual == "organism" ||
1996 (taxserver != 0 && cur_qual == "type_material"))
1997 continue;
1998
1999 if (find(modnames.begin(), modnames.end(), cur_qual) != modnames.end())
2000 continue;
2001
2002 for (size_t i = 0; SourceOrgMods[i].name != NULL; i++)
2003 {
2004 if (cur_qual == SourceOrgMods[i].name)
2005 {
2006 AddOrgMod(bio.SetOrg(), val_ptr, SourceOrgMods[i].num);
2007 break;
2008 }
2009 }
2010 }
2011 }
2012
2013 /**********************************************************/
GetSourceDbtag(CRef<objects::CGb_qual> & qual,Parser::ESource source)2014 static CRef<objects::CDbtag> GetSourceDbtag(CRef<objects::CGb_qual>& qual, Parser::ESource source)
2015 {
2016 const char **b;
2017 const char *q;
2018 char* line;
2019 char* p;
2020
2021 CRef<objects::CDbtag> tag;
2022
2023 if (qual->GetQual() != "db_xref")
2024 return tag;
2025
2026 std::vector<Char> val_buf(qual->GetVal().begin(), qual->GetVal().end());
2027 val_buf.push_back(0);
2028
2029 p = StringChr(&val_buf[0], ':');
2030 if(p == NULL || p[1] == '\0')
2031 return tag;
2032
2033 *p = '\0';
2034 if (StringICmp(&val_buf[0], "taxon") == 0)
2035 {
2036 *p = ':';
2037 return tag;
2038 }
2039
2040 if(source == Parser::ESource::NCBI)
2041 q = "NCBI";
2042 else if(source == Parser::ESource::EMBL)
2043 q = "EMBL";
2044 else if(source == Parser::ESource::DDBJ)
2045 q = "DDBJ";
2046 else if(source == Parser::ESource::SPROT)
2047 q = "SwissProt";
2048 else if(source == Parser::ESource::PIR)
2049 q = "PIR";
2050 else if(source == Parser::ESource::LANL)
2051 q = "LANL";
2052 else if(source == Parser::ESource::Refseq)
2053 q = "RefSeq";
2054 else
2055 q = "Unknown";
2056
2057 if(source != Parser::ESource::NCBI && source != Parser::ESource::DDBJ &&
2058 source != Parser::ESource::EMBL && source != Parser::ESource::LANL &&
2059 source != Parser::ESource::Refseq)
2060 {
2061 *p = ':';
2062 ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidDbXref,
2063 "Cannot process source feature's \"/db_xref=%s\" for source \"%s\".",
2064 &val_buf[0], q);
2065 return tag;
2066 }
2067
2068 for (b = ObsoleteSourceDbxrefTag; *b != NULL; b++)
2069 {
2070 if (StringICmp(*b, &val_buf[0]) == 0)
2071 break;
2072 }
2073
2074 if(*b != NULL)
2075 {
2076 ErrPostEx(SEV_WARNING, ERR_SOURCE_ObsoleteDbXref,
2077 "/db_xref type \"%s\" is obsolete.", &val_buf[0]);
2078 if (StringICmp(&val_buf[0], "IFO") == 0)
2079 {
2080 line = (char*) MemNew(25 + StringLen(p + 1));
2081 StringCpy(line, "NBRC:");
2082 StringCat(line, p + 1);
2083 qual->SetVal(line);
2084 MemFree(line);
2085
2086 val_buf.assign(line, line + StringLen(line));
2087 val_buf.push_back(0);
2088
2089 p = &val_buf[0] + 4;
2090 *p = '\0';
2091 }
2092 }
2093
2094 for (b = DENLRSourceDbxrefTag; *b != NULL; b++)
2095 {
2096 if (StringICmp(*b, &val_buf[0]) == 0)
2097 break;
2098 }
2099
2100 if(*b == NULL && (source == Parser::ESource::DDBJ || source == Parser::ESource::EMBL))
2101 {
2102 for(b = DESourceDbxrefTag; *b != NULL; b++)
2103 if (StringICmp(*b, &val_buf[0]) == 0)
2104 break;
2105 }
2106 if(*b == NULL && source == Parser::ESource::EMBL)
2107 {
2108 for(b = ESourceDbxrefTag; *b != NULL; b++)
2109 if (StringICmp(*b, &val_buf[0]) == 0)
2110 break;
2111 }
2112 if(*b == NULL && (source == Parser::ESource::NCBI || source == Parser::ESource::LANL ||
2113 source == Parser::ESource::Refseq))
2114 {
2115 for (b = NLRSourceDbxrefTag; *b != NULL; b++)
2116 {
2117 if (StringICmp(*b, &val_buf[0]) == 0)
2118 break;
2119 }
2120 }
2121
2122 if(*b == NULL)
2123 {
2124 *p = ':';
2125 ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidDbXref,
2126 "Invalid database name in source feature's \"/db_xref=%s\" for source \"%s\".",
2127 &val_buf[0], q);
2128 return tag;
2129 }
2130
2131 tag.Reset(new objects::CDbtag);
2132 tag->SetDb(&val_buf[0]);
2133
2134 *p++ = ':';
2135 for(q = p; *p >= '0' && *p <= '9';)
2136 p++;
2137
2138 if(*p == '\0' && *q != '0')
2139 tag->SetTag().SetId(atoi(q));
2140 else
2141 tag->SetTag().SetStr(q);
2142
2143 return tag;
2144 }
2145
2146 /**********************************************************/
UpdateRawBioSource(SourceFeatBlkPtr sfbp,Parser::ESource source,IndexblkPtr ibp,Uint1 taxserver)2147 static bool UpdateRawBioSource(SourceFeatBlkPtr sfbp, Parser::ESource source, IndexblkPtr ibp, Uint1 taxserver)
2148 {
2149 char* div;
2150 char* tco;
2151 char* p;
2152 char* q;
2153
2154 Int4 newgen;
2155 Int4 oldgen;
2156 Int2 i;
2157
2158 bool is_syn = false;
2159 bool is_pat = false;
2160
2161 div = ibp->division;
2162 if(div != NULL)
2163 {
2164 if(StringCmp(div, "SYN") == 0)
2165 is_syn = true;
2166 else if(StringCmp(div, "PAT") == 0)
2167 is_pat = true;
2168 }
2169 for(; sfbp != NULL; sfbp = sfbp->next)
2170 {
2171 if (sfbp->bio_src.Empty())
2172 continue;
2173
2174 objects::CBioSource& bio = *sfbp->bio_src;
2175
2176 if(!sfbp->lookup)
2177 {
2178 if(is_syn && !sfbp->tg)
2179 bio.SetOrigin(4); /* artificial */
2180 }
2181 else
2182 {
2183 if (bio.CanGetOrg() && bio.GetOrg().CanGetOrgname() &&
2184 bio.GetOrg().GetOrgname().CanGetDiv() &&
2185 bio.GetOrg().GetOrgname().GetDiv() == "SYN")
2186 {
2187 bio.SetOrigin(4); /* artificial */
2188 if (is_syn == false && is_pat == false)
2189 {
2190 const Char* taxname = NULL;
2191 if (bio.GetOrg().CanGetTaxname() &&
2192 !bio.GetOrg().GetTaxname().empty())
2193 taxname = bio.GetOrg().GetTaxname().c_str();
2194 ErrPostEx(SEV_ERROR, ERR_ORGANISM_SynOrgNameNotSYNdivision,
2195 "The NCBI Taxonomy DB believes that organism name \"%s\" is reserved for synthetic sequences, but this record is not in the SYN division.",
2196 (taxname == NULL) ? "not_specified" : taxname);
2197 }
2198 }
2199 }
2200
2201 newgen = -1;
2202 oldgen = -1;
2203
2204 bool dropped = false;
2205 NON_CONST_ITERATE(TQualVector, cur, sfbp->quals)
2206 {
2207 if (!(*cur)->IsSetQual() || (*cur)->GetQual().empty())
2208 continue;
2209
2210 const std::string& cur_qual = (*cur)->GetQual();
2211 if (cur_qual == "db_xref")
2212 {
2213 CRef<objects::CDbtag> dbtag = GetSourceDbtag(*cur, source);
2214 if (dbtag.Empty())
2215 continue;
2216
2217 bio.SetOrg().SetDb().push_back(dbtag);
2218 continue;
2219 }
2220
2221 const Char* val_ptr = (*cur)->IsSetVal() ? (*cur)->GetVal().c_str() : NULL;
2222 if (cur_qual == "organelle")
2223 {
2224 if (val_ptr == NULL || val_ptr[0] == '\0')
2225 continue;
2226
2227 p = StringChr(val_ptr, ':');
2228 if (p != NULL)
2229 {
2230 if (StringChr(p + 1, ':') != NULL)
2231 {
2232 ErrPostEx(SEV_ERROR, ERR_SOURCE_OrganelleQualMultToks,
2233 "More than 2 tokens found in /organelle qualifier: \"%s\". Entry dropped.",
2234 val_ptr);
2235 dropped = true;
2236 break;
2237 }
2238
2239 std::string val_str(val_ptr, static_cast<const Char*>(p));
2240 i = StringMatchIcase(OrganelleFirstToken, val_str.c_str());
2241 if(i < 0)
2242 {
2243 ErrPostEx(SEV_ERROR, ERR_SOURCE_OrganelleIllegalClass,
2244 "Illegal class in /organelle qualifier: \"%s\". Entry dropped.",
2245 val_ptr);
2246 dropped = true;
2247 break;
2248 }
2249 if(i == 4)
2250 ibp->got_plastid = true;
2251 if(newgen < 0)
2252 newgen = StringMatchIcase(GenomicSourceFeatQual,
2253 p + 1);
2254 }
2255 else
2256 {
2257 i = StringMatchIcase(OrganelleFirstToken, val_ptr);
2258 if(i < 0)
2259 {
2260 ErrPostEx(SEV_ERROR, ERR_SOURCE_OrganelleIllegalClass,
2261 "Illegal class in /organelle qualifier: \"%s\". Entry dropped.",
2262 val_ptr);
2263 dropped = true;
2264 break;
2265 }
2266 if(i == 4)
2267 ibp->got_plastid = true;
2268 if(newgen < 0)
2269 newgen = StringMatchIcase(GenomicSourceFeatQual,
2270 val_ptr);
2271 }
2272 continue;
2273 }
2274
2275 if(oldgen < 0)
2276 oldgen = StringMatchIcase(GenomicSourceFeatQual, cur_qual.c_str());
2277
2278 if (cur_qual != "country" ||
2279 val_ptr == NULL || val_ptr[0] == '\0')
2280 continue;
2281
2282 tco = StringSave(val_ptr);
2283 p = StringChr(tco, ':');
2284 if(p != NULL)
2285 *p = '\0';
2286 for(p = tco; *p == ' ' || *p == '\t';)
2287 p++;
2288 if(*p == '\0')
2289 {
2290 ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidCountry,
2291 "Empty country name in /country qualifier : \"%s\".",
2292 val_ptr);
2293 }
2294 else
2295 {
2296 for(q = p + 1; *q != '\0';)
2297 q++;
2298 for(q--; *q == ' ' || *q == '\t';)
2299 q--;
2300 *++q = '\0';
2301
2302 bool valid_country = objects::CCountries::IsValid(p);
2303 if (!valid_country)
2304 {
2305 valid_country = objects::CCountries::WasValid(p);
2306
2307 if (!valid_country)
2308 ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidCountry,
2309 "Country \"%s\" from /country qualifier \"%s\" is not a valid country name.",
2310 tco, val_ptr);
2311 else
2312 ErrPostEx(SEV_WARNING, ERR_SOURCE_FormerCountry,
2313 "Country \"%s\" from /country qualifier \"%s\" is a former country name which is no longer valid.",
2314 tco, val_ptr);
2315 }
2316 }
2317
2318 MemFree(tco);
2319 FTASubSourceAdd(bio, val_ptr, 23);
2320 }
2321
2322 if (dropped)
2323 break;
2324
2325 if (newgen > -1)
2326 bio.SetGenome(newgen);
2327 else if (oldgen > -1)
2328 bio.SetGenome(oldgen);
2329 else if (sfbp->genome != 0)
2330 bio.SetGenome(sfbp->genome);
2331
2332 CheckQualsInSourceFeat(bio, sfbp->quals, taxserver);
2333 fta_sort_biosource(bio);
2334 }
2335
2336 if(sfbp != NULL)
2337 return false;
2338
2339 return true;
2340 }
2341
2342
2343 /**********************************************************/
is_a_space_char(Char c)2344 static bool is_a_space_char(Char c)
2345 {
2346 return c == ' ' || c == '\t';
2347 }
2348
2349 /**********************************************************/
CompareDescrFeatSources(SourceFeatBlkPtr sfbp,const objects::CBioseq & bioseq)2350 static void CompareDescrFeatSources(SourceFeatBlkPtr sfbp, const objects::CBioseq& bioseq)
2351 {
2352 SourceFeatBlkPtr tsfbp;
2353
2354 if(sfbp == NULL || !bioseq.IsSetDescr())
2355 return;
2356
2357 ITERATE(objects::CSeq_descr::Tdata, descr, bioseq.GetDescr().Get())
2358 {
2359 if (!(*descr)->IsSource())
2360 continue;
2361
2362 const objects::CBioSource& bio_src = (*descr)->GetSource();
2363
2364 if (!bio_src.IsSetOrg() || !bio_src.GetOrg().IsSetTaxname() ||
2365 bio_src.GetOrg().GetTaxname().empty())
2366 continue;
2367
2368 const std::string& taxname = bio_src.GetOrg().GetTaxname();
2369 std::string orgdescr;
2370 std::remove_copy_if(taxname.begin(), taxname.end(), std::back_inserter(orgdescr), is_a_space_char);
2371
2372 std::string commdescr;
2373 if (bio_src.GetOrg().IsSetCommon())
2374 {
2375 const std::string& common = bio_src.GetOrg().GetCommon();
2376 std::remove_copy_if(common.begin(), common.end(), std::back_inserter(commdescr), is_a_space_char);
2377 }
2378
2379 for (tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
2380 {
2381 if (tsfbp->name == NULL || tsfbp->name[0] == '\0')
2382 continue;
2383
2384 size_t name_len = strlen(tsfbp->name);
2385 std::string orgfeat;
2386 std::remove_copy_if(tsfbp->name, tsfbp->name + name_len, std::back_inserter(orgfeat), is_a_space_char);
2387
2388 if(StringICmp(orgdescr.c_str(), "unknown") == 0)
2389 {
2390 if(StringICmp(orgdescr.c_str(), orgfeat.c_str()) == 0 ||
2391 (!commdescr.empty() && StringICmp(commdescr.c_str(), orgfeat.c_str()) == 0))
2392 {
2393 break;
2394 }
2395 }
2396 else
2397 {
2398 if (orgdescr == orgfeat || commdescr == orgfeat)
2399 {
2400 break;
2401 }
2402 }
2403 }
2404
2405 if(tsfbp == NULL)
2406 {
2407 ErrPostEx(SEV_ERROR, ERR_ORGANISM_NoSourceFeatMatch,
2408 "Organism name \"%s\" from OS/ORGANISM line does not exist in this record's source features.",
2409 taxname.c_str());
2410 }
2411 }
2412 }
2413
2414 /**********************************************************/
CheckSourceLineage(SourceFeatBlkPtr sfbp,Parser::ESource source,bool is_pat)2415 static bool CheckSourceLineage(SourceFeatBlkPtr sfbp, Parser::ESource source, bool is_pat)
2416 {
2417 const Char* p;
2418 ErrSev sev;
2419
2420 for(; sfbp != NULL; sfbp = sfbp->next)
2421 {
2422 if(!sfbp->lookup || sfbp->bio_src.Empty() || !sfbp->bio_src->IsSetOrg())
2423 continue;
2424
2425 p = NULL;
2426 if (sfbp->bio_src->GetOrg().IsSetOrgname() &&
2427 sfbp->bio_src->GetOrg().GetOrgname().IsSetLineage())
2428 p = sfbp->bio_src->GetOrg().GetOrgname().GetLineage().c_str();
2429
2430 if (p == NULL || *p == '\0')
2431 {
2432 if ((source == Parser::ESource::DDBJ || source == Parser::ESource::EMBL) && is_pat)
2433 sev = SEV_WARNING;
2434 else
2435 sev = SEV_REJECT;
2436 ErrPostEx(sev, ERR_SERVER_NoLineageFromTaxon,
2437 "Taxonomy lookup for organism name \"%s\" yielded an Org-ref that has no lineage.",
2438 sfbp->name);
2439 if(sev == SEV_REJECT)
2440 break;
2441 }
2442 }
2443 if(sfbp == NULL)
2444 return true;
2445 return false;
2446 }
2447
2448 /**********************************************************/
PropogateSuppliedLineage(objects::CBioseq & bioseq,SourceFeatBlkPtr sfbp,Uint1 taxserver)2449 static void PropogateSuppliedLineage(objects::CBioseq& bioseq,
2450 SourceFeatBlkPtr sfbp, Uint1 taxserver)
2451 {
2452 SourceFeatBlkPtr tsfbp;
2453
2454 const Char *p;
2455
2456 if (!bioseq.IsSetDescr() || sfbp == NULL)
2457 return;
2458
2459 for(; sfbp != NULL; sfbp = sfbp->next)
2460 {
2461 if(sfbp->lookup || sfbp->bio_src.Empty() ||
2462 !sfbp->bio_src->IsSetOrg() || !sfbp->bio_src->GetOrg().IsSetTaxname() ||
2463 sfbp->name == NULL || *sfbp->name == '\0' ||
2464 sfbp->bio_src->GetOrg().GetTaxname().empty())
2465 continue;
2466
2467 objects::COrgName& orgname = sfbp->bio_src->SetOrg().SetOrgname();
2468
2469 if (orgname.IsSetLineage())
2470 {
2471 if (!orgname.GetLineage().empty())
2472 continue;
2473
2474 orgname.ResetLineage();
2475 }
2476
2477 const std::string& taxname = sfbp->bio_src->GetOrg().GetTaxname();
2478 std::string lineage;
2479
2480 bool found = false;
2481 ITERATE(objects::CSeq_descr::Tdata, descr, bioseq.GetDescr().Get())
2482 {
2483 if (!(*descr)->IsSource())
2484 continue;
2485
2486 const objects::CBioSource& bio_src = (*descr)->GetSource();
2487
2488 if (!bio_src.IsSetOrg() || !bio_src.GetOrg().IsSetOrgname() ||
2489 !bio_src.GetOrg().IsSetTaxname() || bio_src.GetOrg().GetTaxname().empty() ||
2490 !bio_src.GetOrg().GetOrgname().IsSetLineage())
2491 continue;
2492
2493 lineage = bio_src.GetOrg().GetOrgname().GetLineage();
2494 const std::string& cur_taxname = bio_src.GetOrg().GetTaxname();
2495
2496 if (StringICmp(cur_taxname.c_str(), taxname.c_str()) == 0)
2497 {
2498 found = true;
2499 break;
2500 }
2501 }
2502
2503 if (!found)
2504 {
2505 ErrPostEx((taxserver == 0) ? SEV_INFO : SEV_WARNING,
2506 ERR_ORGANISM_UnclassifiedLineage,
2507 "Taxonomy lookup for organism name \"%s\" failed, and no matching organism exists in OS/ORGANISM lines, so lineage has been set to \"Unclassified\".",
2508 taxname.c_str());
2509 p = "Unclassified";
2510 }
2511 else
2512 {
2513 if (lineage.empty())
2514 {
2515 ErrPostEx((taxserver == 0) ? SEV_INFO : SEV_WARNING,
2516 ERR_ORGANISM_UnclassifiedLineage,
2517 "Taxonomy lookup for organism name \"%s\" failed, and the matching organism from OS/ORGANISM lines has no lineage, so lineage has been set to \"Unclassified\".",
2518 taxname.c_str());
2519 p = "Unclassified";
2520 }
2521 else
2522 p = lineage.c_str();
2523 }
2524
2525 orgname.SetLineage(p);
2526 for(tsfbp = sfbp->next; tsfbp != NULL; tsfbp = tsfbp->next)
2527 {
2528 if (tsfbp->lookup || tsfbp->bio_src.Empty() ||
2529 !tsfbp->bio_src->IsSetOrg() || !tsfbp->bio_src->GetOrg().IsSetTaxname() ||
2530 tsfbp->name == NULL || *tsfbp->name == '\0' ||
2531 tsfbp->bio_src->GetOrg().GetTaxname().empty() ||
2532 StringICmp(sfbp->name, tsfbp->name) != 0)
2533
2534 continue;
2535
2536 objects::COrgName& torgname = tsfbp->bio_src->SetOrg().SetOrgname();
2537
2538 if (torgname.IsSetLineage())
2539 {
2540 if (!torgname.GetLineage().empty())
2541 continue;
2542 }
2543 torgname.SetLineage(p);
2544 }
2545 }
2546 }
2547
2548 /**********************************************************/
CheckMoltypeConsistency(SourceFeatBlkPtr sfbp,char ** moltype)2549 static bool CheckMoltypeConsistency(SourceFeatBlkPtr sfbp, char** moltype)
2550 {
2551 SourceFeatBlkPtr tsfbp;
2552 char* name;
2553 char* p;
2554 bool ret;
2555 Char ch;
2556
2557 if(sfbp == NULL)
2558 return true;
2559
2560 for(tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
2561 if(tsfbp->moltype != NULL)
2562 break;
2563
2564 if(tsfbp == NULL)
2565 return true;
2566
2567 name = tsfbp->moltype;
2568 for(ret = true, tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
2569 {
2570 if(tsfbp->moltype == NULL)
2571 {
2572 ch = '\0';
2573 p = tsfbp->location;
2574 if(p != NULL && StringLen(p) > 50)
2575 {
2576 ch = p[50];
2577 p[50] = '\0';
2578 }
2579 ErrPostEx(SEV_ERROR, ERR_SOURCE_MissingMolType,
2580 "Source feature at \"%s\" lacks a /mol_type qualifier.",
2581 (p == NULL) ? "<empty>" : p);
2582 if(ch != '\0')
2583 p[50] = ch;
2584 }
2585 else if(ret && StringCmp(name, tsfbp->moltype) != 0)
2586 ret = false;
2587 }
2588
2589 if(ret)
2590 *moltype = StringSave(name);
2591
2592 return(ret);
2593 }
2594
2595 /**********************************************************/
CheckForENV(SourceFeatBlkPtr sfbp,IndexblkPtr ibp,Parser::ESource source)2596 static bool CheckForENV(SourceFeatBlkPtr sfbp, IndexblkPtr ibp, Parser::ESource source)
2597 {
2598 const char **b;
2599
2600 char* location;
2601 Int4 sources;
2602 Int4 envs;
2603 Char ch;
2604
2605 if(sfbp == NULL || ibp == NULL)
2606 return true;
2607
2608 bool skip = false;
2609 location = NULL;
2610 ibp->env_sample_qual = false;
2611 for(envs = 0, sources = 0; sfbp != NULL; sfbp = sfbp->next, sources++)
2612 {
2613 bool env_found = false;
2614 ITERATE(TQualVector, cur, sfbp->quals)
2615 {
2616 if ((*cur)->IsSetQual() && (*cur)->GetQual() == "environmental_sample")
2617 {
2618 env_found = true;
2619 break;
2620 }
2621 }
2622 if (env_found)
2623 envs++;
2624 else
2625 location = sfbp->location;
2626
2627 if(!sfbp->full || sfbp->name == NULL || sfbp->name[0] == '\0')
2628 continue;
2629
2630 for (b = special_orgs; *b != NULL; b++)
2631 {
2632 if (StringICmp(*b, sfbp->name) == 0)
2633 break;
2634 }
2635 if(*b != NULL)
2636 skip = true;
2637 }
2638
2639 if(envs > 0)
2640 {
2641 ibp->env_sample_qual = true;
2642 if(!skip && envs != sources)
2643 {
2644 if(location != NULL && StringLen(location) > 50)
2645 {
2646 ch = location[50];
2647 location[50] = '\0';
2648 }
2649 else
2650 ch = '\0';
2651 ErrPostEx(SEV_REJECT, ERR_SOURCE_InconsistentEnvSampQual,
2652 "Inconsistent /environmental_sample qualifier usage. Source feature at location \"%s\" lacks the qualifier.",
2653 (location == NULL) ? "unknown" : location);
2654 if(ch != '\0')
2655 location[50] = ch;
2656 return false;
2657 }
2658 }
2659 else if(StringICmp(ibp->division, "ENV") == 0)
2660 {
2661 if(source == Parser::ESource::EMBL)
2662 ErrPostEx(SEV_ERROR, ERR_SOURCE_MissingEnvSampQual,
2663 "This ENV division record has source features that lack the /environmental_sample qualifier. It will not be placed in the ENV division until the qualifier is added.");
2664 else
2665 {
2666 ErrPostEx(SEV_REJECT, ERR_SOURCE_MissingEnvSampQual,
2667 "This ENV division record has source features that lack the /environmental_sample qualifier.");
2668 return false;
2669 }
2670 }
2671 return true;
2672 }
2673
2674 /**********************************************************/
CheckPcrPrimersTag(char * str)2675 static char* CheckPcrPrimersTag(char* str)
2676 {
2677 if(StringNCmp(str, "fwd_name", 8) == 0 ||
2678 StringNCmp(str, "rev_name", 8) == 0)
2679 str += 8;
2680 else if(StringNCmp(str, "fwd_seq", 7) == 0 ||
2681 StringNCmp(str, "rev_seq", 7) == 0)
2682 str += 7;
2683 else
2684 return(NULL);
2685
2686 if(*str == ' ')
2687 str++;
2688 if(*str == ':')
2689 return(str + 1);
2690 return(NULL);
2691 }
2692
2693 /**********************************************************/
PopulatePcrPrimers(objects::CBioSource & bio,PcrPrimersPtr ppp,Int4 count)2694 static void PopulatePcrPrimers(objects::CBioSource& bio, PcrPrimersPtr ppp, Int4 count)
2695 {
2696 PcrPrimersPtr tppp;
2697
2698 char* str_fs;
2699 char* str_rs;
2700 char* str_fn;
2701 char* str_rn;
2702 Int4 num_fn;
2703 Int4 num_rn;
2704
2705 if (ppp == NULL || count < 1)
2706 return;
2707
2708 objects::CBioSource::TSubtype& subs = bio.SetSubtype();
2709 CRef<objects::CSubSource> sub;
2710
2711 if (count == 1)
2712 {
2713 sub.Reset(new objects::CSubSource);
2714 sub->SetSubtype(33);
2715 sub->SetName(ppp->fwd_seq);
2716 subs.push_back(sub);
2717
2718 sub.Reset(new objects::CSubSource);
2719 sub->SetSubtype(34);
2720 sub->SetName(ppp->rev_seq);
2721 subs.push_back(sub);
2722
2723 if(ppp->fwd_name != NULL && ppp->fwd_name[0] != '\0')
2724 {
2725 sub.Reset(new objects::CSubSource);
2726 sub->SetSubtype(35);
2727 sub->SetName(ppp->fwd_name);
2728 subs.push_back(sub);
2729 }
2730
2731 if(ppp->rev_name != NULL && ppp->rev_name[0] != '\0')
2732 {
2733 sub.Reset(new objects::CSubSource);
2734 sub->SetSubtype(36);
2735 sub->SetName(ppp->rev_name);
2736 subs.push_back(sub);
2737 }
2738 return;
2739 }
2740
2741 size_t len_fs = 2,
2742 len_rs = 2,
2743 len_fn = 0,
2744 len_rn = 0;
2745 num_fn = 0;
2746 num_rn = 0;
2747 for(tppp = ppp; tppp != NULL; tppp = tppp->next)
2748 {
2749 len_fs += (StringLen(tppp->fwd_seq) + 1);
2750 len_rs += (StringLen(tppp->rev_seq) + 1);
2751 if(tppp->fwd_name != NULL && tppp->fwd_name[0] != '\0')
2752 {
2753 len_fn += (StringLen(tppp->fwd_name) + 1);
2754 num_fn++;
2755 }
2756 if(tppp->rev_name != NULL && tppp->rev_name[0] != '\0')
2757 {
2758 len_rn += (StringLen(tppp->rev_name) + 1);
2759 num_rn++;
2760 }
2761 }
2762
2763 str_fs = (char*) MemNew(len_fs);
2764 str_rs = (char*) MemNew(len_rs);
2765 str_fn = (len_fn == 0) ? NULL : (char*) MemNew(len_fn + count -
2766 num_fn + 2);
2767 str_rn = (len_rn == 0) ? NULL : (char*) MemNew(len_rn + count -
2768 num_rn + 2);
2769
2770 for(tppp = ppp; tppp != NULL; tppp = tppp->next)
2771 {
2772 StringCat(str_fs, ",");
2773 StringCat(str_fs, tppp->fwd_seq);
2774 StringCat(str_rs, ",");
2775 StringCat(str_rs, tppp->rev_seq);
2776 if(str_fn != NULL)
2777 {
2778 StringCat(str_fn, ",");
2779 if(tppp->fwd_name != NULL && tppp->fwd_name[0] != '\0')
2780 StringCat(str_fn, tppp->fwd_name);
2781 }
2782 if(str_rn != NULL)
2783 {
2784 StringCat(str_rn, ",");
2785 if(tppp->rev_name != NULL && tppp->rev_name[0] != '\0')
2786 StringCat(str_rn, tppp->rev_name);
2787 }
2788 }
2789
2790 str_fs[0] = '(';
2791 StringCat(str_fs, ")");
2792
2793 sub.Reset(new objects::CSubSource);
2794 sub->SetSubtype(33);
2795 sub->SetName(str_fs);
2796 subs.push_back(sub);
2797
2798 str_rs[0] = '(';
2799 StringCat(str_rs, ")");
2800
2801 sub.Reset(new objects::CSubSource);
2802 sub->SetSubtype(34);
2803 sub->SetName(str_rs);
2804 subs.push_back(sub);
2805
2806 if(str_fn != NULL)
2807 {
2808 str_fn[0] = '(';
2809 StringCat(str_fn, ")");
2810
2811 sub.Reset(new objects::CSubSource);
2812 sub->SetSubtype(35);
2813 sub->SetName(str_fn);
2814 subs.push_back(sub);
2815 }
2816
2817 if(str_rn != NULL)
2818 {
2819 str_rn[0] = '(';
2820 StringCat(str_rn, ")");
2821
2822 sub.Reset(new objects::CSubSource);
2823 sub->SetSubtype(36);
2824 sub->SetName(str_rn);
2825 subs.push_back(sub);
2826 }
2827 }
2828
2829 /**********************************************************/
PcrPrimersFree(PcrPrimersPtr ppp)2830 static void PcrPrimersFree(PcrPrimersPtr ppp)
2831 {
2832 PcrPrimersPtr next;
2833
2834 for(; ppp != NULL; ppp = next)
2835 {
2836 next = ppp->next;
2837 if(ppp->fwd_name != NULL)
2838 MemFree(ppp->fwd_name);
2839 if(ppp->fwd_seq != NULL)
2840 MemFree(ppp->fwd_seq);
2841 if(ppp->rev_name != NULL)
2842 MemFree(ppp->rev_name);
2843 if(ppp->rev_seq != NULL)
2844 MemFree(ppp->rev_seq);
2845 MemFree(ppp);
2846 }
2847 }
2848
2849 /**********************************************************/
/* ParsePcrPrimers: for every source feature that has qualifiers and a
 * BioSource, parses each non-empty /PCR_primers qualifier into one
 * PcrPrimers node, validates it, and hands the resulting list to
 * PopulatePcrPrimers() to be stored in the feature's BioSource.
 *
 * A qualifier value is a comma-separated sequence of "tag: value"
 * fields, where the tag is one recognized by CheckPcrPrimersTag().
 * Repeated fields of the same kind within one qualifier are joined
 * with ':'.
 *
 * Returns true if all source features were processed, false as soon as
 * a qualifier is rejected (a SEV_REJECT message has been posted and the
 * temporary list has been freed).
 */
static bool ParsePcrPrimers(SourceFeatBlkPtr sfbp)
{
    PcrPrimersPtr ppp;                  /* head of the list for the current feature */
    PcrPrimersPtr tppp;                 /* node of the qualifier being parsed */

    char*         p;                    /* start of the current field (its tag) */
    char*         q;                    /* start of the current field's value */
    char*         r;                    /* end of the current field / start of the next */
    char*         s;                    /* scratch buffer for joined values */
    bool          comma;                /* a field value contains a comma */
    bool          bad_start;            /* value does not begin with a known tag */
    bool          empty;                /* a field value is empty */
    Char          ch;                   /* character overwritten by the temporary NUL */
    Int4          count;                /* /PCR_primers qualifiers seen in this feature */
    Int4          prev;                 /* kind of the previously parsed field:
                                           1 = fwd_name, 2 = fwd_seq,
                                           3 = rev_name, 4 = rev_seq,
                                           0 = none yet; negative = error:
                                           -1 = forward field after a reverse one,
                                           -2 = name not followed by its sequence */

    bool got_problem = false;
    for(ppp = NULL; sfbp != NULL; sfbp = sfbp->next)
    {
        if (sfbp->quals.empty() || sfbp->bio_src.Empty())
            continue;

        count = 0;
        ITERATE(TQualVector, cur, sfbp->quals)
        {
            if((*cur)->GetQual() != "PCR_primers" ||
               !(*cur)->IsSetVal() || (*cur)->GetVal().empty())
                continue;

            /* One list node per /PCR_primers qualifier. */
            count++;
            if(ppp == NULL)
            {
                ppp = (PcrPrimersPtr) MemNew(sizeof(PcrPrimers));
                tppp = ppp;
            }
            else
            {
                tppp->next = (PcrPrimersPtr) MemNew(sizeof(PcrPrimers));
                tppp = tppp->next;
            }

            prev = 0;
            /* Work on a NUL-terminated private copy: the parser below
             * writes temporary terminators into the text.
             */
            std::vector<Char> val_buf((*cur)->GetVal().begin(), (*cur)->GetVal().end());
            val_buf.push_back(0);

            for(comma = false, bad_start = false, p = &val_buf[0]; *p != '\0';)
            {
                q = CheckPcrPrimersTag(p);
                if(q == NULL)
                {
                    /* Not at a tag: past the very beginning just move
                     * on one character; at the beginning it is an error.
                     */
                    if (p != &val_buf[0])
                    {
                        p++;
                        continue;
                    }
                    bad_start = true;
                    break;
                }

                if(*q == ' ')
                    q++;
                /* Find the next comma that is followed (after at most
                 * one space) by a recognized tag; r ends up at that tag,
                 * or NULL if this is the last field.
                 */
                for(r = q;;)
                {
                    r = StringChr(r, ',');
                    if(r == NULL)
                        break;
                    if(*++r == ' ')
                        r++;
                    if(CheckPcrPrimersTag(r) != NULL)
                        break;
                }
                if(r != NULL)
                {
                    /* Step back to the separating comma (or to a single
                     * space right before it) and terminate the value
                     * there, remembering the overwritten character.
                     */
                    r--;
                    if(*r == ' ')
                        r--;
                    if(r > q && *(r - 1) == ' ')
                        r--;
                    ch = *r;
                    *r = '\0';
                }

                /* Any comma still inside the value is an embedded one. */
                if(StringChr(q, ',') != NULL)
                    comma = true;

                empty = false;
                if(q == NULL || *q == '\0')
                    empty = true;
                else if(StringNCmp(p, "fwd_name", 8) == 0)
                {
                    if(prev == 1)
                        prev = -2;
                    else if(prev > 2 && prev < 5)
                        prev = -1;
                    else
                    {
                        if(tppp->fwd_name == NULL)
                            tppp->fwd_name = StringSave(q);
                        else
                        {
                            /* Append to the existing names, ':' separated. */
                            s = (char*) MemNew(StringLen(tppp->fwd_name) +
                                               StringLen(q) + 2);
                            StringCpy(s, tppp->fwd_name);
                            StringCat(s, ":");
                            StringCat(s, q);
                            MemFree(tppp->fwd_name);
                            tppp->fwd_name = s;
                        }
                        prev = 1;
                    }
                }
                else if(StringNCmp(p, "fwd_seq", 7) == 0)
                {
                    if(prev > 2 && prev < 5)
                        prev = -1;
                    else
                    {
                        if(tppp->fwd_seq == NULL)
                            tppp->fwd_seq = StringSave(q);
                        else
                        {
                            s = (char*) MemNew(StringLen(tppp->fwd_seq) +
                                               StringLen(q) + 2);
                            StringCpy(s, tppp->fwd_seq);
                            StringCat(s, ":");
                            StringCat(s, q);
                            MemFree(tppp->fwd_seq);
                            tppp->fwd_seq = s;
                            /* An additional sequence that was not
                             * preceded by its own name: add a ':' to the
                             * names as well.
                             */
                            if(prev != 1)
                            {
                                if(tppp->fwd_name == NULL)
                                    tppp->fwd_name = StringSave(":");
                                else
                                {
                                    s = (char*) MemNew(StringLen(tppp->fwd_name) + 2);
                                    StringCpy(s, tppp->fwd_name);
                                    StringCat(s, ":");
                                    MemFree(tppp->fwd_name);
                                    tppp->fwd_name = s;
                                }
                            }
                        }
                        prev = 2;
                    }
                }
                else if(StringNCmp(p, "rev_name", 8) == 0)
                {
                    if(prev == 3 || prev == 1)
                        prev = -2;
                    else
                    {
                        if(tppp->rev_name == NULL)
                            tppp->rev_name = StringSave(q);
                        else
                        {
                            s = (char*) MemNew(StringLen(tppp->rev_name) +
                                               StringLen(q) + 2);
                            StringCpy(s, tppp->rev_name);
                            StringCat(s, ":");
                            StringCat(s, q);
                            MemFree(tppp->rev_name);
                            tppp->rev_name = s;
                        }
                        prev = 3;
                    }
                }
                else
                {
                    /* The only remaining tag is rev_seq. */
                    if(prev == 1)
                        prev = -2;
                    else
                    {
                        if(tppp->rev_seq == NULL)
                            tppp->rev_seq = StringSave(q);
                        else
                        {
                            s = (char*) MemNew(StringLen(tppp->rev_seq) +
                                               StringLen(q) + 2);
                            StringCpy(s, tppp->rev_seq);
                            StringCat(s, ":");
                            StringCat(s, q);
                            MemFree(tppp->rev_seq);
                            tppp->rev_seq = s;
                            /* Same ':' padding as for the forward names. */
                            if(prev != 3)
                            {
                                if(tppp->rev_name == NULL)
                                    tppp->rev_name = StringSave(":");
                                else
                                {
                                    s = (char*) MemNew(StringLen(tppp->rev_name) + 2);
                                    StringCpy(s, tppp->rev_name);
                                    StringCat(s, ":");
                                    MemFree(tppp->rev_name);
                                    tppp->rev_name = s;
                                }
                            }
                        }
                        prev = 4;
                    }
                }

                /* Last field of the qualifier. */
                if(r == NULL)
                    break;

                /* Restore the character replaced by the terminator. */
                *r++ = ch;

                if(comma || prev < 0 || empty)
                    break;

                /* Skip the comma (when the terminator sat on the space
                 * before it) and one following space to reach the next
                 * tag.
                 */
                if(ch == ' ')
                    r++;
                if(*r == ' ')
                    r++;
                p = r;
            }

            /* The qualifier ended on a name with no sequence after it. */
            if(prev == 1 || prev == 3)
                prev = -2;

            if(bad_start)
            {
                ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidPCRprimer,
                          "Unknown text found at the beginning of /PCR_primers qualifier: \"%s\". Entry dropped.",
                          &val_buf[0]);
                got_problem = true;
                break;
            }

            if(comma)
            {
                ErrPostEx(SEV_REJECT, ERR_QUALIFIER_PCRprimerEmbeddedComma,
                          "Encountered embedded comma within /PCR_primers qualifier's field value: \"%s\". Entry dropped.",
                          &val_buf[0]);
                got_problem = true;
                break;
            }

            if(prev == -1)
            {
                ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidPCRprimer,
                          "Encountered incorrect order of \"forward\" and \"reversed\" sequences within /PCR_primers qualifier: \"%s\". Entry dropped.",
                          &val_buf[0]);
                got_problem = true;
                break;
            }

            if(prev == -2)
            {
                ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MissingPCRprimerSeq,
                          "/PCR_primers qualifier \"%s\" is missing or has an empty required fwd_seq or rev_seq fields (or both). Entry dropped.",
                          &val_buf[0]);
                got_problem = true;
                break;
            }

            if(empty)
            {
                ErrPostEx(SEV_REJECT, ERR_QUALIFIER_InvalidPCRprimer,
                          "/PCR_primers qualifier \"%s\" has an empty field value. Entry dropped.",
                          &val_buf[0]);
                got_problem = true;
                break;
            }

            /* Both sequences are mandatory in every qualifier. */
            if(tppp->fwd_seq == NULL || tppp->fwd_seq[0] == '\0' ||
               tppp->rev_seq == NULL || tppp->rev_seq[0] == '\0')
            {
                ErrPostEx(SEV_REJECT, ERR_QUALIFIER_MissingPCRprimerSeq,
                          "/PCR_primers qualifier \"%s\" is missing or has an empty required fwd_seq or rev_seq fields (or both). Entry dropped.",
                          &val_buf[0]);
                got_problem = true;
                break;
            }
        }

        if (got_problem)
        {
            PcrPrimersFree(ppp);
            break;
        }

        /* Store the parsed primers, then start a fresh list for the
         * next source feature.
         */
        PopulatePcrPrimers(*sfbp->bio_src, ppp, count);
        PcrPrimersFree(ppp);
        ppp = NULL;
    }

    /* sfbp is non-NULL only when the loop was left because of a problem. */
    if(sfbp == NULL)
        return true;
    return false;
}
3141
3142 /**********************************************************/
CheckCollectionDate(SourceFeatBlkPtr sfbp,Parser::ESource source)3143 static void CheckCollectionDate(SourceFeatBlkPtr sfbp, Parser::ESource source)
3144 {
3145 const char *Mmm[] = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul",
3146 "Aug", "Sep", "Oct", "Nov", "Dec", NULL};
3147 const char **b;
3148 const char *q;
3149
3150 char* p;
3151 char* r;
3152 char* val;
3153 Int4 year;
3154 Int4 month;
3155 Int4 day;
3156 Int4 bad;
3157 Int4 num_slash;
3158 Int4 num_T;
3159 Int4 num_colon;
3160 Int4 num_Z;
3161 Int4 len;
3162
3163 CTime time(CTime::eCurrent);
3164 objects::CDate_std date(time);
3165
3166 for(; sfbp != NULL; sfbp = sfbp->next)
3167 {
3168 if (sfbp->quals.empty() || sfbp->bio_src.Empty())
3169 continue;
3170
3171 ITERATE(TQualVector, cur, sfbp->quals)
3172 {
3173 bad = 0;
3174 if ((*cur)->GetQual() != "collection_date" ||
3175 !(*cur)->IsSetVal() || (*cur)->GetVal().empty())
3176 continue;
3177
3178 val = (char *) (*cur)->GetVal().c_str();
3179 for(num_slash = 0, p = val; *p != '\0'; p++)
3180 if(*p == '/')
3181 num_slash++;
3182
3183 if(num_slash > 1)
3184 {
3185 p = StringSave(sfbp->location);
3186 if(p != NULL && StringLen(p) > 50)
3187 p[50] = '\0';
3188 ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidCollectionDate,
3189 "/collection_date \"%s\" for source feature at \"%s\" has too many components.",
3190 val, (p == NULL) ? "unknown location" : p);
3191 if(p != NULL)
3192 MemFree(p);
3193 continue;
3194 }
3195
3196 for(val = (char *) (*cur)->GetVal().c_str();;)
3197 {
3198 r = StringChr(val, '/');
3199 if(r != NULL)
3200 *r = '\0';
3201
3202 len = StringLen(val);
3203
3204 if(len == 4)
3205 {
3206 for(q = val; *q == '0';)
3207 q++;
3208 for(p = (char*) q; *p != '\0'; p++)
3209 if(*p < '0' || *p > '9')
3210 break;
3211 if(*p != '\0')
3212 bad = 1;
3213 else if (atoi(q) > date.GetYear())
3214 bad = 3;
3215 }
3216 else if(len == 8)
3217 {
3218 if(val[3] != '-')
3219 bad = 1;
3220 else
3221 {
3222 p = val;
3223 p[3] = '\0';
3224 if(source == Parser::ESource::DDBJ)
3225 {
3226 if(p[0] >= 'a' && p[0] <= 'z')
3227 p[0] &= ~040;
3228 if(p[1] >= 'A' && p[1] <= 'Z')
3229 p[1] |= 040;
3230 if(p[2] >= 'A' && p[2] <= 'Z')
3231 p[2] |= 040;
3232 }
3233 for(b = Mmm, month = 1; *b != NULL; b++, month++)
3234 if(StringCmp(*b, p) == 0)
3235 break;
3236 if(*b == NULL)
3237 bad = 1;
3238 p[3] = '-';
3239 }
3240 if(bad == 0)
3241 {
3242 for(q = val + 4; *q == '0';)
3243 q++;
3244 for(p = (char*) q; *p != '\0'; p++)
3245 if(*p < '0' || *p > '9')
3246 break;
3247 if(*p != '\0')
3248 bad = 1;
3249 else
3250 {
3251 year = atoi(q);
3252 if(year > date.GetYear() ||
3253 (year == date.GetYear() && month > date.GetMonth()))
3254 bad = 3;
3255 }
3256 }
3257 }
3258 else if(len == 11)
3259 {
3260 if(val[2] != '-' || val[6] != '-')
3261 bad = 1;
3262 else
3263 {
3264 p = val;
3265 val[2] = '\0';
3266 val[6] = '\0';
3267 if(p[0] < '0' || p[0] > '3' || p[1] < '0' || p[1] > '9')
3268 bad = 1;
3269 else
3270 {
3271 if(*p == '0')
3272 p++;
3273 day = atoi(p);
3274 p = val + 3;
3275 if(source == Parser::ESource::DDBJ)
3276 {
3277 if(p[0] >= 'a' && p[0] <= 'z')
3278 p[0] &= ~040;
3279 if(p[1] >= 'A' && p[1] <= 'Z')
3280 p[1] |= 040;
3281 if(p[2] >= 'A' && p[2] <= 'Z')
3282 p[2] |= 040;
3283 }
3284 for(b = Mmm, month = 1; *b != NULL; b++, month++)
3285 if(StringCmp(*b, p) == 0)
3286 break;
3287 if(*b == NULL)
3288 bad = 1;
3289 else
3290 {
3291 if(day < 1 || day > 31)
3292 bad = 2;
3293 else if(month == 2 && day > 29)
3294 bad = 2;
3295 else if((month == 4 || month == 6 || month == 9 || month == 11) && day > 30)
3296 bad = 2;
3297 }
3298 }
3299 if(bad == 0)
3300 {
3301 for(q = val + 7; *q == '0';)
3302 q++;
3303 for(p = (char*) q; *p != '\0'; p++)
3304 if(*p < '0' || *p > '9')
3305 break;
3306 if(*p != '\0')
3307 bad = 1;
3308 else
3309 {
3310 year = atoi(q) - 1900;
3311 if(year > date.GetYear() ||
3312 (year == date.GetYear() && month > date.GetMonth()) ||
3313 (year == date.GetYear() && month == date.GetMonth() && day > date.GetDay()))
3314 bad = 3;
3315 }
3316 }
3317 val[2] = '-';
3318 val[6] = '-';
3319 }
3320 }
3321 else if(len == 7 || len == 10 || len == 14 || len == 17 ||
3322 len == 20)
3323 {
3324 num_T = 0;
3325 num_Z = 0;
3326 num_colon = 0;
3327 for(p = val; *p != '\0'; p++)
3328 {
3329 if((*p < 'a' || *p > 'z') && (*p < 'A' || *p > 'Z') &&
3330 (*p < '0' || *p > '9') && *p != '-' && *p != '/' &&
3331 *p != ':')
3332 {
3333 bad = 3;
3334 break;
3335 }
3336 if(*p == ':')
3337 num_colon++;
3338 else if(*p == 'T')
3339 num_T++;
3340 else if(*p == 'Z')
3341 num_Z++;
3342 }
3343 if(len == 7 || len == 10)
3344 {
3345 if(num_T > 0)
3346 bad = 4;
3347 if(num_Z > 0)
3348 bad = 5;
3349 if(num_colon > 0)
3350 bad = 6;
3351 }
3352 else
3353 {
3354 if(num_Z > 1)
3355 bad = 5;
3356 if(num_T > 1)
3357 bad = 4;
3358 if((len == 14 && num_colon > 0) ||
3359 (len == 17 && num_colon > 1) ||
3360 (len == 20 && num_colon > 2))
3361 bad = 6;
3362 }
3363 }
3364 else
3365 bad = 8;
3366
3367 if(bad == 0)
3368 {
3369 if(r == NULL)
3370 break;
3371
3372 *r = '/';
3373 val = r + 1;
3374 continue;
3375 }
3376
3377 p = StringSave(sfbp->location);
3378 if(p != NULL && StringLen(p) > 50)
3379 p[50] = '\0';
3380 if(bad == 1)
3381 q = "is not of the format DD-Mmm-YYYY, Mmm-YYYY, or YYYY";
3382 else if(bad == 2)
3383 q = "has an illegal day value for the stated month";
3384 else if(bad == 3)
3385 q = "has invalid characters";
3386 else if(bad == 4)
3387 q = "has too many time values";
3388 else if(bad == 5)
3389 q = "has too many Zulu indicators";
3390 else if(bad == 6)
3391 q = "has too many hour and minute delimiters";
3392 else
3393 q = "has not yet occured";
3394 ErrPostEx(SEV_ERROR, ERR_SOURCE_InvalidCollectionDate,
3395 "/collection_date \"%s\" for source feature at \"%s\" %s.",
3396 val, (p == NULL) ? "unknown location" : p, q);
3397 if(p != NULL)
3398 MemFree(p);
3399
3400 if(r == NULL)
3401 break;
3402
3403 *r = '/';
3404 val = r + 1;
3405 }
3406 }
3407 }
3408 }
3409
3410 /**********************************************************/
CheckNeedSYNFocus(SourceFeatBlkPtr sfbp)3411 static bool CheckNeedSYNFocus(SourceFeatBlkPtr sfbp)
3412 {
3413 const char **b;
3414
3415 if(sfbp == NULL || sfbp->next == NULL)
3416 return false;
3417
3418 for(; sfbp != NULL; sfbp = sfbp->next)
3419 {
3420 if(!sfbp->full)
3421 continue;
3422
3423 for(b = special_orgs; *b != NULL; b++)
3424 if(StringICmp(*b, sfbp->name) == 0)
3425 break;
3426
3427 if(*b != NULL)
3428 break;
3429 }
3430
3431 if(sfbp != NULL)
3432 return false;
3433 return true;
3434 }
3435
3436 /**********************************************************/
CheckMetagenome(objects::CBioSource & bio)3437 static void CheckMetagenome(objects::CBioSource& bio)
3438 {
3439 if (!bio.IsSetOrg())
3440 return;
3441
3442 bool metatax = false;
3443 bool metalin = false;
3444
3445 if (bio.IsSetOrgname() && bio.GetOrgname().IsSetLineage() &&
3446 StringStr(bio.GetOrgname().GetLineage().c_str(), "metagenomes") != NULL)
3447 metalin = true;
3448
3449 if (bio.GetOrg().IsSetTaxname() &&
3450 StringStr(bio.GetOrg().GetTaxname().c_str(), "metagenome") != NULL)
3451 metatax = true;
3452
3453 if(!metalin && !metatax)
3454 return;
3455
3456 const Char* taxname = bio.GetOrg().IsSetTaxname() ? bio.GetOrg().GetTaxname().c_str() : NULL;
3457 if (taxname == NULL || taxname[0] == 0)
3458 taxname = "unknown";
3459
3460 if (metalin && metatax)
3461 {
3462 CRef<objects::CSubSource> sub(new objects::CSubSource);
3463 sub->SetSubtype(37);
3464 sub->SetName("");
3465 bio.SetSubtype().push_back(sub);
3466 }
3467 else if(!metalin)
3468 ErrPostEx(SEV_ERROR, ERR_ORGANISM_LineageLacksMetagenome,
3469 "Organism name \"%s\" contains \"metagenome\" but the lineage lacks the \"metagenomes\" classification.",
3470 taxname);
3471 else
3472 ErrPostEx(SEV_ERROR, ERR_ORGANISM_OrgNameLacksMetagenome,
3473 "Lineage includes the \"metagenomes\" classification but organism name \"%s\" lacks \"metagenome\".",
3474 taxname);
3475 }
3476
3477 /**********************************************************/
CheckSubmitterSeqidQuals(SourceFeatBlkPtr sfbp,char * acc)3478 static bool CheckSubmitterSeqidQuals(SourceFeatBlkPtr sfbp, char* acc)
3479 {
3480 SourceFeatBlkPtr tsfbp;
3481 char* ssid;
3482 Int4 count_feat;
3483 Int4 count_qual;
3484
3485 if(sfbp == NULL)
3486 return(true);
3487
3488 count_feat = 0;
3489 count_qual = 0;
3490 for(ssid = NULL, tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
3491 {
3492 count_feat++;
3493 if(tsfbp->submitter_seqid == NULL)
3494 continue;
3495
3496 count_qual++;
3497 if(tsfbp->submitter_seqid[0] == '\0')
3498 {
3499 ErrPostEx(SEV_REJECT, ERR_SOURCE_MultipleSubmitterSeqids,
3500 "Multiple /submitter_seqid qualifiers were encountered within source feature at location \"%s\". Entry dropped.",
3501 (tsfbp->location == NULL) ? "?empty?" : tsfbp->location);
3502 break;
3503 }
3504
3505 if(ssid == NULL)
3506 ssid = tsfbp->submitter_seqid;
3507 else if(StringCmp(ssid, tsfbp->submitter_seqid) != 0)
3508 {
3509 ErrPostEx(SEV_REJECT, ERR_SOURCE_DifferentSubmitterSeqids,
3510 "Different /submitter_seqid qualifiers were encountered amongst source features: \"%s\" and \"%s\" at least. Entry dropped.",
3511 ssid, tsfbp->submitter_seqid);
3512 break;
3513 }
3514 }
3515
3516 if(tsfbp != NULL)
3517 return(false);
3518
3519 if(count_feat == count_qual)
3520 return(true);
3521
3522 ErrPostEx(SEV_REJECT, ERR_SOURCE_LackingSubmitterSeqids,
3523 "One ore more source features are lacking /submitter_seqid qualifiers provided in others. Entry dropped.");
3524 return(false);
3525 }
3526
3527 /**********************************************************/
ParseSourceFeat(ParserPtr pp,DataBlkPtr dbp,TSeqIdList & seqids,Int2 type,objects::CBioseq & bioseq,TSeqFeatList & seq_feats)3528 void ParseSourceFeat(ParserPtr pp, DataBlkPtr dbp, TSeqIdList& seqids,
3529 Int2 type, objects::CBioseq& bioseq, TSeqFeatList& seq_feats)
3530 {
3531 SourceFeatBlkPtr sfbp;
3532 SourceFeatBlkPtr tsfbp;
3533
3534 MinMaxPtr mmp;
3535 IndexblkPtr ibp;
3536 char* res;
3537 char* acc;
3538 char* p;
3539 Int4 i;
3540 Int4 use_what = USE_ALL;
3541 bool err;
3542 ErrSev sev;
3543 bool need_focus;
3544 bool already;
3545
3546 ibp = pp->entrylist[pp->curindx];
3547 acc = ibp->acnum;
3548 size_t len = ibp->bases;
3549
3550 if(ibp->segnum < 2)
3551 pp->errstat = 0;
3552
3553 sfbp = CollectSourceFeats(dbp, type);
3554 if(sfbp == NULL)
3555 {
3556 ErrPostEx(SEV_REJECT, ERR_SOURCE_FeatureMissing,
3557 "Required source feature is missing. Entry dropped.");
3558 return;
3559 }
3560
3561 RemoveSourceFeatSpaces(sfbp);
3562 CheckForExemption(sfbp);
3563
3564 if(!CheckSourceFeatLocFuzz(sfbp))
3565 {
3566 SourceFeatBlkSetFree(sfbp);
3567 return;
3568 }
3569
3570 res = CheckSourceFeatLocAccs(sfbp, acc);
3571 if(res != NULL)
3572 {
3573 ErrPostEx(SEV_REJECT, ERR_SOURCE_BadLocation,
3574 "Source feature location points to another record: \"%s\". Entry dropped.",
3575 res);
3576 SourceFeatBlkSetFree(sfbp);
3577 return;
3578 }
3579
3580 if(!SourceFeatStructFillIn(ibp, sfbp, use_what))
3581 {
3582 ErrPostEx(SEV_REJECT, ERR_SOURCE_MultipleMolTypes,
3583 "Multiple /mol_type qualifiers were encountered within source feature. Entry dropped.");
3584 SourceFeatBlkSetFree(sfbp);
3585 return;
3586 }
3587
3588 if(ibp->submitter_seqid && !CheckSubmitterSeqidQuals(sfbp, acc))
3589 {
3590 MemFree(ibp->submitter_seqid);
3591 ibp->submitter_seqid = NULL;
3592 SourceFeatBlkSetFree(sfbp);
3593 return;
3594 }
3595
3596 if(!CheckMoltypeConsistency(sfbp, &ibp->moltype))
3597 {
3598 ErrPostEx(SEV_REJECT, ERR_SOURCE_InconsistentMolType,
3599 "Inconsistent /mol_type qualifiers were encountered. Entry dropped.");
3600 SourceFeatBlkSetFree(sfbp);
3601 return;
3602 }
3603
3604 res = CheckSourceFeatFocusAndTransposon(sfbp);
3605 if(res != NULL)
3606 {
3607 ErrPostEx(SEV_REJECT, ERR_SOURCE_FocusAndTransposonNotAllowed,
3608 "/transposon (or /insertion_seq) qualifiers should not be used in conjunction with /focus. Source feature at \"%s\". Entry dropped.",
3609 res);
3610 SourceFeatBlkSetFree(sfbp);
3611 return;
3612 }
3613
3614 res = CheckSourceFeatOrgs(sfbp, &i);
3615 if(res != NULL)
3616 {
3617 if(i == 1)
3618 {
3619 ErrPostEx(SEV_REJECT, ERR_SOURCE_NoOrganismQual,
3620 "/organism qualifier contains only organell/genome name. No genus/species present. Source feature at \"%s\". Entry dropped.",
3621 res);
3622 }
3623 else
3624 {
3625 ErrPostEx(SEV_REJECT, ERR_SOURCE_OrganismIncomplete,
3626 "Required /organism qualifier is containing genome info only at \"%s\". Entry dropped.",
3627 res);
3628 }
3629 SourceFeatBlkSetFree(sfbp);
3630 return;
3631 }
3632
3633 CompareDescrFeatSources(sfbp, bioseq);
3634
3635 CreateRawBioSources(pp, sfbp, use_what);
3636
3637 if(!CheckSourceLineage(sfbp, pp->source, ibp->is_pat))
3638 {
3639 SourceFeatBlkSetFree(sfbp);
3640 return;
3641 }
3642
3643 PropogateSuppliedLineage(bioseq, sfbp, pp->taxserver);
3644
3645 mmp = (MinMaxPtr) MemNew(sizeof(MinMax));
3646 mmp->orgname = NULL;
3647 mmp->min = 0;
3648 mmp->max = 0;
3649 mmp->skip = false;
3650 i = CheckSourceFeatCoverage(sfbp, mmp, len);
3651 if(i != 0)
3652 {
3653 if(i == 1)
3654 {
3655 ErrPostEx(SEV_REJECT, ERR_SOURCE_IncompleteCoverage,
3656 "Supplied source features do not span every base of the sequence. Entry dropped.");
3657 }
3658 else
3659 {
3660 ErrPostEx(SEV_REJECT, ERR_SOURCE_ExcessCoverage,
3661 "Sequence is spanned by too many source features. Entry dropped.");
3662 }
3663 SourceFeatBlkSetFree(sfbp);
3664 MinMaxFree(mmp);
3665 return;
3666 }
3667
3668 if(!CheckForENV(sfbp, ibp, pp->source))
3669 {
3670 SourceFeatBlkSetFree(sfbp);
3671 MinMaxFree(mmp);
3672 return;
3673 }
3674
3675 if(!CheckSYNTGNDivision(sfbp, ibp->division))
3676 {
3677 SourceFeatBlkSetFree(sfbp);
3678 MinMaxFree(mmp);
3679 return;
3680 }
3681
3682 if(pp->source == Parser::ESource::EMBL)
3683 need_focus = CheckNeedSYNFocus(sfbp);
3684 else
3685 need_focus = true;
3686
3687 already = false;
3688 i = CheckTransgenicSourceFeats(sfbp);
3689 if(i == 5)
3690 {
3691 if(pp->source == Parser::ESource::DDBJ || pp->source == Parser::ESource::EMBL)
3692 sev = SEV_WARNING;
3693 else
3694 sev = SEV_ERROR;
3695 ErrPostEx(sev, ERR_SOURCE_TransSingleOrgName,
3696 "Use of /transgenic requires at least two source features with differences among /organism, /strain, /organelle, and /isolate, between the host and foreign organisms.");
3697 }
3698 else if(i > 0)
3699 {
3700 sev = SEV_REJECT;
3701 if(i == 1)
3702 {
3703 ErrPostEx(sev, ERR_SOURCE_TransgenicTooShort,
3704 "Source feature with /transgenic qualifier does not span the entire sequence. Entry dropped.");
3705 }
3706 else if(i == 2)
3707 {
3708 ErrPostEx(sev, ERR_SOURCE_FocusAndTransgenicQuals,
3709 "Both /focus and /transgenic qualifiers exist; these quals are mutually exclusive. Entry dropped.");
3710 }
3711 else if(i == 3)
3712 {
3713 ErrPostEx(sev, ERR_SOURCE_MultipleTransgenicQuals,
3714 "Multiple source features have /transgenic qualifiers. Entry dropped.");
3715 }
3716 else
3717 {
3718 already = true;
3719 if(!need_focus)
3720 sev = SEV_ERROR;
3721 ErrPostEx(sev, ERR_SOURCE_FocusQualMissing,
3722 "Multiple organism names exist, but no source feature has a /focus qualifier.%s",
3723 (sev == SEV_ERROR) ? "" : " Entry dropped.");
3724 }
3725
3726 if(sev == SEV_REJECT)
3727 {
3728 SourceFeatBlkSetFree(sfbp);
3729 MinMaxFree(mmp);
3730 return;
3731 }
3732 }
3733
3734 res = CheckWholeSourcesVersusFocused(sfbp);
3735 if(res != NULL)
3736 {
3737 ErrPostEx(SEV_REJECT, ERR_SOURCE_FocusQualNotFullLength,
3738 "/focus qualifier should be used for the full-length source feature, not on source feature at \"%s\".",
3739 res);
3740 SourceFeatBlkSetFree(sfbp);
3741 MinMaxFree(mmp);
3742 return;
3743 }
3744 i = CheckFocusInOrgs(sfbp, len, &pp->errstat);
3745 if(pp->errstat != 0 && (ibp->segnum == 0 || pp->errstat == ibp->segtotal))
3746 i = 1;
3747 if(i > 0)
3748 {
3749 sev = SEV_REJECT;
3750 if(i == 1)
3751 {
3752 ErrPostEx(sev, ERR_SOURCE_FocusQualNotNeeded,
3753 "/focus qualifier present, but only one organism name exists. Entry dropped.");
3754 }
3755 else if(i == 2)
3756 {
3757 ErrPostEx(sev, ERR_SOURCE_MultipleOrganismWithFocus,
3758 "/focus qualifiers exist on source features with differing organism names. Entry dropped.");
3759 }
3760 else
3761 {
3762 if(!need_focus)
3763 sev = SEV_ERROR;
3764 if(!already)
3765 ErrPostEx(sev, ERR_SOURCE_FocusQualMissing,
3766 "Multiple organism names exist, but no source feature has a /focus qualifier.%s",
3767 (sev == SEV_ERROR) ? "" : " Entry dropped.");
3768 }
3769
3770 if(sev == SEV_REJECT)
3771 {
3772 SourceFeatBlkSetFree(sfbp);
3773 MinMaxFree(mmp);
3774 return;
3775 }
3776 }
3777 res = CheckSourceOverlap(mmp->next, len);
3778 MinMaxFree(mmp);
3779 if(res != NULL)
3780 {
3781 ErrPostEx(SEV_REJECT, ERR_SOURCE_MultiOrgOverlap,
3782 "Overlapping source features have different organism names %s. Entry dropped.",
3783 res);
3784 SourceFeatBlkSetFree(sfbp);
3785 MemFree(res);
3786 return;
3787 }
3788
3789 res = CheckForUnusualFullLengthOrgs(sfbp);
3790 if(res != NULL)
3791 {
3792 ErrPostEx(SEV_WARNING, ERR_SOURCE_UnusualOrgName,
3793 "Unusual organism name \"%s\" encountered for full-length source feature.",
3794 res);
3795 }
3796
3797 for(tsfbp = sfbp, i = 0; tsfbp != NULL; tsfbp = tsfbp->next)
3798 i++;
3799 if(i > BIOSOURCES_THRESHOLD)
3800 {
3801 ErrPostEx(SEV_WARNING, ERR_SOURCE_ManySourceFeats,
3802 "This record has more than %d source features.",
3803 BIOSOURCES_THRESHOLD);
3804 }
3805
3806 if(!ParsePcrPrimers(sfbp))
3807 {
3808 SourceFeatBlkSetFree(sfbp);
3809 return;
3810 }
3811
3812 CheckCollectionDate(sfbp, pp->source);
3813
3814 sfbp = PickTheDescrSource(sfbp);
3815 if(sfbp == NULL || !UpdateRawBioSource(sfbp, pp->source, ibp, pp->taxserver))
3816 {
3817 SourceFeatBlkSetFree(sfbp);
3818 return;
3819 }
3820
3821 if (sfbp->focus)
3822 sfbp->bio_src->SetIs_focus();
3823 else
3824 sfbp->bio_src->ResetIs_focus();
3825
3826
3827 for (tsfbp = sfbp; tsfbp != NULL; tsfbp = tsfbp->next)
3828 {
3829 CheckMetagenome(*tsfbp->bio_src);
3830
3831 CRef<objects::CSeq_feat> feat(new objects::CSeq_feat);
3832 feat->SetData().SetBiosrc(*tsfbp->bio_src);
3833
3834 if(pp->buf != NULL)
3835 MemFree(pp->buf);
3836 pp->buf = NULL;
3837
3838 GetSeqLocation(*feat, tsfbp->location, seqids, &err,
3839 pp, (char*) "source");
3840
3841 if(err)
3842 {
3843 ErrPostEx(SEV_ERROR, ERR_FEATURE_Dropped,
3844 "/source|%s| range check detects problems. Entry dropped.",
3845 tsfbp->location);
3846 break;
3847 }
3848
3849 if (!tsfbp->quals.empty())
3850 {
3851 p = GetTheQualValue(tsfbp->quals, "evidence");
3852 if(p != NULL)
3853 {
3854 if(StringICmp(p, "experimental") == 0)
3855 feat->SetExp_ev(objects::CSeq_feat::eExp_ev_experimental);
3856 else if(StringICmp(p, "not_experimental") == 0)
3857 feat->SetExp_ev(objects::CSeq_feat::eExp_ev_not_experimental);
3858 MemFree(p);
3859 }
3860 }
3861
3862 seq_feats.push_back(feat);
3863 }
3864
3865 SourceFeatBlkSetFree(sfbp);
3866
3867 if(tsfbp != NULL)
3868 seq_feats.clear();
3869 }
3870
3871 END_NCBI_SCOPE
3872