1 /*  valid.c
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name:  valid.c
27 *
28 * Author:  James Ostell
29 *
30 * Version Creation Date: 1/1/94
31 *
32 * $Revision: 6.2010 $
33 *
34 * File Description:  Validation checks on a SeqEntry
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date       Name        Description of modification
39 * -------  ----------  -----------------------------------------------------
40 *
41 *
42 *
43 * ==========================================================================
44 */
45 
/*
 * Module and file name strings for this translation unit, exposed through
 * the THIS_MODULE and THIS_FILE macros. They are defined before the
 * #include lines that follow.
 * NOTE(review): presumably consumed by error-posting macros in the included
 * headers - confirm.
 */
46 static char    *this_module = "valid";
47 
48 #define THIS_MODULE this_module
49 
50 static char    *this_file = __FILE__;
51 
52 #define THIS_FILE this_file
53 
54 #include <ncbi.h>
55 #include <objfdef.h>
56 #include <valid.h>
57 #include <validerr.h>
58 #include <sqnutils.h>
59 #include <gbftdef.h>
60 #include <gbfeat.h>
61 #include <objsub.h>
62 #include <asn2gnbi.h>
63 #include <explore.h>
64 #include <gather.h>
65 #include <subutil.h>
66 #include <tofasta.h>
67 #include <findrepl.h>
68 #include <edutil.h>
69 #define NLM_GENERATED_CODE_PROTO
70 #include <objmacro.h>
71 #include <macroapi.h>
72 #include <objvalid.h>
73 #include <valapi.h>
74 #include "ecnum_specific.inc"
75 #include "ecnum_ambiguous.inc"
76 #include "ecnum_deleted.inc"
77 #include "ecnum_replaced.inc"
78 
79 /*****************************************************************************
80 *
81 *   NOTE: look at all the ValidErr calls with severity=0. Some should be
82 *   bumped up later. Look also for string "PARSER"
83 *
84 *****************************************************************************/
85 
86 
87 
88 #ifdef VAR_ARGS
89 #include <varargs.h>
90 #else
91 #include <stdarg.h>
92 #endif
93 
/*
 * File-scope validator pointer plus prototypes for routines referenced in
 * this file. The definitions of these routines are outside the portion of
 * the file shown here. Routines marked NLM_EXTERN are exported; the static
 * ones are private to this translation unit.
 */
94 static ValidStructPtr globalvsp;        /* for spell checker */
95 
96 NLM_EXTERN void CDECL ValidErr VPROTO ((ValidStructPtr vsp, int severity, int code1, int code2, const char *fmt, ...));
97 static void     ValidateBioseqInst (GatherContextPtr gcp);
98 static void     ValidateBioseqContext (GatherContextPtr gcp);
99 static void     ValidateBioseqSet (GatherContextPtr gcp);
100 static void     ValidateGraphsOnBioseq (GatherContextPtr gcp);
101 static void     ValidateBioseqHist (GatherContextPtr gcp);
102 static void     SpellCheckSeqDescr (GatherContextPtr gcp);
103 NLM_EXTERN void CdTransCheck (ValidStructPtr vsp, SeqFeatPtr sfp);
104 NLM_EXTERN void MrnaTransCheck (ValidStructPtr vsp, SeqFeatPtr sfp);
105 NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp);
106 NLM_EXTERN void ValidateSeqLoc (ValidStructPtr vsp, SeqLocPtr slp, Boolean report_abutting, CharPtr prefix);
107 NLM_EXTERN Boolean PatchBadSequence (BioseqPtr bsp);
108 NLM_EXTERN CharPtr FindIDForEntry (SeqEntryPtr sep, CharPtr buf);
109 NLM_EXTERN void SpellCheckSeqFeat (GatherContextPtr gcp);
110 NLM_EXTERN void SpellCheckString (ValidStructPtr vsp, CharPtr str);
111 NLM_EXTERN void SpliceCheck (ValidStructPtr vsp, SeqFeatPtr sfp);
112 static void     CdConflictCheck (ValidStructPtr vsp, SeqFeatPtr sfp);
113 static void     SpliceCheckEx (ValidStructPtr vsp, SeqFeatPtr sfp, Boolean checkAll);
114 static void     CdsProductIdCheck (ValidStructPtr vsp, SeqFeatPtr sfp);
115 static void     ValidateBioSource (ValidStructPtr vsp, GatherContextPtr gcp, BioSourcePtr biop, SeqFeatPtr sfp, ValNodePtr sdp);
116 static void     ValidatePubdesc (ValidStructPtr vsp, GatherContextPtr gcp, PubdescPtr pdp);
117 static void     LookForMultiplePubs (ValidStructPtr vsp, GatherContextPtr gcp, SeqDescrPtr sdp);
118 static void     ValidateSfpCit (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPtr sfp);
119 static void     ValidateAffil (ValidStructPtr vsp, AffilPtr ap);
/* Accessors for the EC-number text FSAs; NOTE(review): presumably built from the ecnum_*.inc data included above - confirm. */
120 static TextFsaPtr GetSpecificECNumberFSA (ValidStructPtr vsp);
121 static TextFsaPtr GetAmbiguousECNumberFSA (ValidStructPtr vsp);
122 static TextFsaPtr GetDeletedECNumberFSA (ValidStructPtr vsp);
123 static TextFsaPtr GetReplacedECNumberFSA (ValidStructPtr vsp);
124 static void ValidateCitSub (ValidStructPtr vsp, CitSubPtr csp);
125 
HasFeatId(SeqFeatPtr sfp,Int4 num)126 static Boolean HasFeatId(SeqFeatPtr sfp, Int4 num)
127 {
128   Boolean rval = FALSE;
129   ObjectIdPtr oip;
130 
131   if (sfp == NULL) {
132     return FALSE;
133   }
134   if (sfp->id.choice == 3) {
135     oip = (ObjectIdPtr) sfp->id.value.ptrvalue;
136     if (oip->id == num) {
137       rval = TRUE;
138     }
139   }
140   return rval;
141 }
142 
143 
144 /* alignment validator */
/* Prototype only; the definition is not in the portion of the file shown here. */
145 NLM_EXTERN Boolean ValidateSeqAlignWithinValidator (ValidStructPtr vsp, SeqEntryPtr sep, Boolean find_remote_bsp, Boolean do_hist_assembly);
146 
/*
 * File-scope list head, NULL until something assigns it.
 * NOTE(review): presumably a cache of genetic code names - confirm where it
 * is filled and freed.
 */
147 static ValNodePtr genetic_code_name_list = NULL;
148 
149 /*****************************************************************************
150 *
151 *   Perform Validation Checks on a SeqEntry
152 *
153 *****************************************************************************/
154 
ValidStructClear(ValidStructPtr vsp)155 NLM_EXTERN void ValidStructClear (ValidStructPtr vsp)
156 {                               /* 0 out a ValidStruct */
157   CharPtr         errbuf;
158   Int2            cutoff;
159   Boolean         patch_seq;
160   SpellCheckFunc  spellfunc;
161   SpellCallBackFunc spellcallback;
162   Boolean         onlyspell;
163   Boolean         justwarnonspell;
164   Boolean         useSeqMgrIndexes;
165   Boolean         suppressContext;
166   Boolean         validateAlignments;
167   Boolean         farIDsInAlignments;
168   Boolean         alignFindRemoteBsp;
169   Boolean         doSeqHistAssembly;
170   Boolean         alwaysRequireIsoJTA;
171   Boolean         farFetchCDSproducts;
172   Boolean         farFetchMRNAproducts;
173   Boolean         locusTagGeneralMatch;
174   Boolean         validateIDSet;
175   Boolean         seqSubmitParent;
176   Boolean         justShowAccession;
177   Boolean         ignoreExceptions;
178   Boolean         validateExons;
179   Boolean         inferenceAccnCheck;
180   Boolean         testLatLonSubregion;
181   Boolean         strictLatLonCountry;
182   Boolean         rubiscoTest;
183   Boolean         indexerVersion;
184   Boolean         disableSuppression;
185   Boolean         genomeSubmission;
186   Boolean         debugTestDuJour;
187   Int2            validationLimit;
188   ValidErrorFunc  errfunc;
189   Pointer         userdata;
190   Boolean         convertGiToAccn;
191   TextFsaPtr      sourceQualTags;
192   TextFsaPtr      modifiedBases;
193   TextFsaPtr      sgmlStrings;
194   Boolean         is_htg_in_sep;
195   Boolean         is_barcode_sep;
196   Boolean         is_refseq_in_sep;
197   Boolean         is_wp_in_sep;
198   Boolean         is_gpipe_in_sep;
199   Boolean         is_gps_in_sep;
200   Boolean         is_small_genome_set;
201   Boolean         is_embl_ddbj_in_sep;
202   Boolean         is_embl_tpe_in_sep;
203   Boolean         is_old_gb_in_sep;
204   Boolean         is_patent_in_sep;
205   Boolean         other_sets_in_sep;
206   Boolean         is_insd_in_sep;
207   Boolean         is_pdb_in_sep;
208   Boolean         only_lcl_gnl_in_sep;
209   Boolean         has_gi_or_accn_ver;
210   Boolean         has_gnl_prot_sep;
211   Boolean         bsp_genomic_in_sep;
212   Boolean         is_smupd_in_sep;
213   Boolean         feat_loc_has_gi;
214   Boolean         feat_prod_has_gi;
215   Boolean         has_multi_int_genes;
216   Boolean         has_seg_bioseqs;
217   Boolean         far_fetch_failure;
218   Boolean         use_heartbeat;
219   Boolean         is_geneious;
220 
221   if (vsp == NULL)
222     return;
223 
224   errbuf = vsp->errbuf;
225   cutoff = vsp->cutoff;
226   patch_seq = vsp->patch_seq;
227   spellfunc = vsp->spellfunc;
228   spellcallback = vsp->spellcallback;
229   onlyspell = vsp->onlyspell;
230   justwarnonspell = vsp->justwarnonspell;
231   useSeqMgrIndexes = vsp->useSeqMgrIndexes;
232   suppressContext = vsp->suppressContext;
233   validateAlignments = vsp->validateAlignments;
234   farIDsInAlignments = vsp->farIDsInAlignments;
235   alignFindRemoteBsp = vsp->alignFindRemoteBsp;
236   doSeqHistAssembly = vsp->doSeqHistAssembly;
237   alwaysRequireIsoJTA = vsp->alwaysRequireIsoJTA;
238   farFetchCDSproducts = vsp->farFetchCDSproducts;
239   farFetchMRNAproducts = vsp->farFetchMRNAproducts;
240   locusTagGeneralMatch = vsp->locusTagGeneralMatch;
241   validateIDSet = vsp->validateIDSet;
242   seqSubmitParent = vsp->seqSubmitParent;
243   justShowAccession = vsp->justShowAccession;
244   ignoreExceptions = vsp->ignoreExceptions;
245   validateExons = vsp->validateExons;
246   inferenceAccnCheck = vsp->inferenceAccnCheck;
247   testLatLonSubregion = vsp->testLatLonSubregion;
248   strictLatLonCountry = vsp->strictLatLonCountry;
249   rubiscoTest = vsp->rubiscoTest;
250   indexerVersion = vsp->indexerVersion;
251   disableSuppression = vsp->disableSuppression;
252   genomeSubmission = vsp->genomeSubmission;
253   debugTestDuJour = vsp->debugTestDuJour;
254   validationLimit = vsp->validationLimit;
255   errfunc = vsp->errfunc;
256   userdata = vsp->userdata;
257   convertGiToAccn = vsp->convertGiToAccn;
258   sourceQualTags = vsp->sourceQualTags;
259   modifiedBases = vsp->modifiedBases;
260   sgmlStrings = vsp->sgmlStrings;
261   is_htg_in_sep = vsp->is_htg_in_sep;
262   is_barcode_sep = vsp->is_barcode_sep;
263   is_refseq_in_sep = vsp->is_refseq_in_sep;
264   is_wp_in_sep = vsp->is_wp_in_sep;
265   is_gpipe_in_sep = vsp->is_gpipe_in_sep;
266   is_gps_in_sep = vsp->is_gps_in_sep;
267   is_small_genome_set = vsp->is_small_genome_set;
268   other_sets_in_sep = vsp->other_sets_in_sep;
269   is_embl_ddbj_in_sep = vsp->is_embl_ddbj_in_sep;
270   is_embl_tpe_in_sep = vsp->is_embl_tpe_in_sep;
271   is_old_gb_in_sep = vsp->is_old_gb_in_sep;
272   is_patent_in_sep = vsp->is_patent_in_sep;
273   is_insd_in_sep = vsp->is_insd_in_sep;
274   is_pdb_in_sep = vsp->is_pdb_in_sep;
275   only_lcl_gnl_in_sep = vsp->only_lcl_gnl_in_sep;
276   has_gi_or_accn_ver = vsp->has_gi_or_accn_ver;
277   has_gnl_prot_sep = vsp->has_gnl_prot_sep;
278   bsp_genomic_in_sep = vsp->bsp_genomic_in_sep;
279   is_smupd_in_sep = vsp->is_smupd_in_sep;
280   feat_loc_has_gi = vsp->feat_loc_has_gi;
281   feat_prod_has_gi = vsp->feat_prod_has_gi;
282   has_multi_int_genes = vsp->has_multi_int_genes;
283   has_seg_bioseqs = vsp->has_seg_bioseqs;
284   far_fetch_failure = vsp->far_fetch_failure;
285   use_heartbeat = vsp->use_heartbeat;
286   is_geneious = vsp->is_geneious;
287   MemSet ((VoidPtr) vsp, 0, sizeof (ValidStruct));
288   vsp->errbuf = errbuf;
289   vsp->cutoff = cutoff;
290   vsp->patch_seq = patch_seq;
291   vsp->spellfunc = spellfunc;
292   vsp->spellcallback = spellcallback;
293   vsp->onlyspell = onlyspell;
294   vsp->justwarnonspell = justwarnonspell;
295   vsp->useSeqMgrIndexes = useSeqMgrIndexes;
296   vsp->suppressContext = suppressContext;
297   vsp->validateAlignments = validateAlignments;
298   vsp->farIDsInAlignments = farIDsInAlignments;
299   vsp->alignFindRemoteBsp = alignFindRemoteBsp;
300   vsp->doSeqHistAssembly = doSeqHistAssembly;
301   vsp->alwaysRequireIsoJTA = alwaysRequireIsoJTA;
302   vsp->farFetchCDSproducts = farFetchCDSproducts;
303   vsp->farFetchMRNAproducts = farFetchMRNAproducts;
304   vsp->locusTagGeneralMatch = locusTagGeneralMatch;
305   vsp->validateIDSet = validateIDSet;
306   vsp->seqSubmitParent = seqSubmitParent;
307   vsp->justShowAccession = justShowAccession;
308   vsp->ignoreExceptions = ignoreExceptions;
309   vsp->validateExons = validateExons;
310   vsp->inferenceAccnCheck = inferenceAccnCheck;
311   vsp->testLatLonSubregion = testLatLonSubregion;
312   vsp->strictLatLonCountry = strictLatLonCountry;
313   vsp->rubiscoTest = rubiscoTest;
314   vsp->indexerVersion = indexerVersion;
315   vsp->disableSuppression = disableSuppression;
316   vsp->genomeSubmission = genomeSubmission;
317   vsp->debugTestDuJour = debugTestDuJour;
318   vsp->validationLimit = validationLimit;
319   vsp->errfunc = errfunc;
320   vsp->userdata = userdata;
321   vsp->convertGiToAccn = convertGiToAccn;
322   vsp->sourceQualTags = sourceQualTags;
323   vsp->modifiedBases = modifiedBases;
324   vsp->sgmlStrings = sgmlStrings;
325   vsp->is_htg_in_sep = is_htg_in_sep;
326   vsp->is_barcode_sep = is_barcode_sep;
327   vsp->is_refseq_in_sep = is_refseq_in_sep;
328   vsp->is_wp_in_sep = is_wp_in_sep;
329   vsp->is_gpipe_in_sep = is_gpipe_in_sep;
330   vsp->is_gps_in_sep = is_gps_in_sep;
331   vsp->is_small_genome_set = is_small_genome_set;
332   vsp->other_sets_in_sep = other_sets_in_sep;
333   vsp->is_embl_ddbj_in_sep = is_embl_ddbj_in_sep;
334   vsp->is_embl_tpe_in_sep = is_embl_tpe_in_sep;
335   vsp->is_old_gb_in_sep = is_old_gb_in_sep;
336   vsp->is_patent_in_sep = is_patent_in_sep;
337   vsp->is_insd_in_sep = is_insd_in_sep;
338   vsp->is_pdb_in_sep = is_pdb_in_sep;
339   vsp->only_lcl_gnl_in_sep = only_lcl_gnl_in_sep;
340   vsp->has_gi_or_accn_ver = has_gi_or_accn_ver;
341   vsp->has_gnl_prot_sep = has_gnl_prot_sep;
342   vsp->bsp_genomic_in_sep = bsp_genomic_in_sep;
343   vsp->is_smupd_in_sep = is_smupd_in_sep;
344   vsp->feat_loc_has_gi = feat_loc_has_gi;
345   vsp->feat_prod_has_gi = feat_prod_has_gi;
346   vsp->has_multi_int_genes = has_multi_int_genes;
347   vsp->has_seg_bioseqs = has_seg_bioseqs;
348   vsp->far_fetch_failure = far_fetch_failure;
349   vsp->use_heartbeat = use_heartbeat;
350   vsp->is_geneious = is_geneious;
351   return;
352 }
353 
ValidStructNew(void)354 NLM_EXTERN ValidStructPtr ValidStructNew (void)
355 {
356   ValidStructPtr  vsp;
357 
358   vsp = (ValidStructPtr) MemNew (sizeof (ValidStruct));
359   return vsp;
360 }
361 
ValidStructFree(ValidStructPtr vsp)362 NLM_EXTERN ValidStructPtr ValidStructFree (ValidStructPtr vsp)
363 {
364   if (vsp == NULL)
365     return vsp;
366 
367   MemFree (vsp->errbuf);
368   TextFsaFree (vsp->sourceQualTags);
369   TextFsaFree (vsp->modifiedBases);
370   TextFsaFree (vsp->sgmlStrings);
371   return (ValidStructPtr) MemFree (vsp);
372 }
373 
374 /*****************************************************************************
375 *
376 *   ValidErr()
377 *
378 *****************************************************************************/
379 
/*
 * ChangeSeqIdToBestID
 *
 * Rewrites the SeqId node sip in place. The Bioseq for sip is looked up
 * with BioseqFindCore, the id selected by SeqIdFindWorst from that Bioseq's
 * id list is duplicated, the old payload of sip is freed according to its
 * choice, and the duplicate's choice and payload are moved into sip.
 * SeqIdStripLocus is then applied to sip.
 *
 * Note that despite the function name, the id is chosen with
 * SeqIdFindWorst.
 *
 * sip is left untouched when it is NULL, when no Bioseq is found, or when
 * the duplicate cannot be made.
 */
static void ChangeSeqIdToBestID (SeqIdPtr sip)
{
  BioseqPtr       bsp;
  SeqIdPtr        id;
  Pointer         pnt;

  if (sip == NULL)
    return;
  bsp = BioseqFindCore (sip);
  if (bsp == NULL)
    return;
  id = SeqIdDup (SeqIdFindWorst (bsp->id));
  if (id == NULL)
    return;
  /* now remove SeqId contents to reuse SeqId valnode */
  pnt = sip->data.ptrvalue;
  switch (sip->choice) {
  case SEQID_LOCAL:            /* local */
    ObjectIdFree ((ObjectIdPtr) pnt);
    break;
  case SEQID_GIBBSQ:           /* gibbseq */
  case SEQID_GIBBMT:           /* gibbmt */
    break;
  case SEQID_GIIM:             /* giimid */
    GiimFree ((GiimPtr) pnt);
    break;
  case SEQID_GENBANK:          /* genbank */
  case SEQID_EMBL:             /* embl */
  case SEQID_PIR:              /* pir   */
  case SEQID_SWISSPROT:        /* swissprot */
  case SEQID_OTHER:            /* other */
  case SEQID_DDBJ:
  case SEQID_PRF:
  case SEQID_TPG:
  case SEQID_TPE:
  case SEQID_TPD:
  case SEQID_GPIPE:
    TextSeqIdFree ((TextSeqIdPtr) pnt);
    break;
  case SEQID_PATENT:           /* patent seq id */
    PatentSeqIdFree ((PatentSeqIdPtr) pnt);
    break;
  case SEQID_GENERAL:          /* general */
    DbtagFree ((DbtagPtr) pnt);
    break;
  case SEQID_GI:               /* gi */
    break;
  case SEQID_PDB:
    PDBSeqIdFree ((PDBSeqIdPtr) pnt);
    break;
  default:                     /* any other choice: nothing is freed */
    break;
  }
  sip->choice = id->choice;
  sip->data.ptrvalue = id->data.ptrvalue;
  /*
   * The payload now belongs to sip. Detach it from the duplicate and release
   * only the emptied node, which was previously leaked on every call.
   */
  id->data.ptrvalue = NULL;
  MemFree (id);
  SeqIdStripLocus (sip);
}
435 
/*
 * ChangeSeqLocToBestID
 *
 * Walks the SeqLoc chain starting at slp (following ->next) and passes
 * every SeqId found in it to ChangeSeqIdToBestID. Container locations
 * (packed-int, mix, equiv) are handled by recursing into their sub-chain.
 * SEQLOC_NULL, SEQLOC_FEAT and unrecognized choices are skipped.
 *
 * A NULL slp is accepted and results in no work.
 */
static void ChangeSeqLocToBestID (SeqLocPtr slp)
{
  PackSeqPntPtr   psp;
  SeqBondPtr      sbp;
  SeqIntPtr       sinp;
  SeqPntPtr       spp;

  while (slp != NULL) {
    switch (slp->choice) {
    case SEQLOC_NULL:
      break;
    case SEQLOC_EMPTY:
    case SEQLOC_WHOLE:
      ChangeSeqIdToBestID ((SeqIdPtr) slp->data.ptrvalue);
      break;
    case SEQLOC_INT:
      sinp = (SeqIntPtr) slp->data.ptrvalue;
      if (sinp != NULL) {
        ChangeSeqIdToBestID (sinp->id);
      }
      break;
    case SEQLOC_PNT:
      spp = (SeqPntPtr) slp->data.ptrvalue;
      if (spp != NULL) {
        ChangeSeqIdToBestID (spp->id);
      }
      break;
    case SEQLOC_PACKED_PNT:
      psp = (PackSeqPntPtr) slp->data.ptrvalue;
      if (psp != NULL) {
        ChangeSeqIdToBestID (psp->id);
      }
      break;
    case SEQLOC_PACKED_INT:
    case SEQLOC_MIX:
    case SEQLOC_EQUIV:
      /*
       * One call on the head is enough: this function follows ->next itself,
       * so it visits the whole sub-chain. Calling it again for each node
       * re-processed the tail of the chain a quadratic number of times.
       */
      ChangeSeqLocToBestID ((SeqLocPtr) slp->data.ptrvalue);
      break;
    case SEQLOC_BOND:
      sbp = (SeqBondPtr) slp->data.ptrvalue;
      if (sbp != NULL) {
        /* a bond has two endpoints; either may be absent */
        spp = (SeqPntPtr) sbp->a;
        if (spp != NULL) {
          ChangeSeqIdToBestID (spp->id);
        }
        spp = (SeqPntPtr) sbp->b;
        if (spp != NULL) {
          ChangeSeqIdToBestID (spp->id);
        }
      }
      break;
    case SEQLOC_FEAT:
      break;
    default:
      break;
    }
    slp = slp->next;
  }
}
507 
508 //LCOV_EXCL_START
509 //function associated with unused options
/*
 * WorstBioseqLabel
 *
 * Writes a label for bsp into buffer, using at most buflen characters as
 * counted by LabelCopy / LabelCopyExtra.
 *
 *   content == OM_LABEL_CONTENT : only the id chosen by SeqIdFindWorst
 *                                 (locus stripped) is written
 *   content == OM_LABEL_TYPE    : only "<repr>, <mol> len= <length>"
 *   any other value             : the id, then ": ", then the type text
 *
 * Returns 0 when bsp is NULL or buflen < 1. For OM_LABEL_CONTENT the
 * LabelCopy result is returned. Otherwise the return value is the original
 * buflen minus the space left after the copies.
 */
static Int2 WorstBioseqLabel (BioseqPtr bsp, CharPtr buffer, Int2 buflen, Uint1 content)
{
  Char            text[60];
  CharPtr         cursor;
  Int2            used;
  Int2            total;
  SeqIdPtr        worst;
  AsnModulePtr    modules;
  AsnTypePtr      reprType;
  AsnTypePtr      molType;

  if (bsp == NULL || buflen < 1) {
    return 0;
  }

  total = buflen;
  text[0] = '\0';

  if (content != OM_LABEL_TYPE) {
    /* work on a private copy of the id so the Bioseq's own id list is untouched */
    worst = SeqIdStripLocus (SeqIdDup (SeqIdFindWorst (bsp->id)));
    SeqIdWrite (worst, text, PRINTID_FASTA_SHORT, 39);
    SeqIdFree (worst);
    if (content == OM_LABEL_CONTENT) {
      return LabelCopy (buffer, text, buflen);
    }

    used = LabelCopyExtra (buffer, text, buflen, NULL, ": ");
    buffer += used;
    buflen -= used;
  }

  modules = AsnAllModPtr ();
  reprType = AsnTypeFind (modules, "Seq-inst.repr");
  molType = AsnTypeFind (modules, "Seq-inst.mol");

  /* build "<repr>, <mol> len= <n>" in the scratch buffer */
  text[0] = '\0';
  cursor = StringMove (text, AsnEnumTypeStr (reprType, (Int2) (bsp->repr)));
  cursor = StringMove (cursor, ", ");
  cursor = StringMove (cursor, AsnEnumTypeStr (molType, (Int2) (bsp->mol)));
  sprintf (cursor, " len= %ld", (long) (bsp->length));

  used = LabelCopy (buffer, text, buflen);
  buflen -= used;

  /* OM_LABEL_SUMMARY has no extra output yet, so every mode ends the same way */
  return (total - buflen);
}
556 //LCOV_EXCL_STOP
557 
558 static CharPtr categoryLabel [] = {
559   NULL, "SEQ_INST", "SEQ_DESCR", "GENERIC", "SEQ_PKG", "SEQ_FEAT", "SEQ_ALIGN", "SEQ_GRAPH", "SEQ_ANNOT"
560 };
561 
GetValidCategoryName(int errcode)562 NLM_EXTERN CharPtr GetValidCategoryName (int errcode)
563 
564 {
565   if (errcode >= 1 && errcode < sizeof (categoryLabel)) return categoryLabel [errcode];
566   return NULL;
567 }
568 
/*
 * Error name strings; slot 0 is NULL so real names start at index 1.
 * The position of each string is significant: do not reorder or insert
 * entries in the middle.
 * NOTE(review): presumably indexed by the second error code for category 1
 * ("SEQ_INST" in categoryLabel) - confirm against the lookup code, which is
 * not visible here. Spellings such as "TpaAssmeblyProblem" are runtime text
 * and may be matched elsewhere; verify before changing them.
 */
569 static CharPtr err1Label [] = {
570   NULL,
571   "ExtNotAllowed",
572   "ExtBadOrMissing",
573   "SeqDataNotFound",
574   "SeqDataNotAllowed",
575   "ReprInvalid",
576   "CircularProtein",
577   "DSProtein",
578   "MolNotSet",
579   "MolOther",
580   "FuzzyLen",
581   "InvalidLen",
582   "InvalidAlphabet",
583   "SeqDataLenWrong",
584   "SeqPortFail",
585   "InvalidResidue",
586   "StopInProtein",
587   "PartialInconsistent",
588   "ShortSeq",
589   "NoIdOnBioseq",
590   "BadDeltaSeq",
591   "LongHtgsSequence",
592   "LongLiteralSequence",
593   "SequenceExceeds350kbp",
594   "ConflictingIdsOnBioseq",
595   "MolNuclAcid",
596   "ConflictingBiomolTech",
597   "SeqIdNameHasSpace",
598   "IdOnMultipleBioseqs",
599   "DuplicateSegmentReferences",
600   "TrailingX",
601   "BadSeqIdFormat",
602   "PartsOutOfOrder",
603   "BadSecondaryAccn",
604   "ZeroGiNumber",
605   "RnaDnaConflict",
606   "HistoryGiCollision",
607   "GiWithoutAccession",
608   "MultipleAccessions",
609   "HistAssemblyMissing",
610   "TerminalNs",
611   "UnexpectedIdentifierChange",
612   "InternalNsInSeqLit",
613   "SeqLitGapLength0",
614   "TpaAssmeblyProblem",
615   "SeqLocLength",
616   "MissingGaps",
617   "CompleteTitleProblem",
618   "CompleteCircleProblem",
619   "BadHTGSeq",
620   "GapInProtein",
621   "BadProteinStart",
622   "TerminalGap",
623   "OverlappingDeltaRange",
624   "LeadingX",
625   "InternalNsInSeqRaw",
626   "InternalNsAdjacentToGap",
627   "CaseDifferenceInSeqID",
628   "DeltaComponentIsGi0",
629   "FarFetchFailure",
630   "InternalGapsInSeqRaw",
631   "SelfReferentialSequence",
632   "WholeComponent",
633   "TSAHistAssemblyMissing",
634   "ProteinsHaveGeneralID",
635   "HighNContent",
636   "SeqLitDataLength0",
637   "DSmRNA",
638   "HighNContentStretch",
639   "HighNContentPercent",
640   "BadSegmentedSeq",
641   "SeqLitGapFuzzNot100",
642   "SeqGapProblem",
643   "WGSMasterLacksStrucComm",
644   "TSAMasterLacksStrucComm",
645   "AllNs"
646 };
647 
/*
 * Error name strings; slot 0 is NULL so real names start at index 1.
 * The position of each string is significant: do not reorder or insert
 * entries in the middle.
 * NOTE(review): presumably indexed by the second error code for category 2
 * ("SEQ_DESCR" in categoryLabel) - confirm against the lookup code, which is
 * not visible here.
 */
648 static CharPtr err2Label [] = {
649   NULL,
650   "BioSourceMissing",
651   "InvalidForType",
652   "FileOpenCollision",
653   "Unknown",
654   "NoPubFound",
655   "NoOrgFound",
656   "MultipleBioSources",
657   "NoMolInfoFound",
658   "BadCountryCode",
659   "NoTaxonID",
660   "InconsistentBioSources",
661   "MissingLineage",
662   "SerialInComment",
663   "BioSourceNeedsFocus",
664   "BadOrganelle",
665   "MultipleChromosomes",
666   "BadSubSource",
667   "BadOrgMod",
668   "InconsistentProteinTitle",
669   "Inconsistent",
670   "ObsoleteSourceLocation",
671   "ObsoleteSourceQual",
672   "StructuredSourceNote",
673   "UnnecessaryBioSourceFocus",
674   "RefGeneTrackingWithoutStatus",
675   "UnwantedCompleteFlag",
676   "CollidingPublications",
677   "TransgenicProblem",
678   "TaxonomyLookupProblem",
679   "MultipleTitles",
680   "RefGeneTrackingOnNonRefSeq",
681   "BioSourceInconsistency",
682   "FastaBracketTitle",
683   "MissingText",
684   "BadCollectionDate",
685   "BadPCRPrimerSequence",
686   "BadPunctuation",
687   "BadPCRPrimerName",
688   "BioSourceOnProtein",
689   "BioSourceDbTagConflict",
690   "DuplicatePCRPrimerSequence",
691   "MultipleNames",
692   "MultipleComments",
693   "LatLonProblem",
694   "LatLonFormat",
695   "LatLonRange",
696   "LatLonValue",
697   "LatLonCountry",
698   "LatLonState",
699   "BadSpecificHost",
700   "RefGeneTrackingIllegalStatus",
701   "ReplacedCountryCode",
702   "BadInstitutionCode",
703   "BadCollectionCode",
704   "BadVoucherID",
705   "UnstructuredVoucher",
706   "ChromosomeLocation",
707   "MultipleSourceQualifiers",
708   "UnbalancedParentheses",
709   "MultipleSourceVouchers",
710   "BadCountryCapitalization",
711   "WrongVoucherType",
712   "UserObjectProblem",
713   "TitleHasPMID",
714   "BadKeyword",
715   "NoOrganismInTitle",
716   "MissingChromosome",
717   "LatLonAdjacent",
718   "BadStrucCommInvalidFieldName",
719   "BadStrucCommInvalidFieldValue",
720   "BadStrucCommMissingField",
721   "BadStrucCommFieldOutOfOrder",
722   "BadStrucCommMultipleFields",
723   "BioSourceNeedsChromosome",
724   "MolInfoConflictsWithBioSource",
725   "MissingKeyword",
726   "FakeStructuredComment",
727   "StructuredCommentPrefixOrSuffixMissing",
728   "LatLonWater",
729   "LatLonOffshore",
730   "MissingPersonalCollectionName",
731   "LatLonPrecision",
732   "DBLinkProblem",
733   "FinishedStatusForWGS",
734   "BadTentativeName",
735   "OrganismNotFound",
736   "TaxonomyIsSpeciesProblem",
737   "TaxonomyConsultRequired",
738   "TaxonomyNucleomorphProblem",
739   "InconsistentMolTypeBiomol",
740   "BadInstitutionCountry",
741   "AmbiguousSpecificHost",
742   "BadAltitude",
743   "RefGeneTrackingOnNucProtSet",
744   "InconsistentDates",
745   "MultipleTaxonIDs",
746   "ScaffoldLacksBioProject",
747   "CompleteGenomeLacksBioProject",
748   "TaxonomyPlastidsProblem",
749   "OrganismIsUndefinedSpecies",
750   "WrongBiomolForTechnique",
751   "WrongOrganismFor16SrRNA",
752   "InconsistentWGSFlags"
753 };
754 
/*
 * Error name strings; slot 0 is NULL so real names start at index 1.
 * The position of each string is significant: do not reorder or insert
 * entries in the middle.
 * NOTE(review): presumably indexed by the second error code for category 3
 * ("GENERIC" in categoryLabel) - confirm against the lookup code, which is
 * not visible here.
 */
755 static CharPtr err3Label [] = {
756   NULL,
757   "NonAsciiAsn",
758   "Spell",
759   "AuthorListHasEtAl",
760   "MissingPubInfo",
761   "UnnecessaryPubEquiv",
762   "BadPageNumbering",
763   "MedlineEntryPub",
764   "BadDate",
765   "StructuredCitGenCit",
766   "CollidingSerialNumbers",
767   "EmbeddedScript",
768   "PublicationInconsistency",
769   "SgmlPresentInText",
770   "UnexpectedPubStatusComment",
771   "PastReleaseDate",
772   "MissingISOJTA",
773   "MissingVolume",
774   "MissingVolumeEpub",
775   "MissingPages",
776   "MissingPagesEpub"
777 };
778 
/*
 * Error name strings; slot 0 is NULL so real names start at index 1.
 * The position of each string is significant: do not reorder or insert
 * entries in the middle.
 * NOTE(review): presumably indexed by the second error code for category 4
 * ("SEQ_PKG" in categoryLabel) - confirm against the lookup code, which is
 * not visible here.
 */
779 static CharPtr err4Label [] = {
780   NULL,
781   "NoCdRegionPtr",
782   "NucProtProblem",
783   "SegSetProblem",
784   "EmptySet",
785   "NucProtNotSegSet",
786   "SegSetNotParts",
787   "SegSetMixedBioseqs",
788   "PartsSetMixedBioseqs",
789   "PartsSetHasSets",
790   "FeaturePackagingProblem",
791   "GenomicProductPackagingProblem",
792   "InconsistentMolInfoBiomols",
793   "ArchaicFeatureLocation",
794   "ArchaicFeatureProduct",
795   "GraphPackagingProblem",
796   "InternalGenBankSet",
797   "ConSetProblem",
798   "NoBioseqFound",
799   "INSDRefSeqPackaging",
800   "GPSnonGPSPackaging",
801   "RefSeqPopSet",
802   "BioseqSetClassNotSet",
803   "OrphanedProtein",
804   "MissingSetTitle",
805   "NucProtSetHasTitle",
806   "ComponentMissingTitle",
807   "SingleItemSet",
808   "MisplacedMolInfo",
809   "ImproperlyNestedSets",
810   "SeqSubmitWithWgsSet"
811 };
812 
/*
 * Error name strings; slot 0 is NULL so real names start at index 1.
 * The position of each string is significant: do not reorder or insert
 * entries in the middle.
 * NOTE(review): presumably indexed by the second error code for category 5
 * ("SEQ_FEAT" in categoryLabel) - confirm against the lookup code, which is
 * not visible here. Spellings such as "OldLocusTagMismtach" and
 * "HpotheticalProteinMismatch" are runtime text and may be matched
 * elsewhere; verify before changing them.
 */
813 static CharPtr err5Label [] = {
814   NULL,
815   "InvalidForType",
816   "PartialProblem",
817   "InvalidType",
818   "Range",
819   "MixedStrand",
820   "SeqLocOrder",
821   "CdTransFail",
822   "StartCodon",
823   "InternalStop",
824   "NoProtein",
825   "MisMatchAA",
826   "TransLen",
827   "NoStop",
828   "TranslExcept",
829   "NoProtRefFound",
830   "NotSpliceConsensus",
831   "OrfCdsHasProduct",
832   "GeneRefHasNoData",
833   "ExceptInconsistent",
834   "ProtRefHasNoData",
835   "GenCodeMismatch",
836   "RNAtype0",
837   "UnknownImpFeatKey",
838   "UnknownImpFeatQual",
839   "WrongQualOnImpFeat",
840   "MissingQualOnImpFeat",
841   "PseudoCdsHasProduct",
842   "IllegalDbXref",
843   "FarLocation",
844   "DuplicateFeat",
845   "UnnecessaryGeneXref",
846   "TranslExceptPhase",
847   "TrnaCodonWrong",
848   "BothStrands",
849   "CDSgeneRange",
850   "CDSmRNArange",
851   "OverlappingPeptideFeat",
852   "SerialInComment",
853   "MultipleCDSproducts",
854   "FocusOnBioSourceFeature",
855   "PeptideFeatOutOfFrame",
856   "InvalidQualifierValue",
857   "MultipleMRNAproducts",
858   "mRNAgeneRange",
859   "TranscriptLen",
860   "TranscriptMismatches",
861   "CDSproductPackagingProblem",
862   "DuplicateInterval",
863   "PolyAsiteNotPoint",
864   "ImpFeatBadLoc",
865   "LocOnSegmentedBioseq",
866   "UnnecessaryCitPubEquiv",
867   "ImpCDShasTranslation",
868   "ImpCDSnotPseudo",
869   "MissingMRNAproduct",
870   "AbuttingIntervals",
871   "CollidingGeneNames",
872   "MultiIntervalGene",
873   "FeatContentDup",
874   "BadProductSeqId",
875   "RnaProductMismatch",
876   "MissingCDSproduct",
877   "BadTrnaCodon",
878   "BadTrnaAA",
879   "OnlyGeneXrefs",
880   "UTRdoesNotAbutCDS",
881   "BadConflictFlag",
882   "ConflictFlagSet",
883   "LocusTagProblem",
884   "CollidingLocusTags",
885   "AltStartCodon",
886   "PartialsInconsistent",
887   "GenesInconsistent",
888   "DuplicateTranslExcept",
889   "TranslExceptAndRnaEditing",
890   "NoNameForProtein",
891   "TaxonDbxrefOnFeature",
892   "UnindexedFeature",
893   "CDSmRNAmismatch",
894   "UnnecessaryException",
895   "LocusTagProductMismatch",
896   "MrnaTransFail",
897   "PseudoCdsViaGeneHasProduct",
898   "MissingGeneXref",
899   "FeatureCitationProblem",
900   "NestedSeqLocMix",
901   "WrongQualOnFeature",
902   "MissingQualOnFeature",
903   "CodonQualifierUsed",
904   "UnknownFeatureQual",
905   "BadCharInAuthorName",
906   "PolyATail",
907   "ProteinNameEndsInBracket",
908   "CDSwithMultipleMRNAs",
909   "MultipleEquivBioSources",
910   "MultipleEquivPublications",
911   "BadFullLengthFeature",
912   "RedundantFields",
913   "CDSwithNoMRNAOverlap",
914   "FeatureProductInconsistency",
915   "ImproperBondLocation",
916   "GeneXrefWithoutGene",
917   "SeqFeatXrefProblem",
918   "ProductFetchFailure",
919   "SuspiciousGeneXref",
920   "MissingTrnaAA",
921   "CollidingFeatureIDs",
922   "ExceptionProblem",
923   "PolyAsignalNotRange",
924   "OldLocusTagMismtach",
925   "DuplicateGeneOntologyTerm",
926   "InvalidInferenceValue",
927   "HpotheticalProteinMismatch",
928   "FeatureRefersToAccession",
929   "SelfReferentialProduct",
930   "ITSdoesNotAbutRRNA",
931   "FeatureSeqIDCaseDifference",
932   "FeatureLocationIsGi0",
933   "GapFeatureProblem",
934   "PseudoCdsHasProtXref",
935   "ErroneousException",
936   "SegmentedGeneProblem",
937   "WholeLocation",
938   "BadEcNumberFormat",
939   "BadEcNumberValue",
940   "EcNumberProblem",
941   "VectorContamination",
942   "MinusStrandProtein",
943   "BadProteinName",
944   "GeneXrefWithoutLocus",
945   "UTRdoesNotExtendToEnd",
946   "CDShasTooManyXs",
947   "SuspiciousFrame",
948   "TerminalXDiscrepancy",
949   "UnnecessaryTranslExcept",
950   "SuspiciousQualifierValue",
951   "NotSpliceConsensusDonor",
952   "NotSpliceConsensusAcceptor",
953   "RareSpliceConsensusDonor",
954   "SeqFeatXrefNotReciprocal",
955   "SeqFeatXrefFeatureMissing",
956   "FeatureInsideGap",
957   "FeatureCrossesGap",
958   "BadAuthorSuffix",
959   "BadAnticodonAA",
960   "BadAnticodonCodon",
961   "BadAnticodonStrand",
962   "UndesiredGeneSynonym",
963   "UndesiredProteinName",
964   "FeatureBeginsOrEndsInGap",
965   "GeneOntologyTermMissingGOID",
966   "PseudoRnaHasProduct",
967   "PseudoRnaViaGeneHasProduct",
968   "BadRRNAcomponentOrder",
969   "BadRRNAcomponentOverlap",
970   "MissingGeneLocusTag",
971   "MultipleProtRefs",
972   "BadInternalCharacter",
973   "BadTrailingCharacter",
974   "BadTrailingHyphen",
975   "MultipleGeneOverlap",
976   "BadCharInAuthorLastName",
977   "PseudoCDSmRNArange",
978   "ExtendablePartialProblem",
979   "GeneXrefNeeded",
980   "RubiscoProblem",
981   "UnqualifiedException",
982   "ProteinNameHasPMID",
983   "BadGeneOntologyFormat",
984   "InconsistentGeneOntologyTermAndId",
985   "MultiplyAnnotatedGenes",
986   "ReplicatedGeneSequence",
987   "ShortIntron",
988   "GeneXrefStrandProblem",
989   "CDSmRNAXrefLocationProblem",
990   "LocusCollidesWithLocusTag",
991   "IdenticalGeneSymbolAndSynonym",
992   "NeedsNote",
993   "RptUnitRangeProblem",
994   "TooManyInferenceAccessions",
995   "IntervalBeginsOrEndsInGap",
996   "InconsistentRRNAstrands",
997   "CDSonMinusStrandMRNA",
998   "tRNAmRNAmixup",
999   "ProductLength",
1000   "InconsistentPseudogeneCounts",
1001   "DeletedEcNumber",
1002   "ReplacedEcNumber",
1003   "SplitEcNumber",
1004   "PeptideFeatureLacksCDS",
1005   "EcNumberDataMissing",
1006   "CDSnotBetweenUTRs",
1007   "ShortExon",
1008   "ExtraProteinFeature",
1009   "AssemblyGapAdjacentToNs",
1010   "AssemblyGapCoversSequence",
1011   "FeatureBeginsOrEndsWithN",
1012   "FeatureIsMostlyNs",
1013   "CDSonMinusStrandTranscribedRNA",
1014   "MultipleGenCodes",
1015   "InvalidFuzz",
1016   "BadComment",
1017   "NonsenseIntron",
1018   "InconsistentPseudogeneValue",
1019   "MultiIntervalIntron",
1020   "SeqLocTypeProblem",
1021   "ColdShockProteinProblem"
1022 };
1023 
/* Subcode names for error category 6, indexed directly by subcode:
   GetValidErrorName returns err6Label [subcode] when errcode is 6.
   Slot 0 is a NULL placeholder because valid subcodes start at 1. */
static CharPtr err6Label [] = {
  NULL,
  "SeqIdProblem",
  "StrandRev",
  "DensegLenStart",
  "StartLessthanZero",
  "StartMorethanBiolen",
  "EndLessthanZero",
  "EndMorethanBiolen",
  "LenLessthanZero",
  "LenMorethanBiolen",
  "SumLenStart",
  "AlignDimSeqIdNotMatch",
  "SegsDimSeqIdNotMatch",
  "FastaLike",
  "NullSegs",
  "SegmentGap",
  "SegsDimOne",
  "AlignDimOne",
  "Segtype",
  "BlastAligns",
  "PercentIdentity",
  "ShortAln",
  "UnexpectedAlignmentType"
};
1049 
/* Subcode names for error category 7, indexed directly by subcode:
   GetValidErrorName returns err7Label [subcode] when errcode is 7.
   Slot 0 is a NULL placeholder because valid subcodes start at 1. */
static CharPtr err7Label [] = {
  NULL,
  "GraphMin",
  "GraphMax",
  "GraphBelow",
  "GraphAbove",
  "GraphByteLen",
  "GraphOutOfOrder",
  "GraphBioseqLen",
  "GraphSeqLitLen",
  "GraphSeqLocLen",
  "GraphStartPhase",
  "GraphStopPhase",
  "GraphDiffNumber",
  "GraphACGTScore",
  "GraphNScore",
  "GraphGapScore",
  "GraphOverlap",
  "GraphBioseqId",
  "GraphACGTScoreMany",
  "GraphNScoreMany",
  "GraphLocInvalid"
};
1073 
/* Subcode names for error category 8, indexed directly by subcode:
   GetValidErrorName returns err8Label [subcode] when errcode is 8.
   Slot 0 is a NULL placeholder because valid subcodes start at 1. */
static CharPtr err8Label [] = {
  NULL,
  "AnnotIDs",
  "AnnotLOCs"
};
1079 
GetValidErrorName(int errcode,int subcode)1080 NLM_EXTERN CharPtr GetValidErrorName (int errcode, int subcode)
1081 
1082 {
1083   if (errcode < 1 || errcode >= sizeof (categoryLabel)) return NULL;
1084   switch (errcode) {
1085     case 1 :
1086       if (subcode >= 1 && subcode < sizeof (err1Label)) return err1Label [subcode];
1087       break;
1088     case 2 :
1089       if (subcode >= 1 && subcode < sizeof (err2Label)) return err2Label [subcode];
1090       break;
1091     case 3 :
1092       if (subcode >= 1 && subcode < sizeof (err3Label)) return err3Label [subcode];
1093       break;
1094     case 4 :
1095       if (subcode >= 1 && subcode < sizeof (err4Label)) return err4Label [subcode];
1096       break;
1097     case 5 :
1098       if (subcode >= 1 && subcode < sizeof (err5Label)) return err5Label [subcode];
1099       break;
1100     case 6 :
1101       if (subcode >= 1 && subcode < sizeof (err6Label)) return err6Label [subcode];
1102       break;
1103     case 7 :
1104       if (subcode >= 1 && subcode < sizeof (err7Label)) return err7Label [subcode];
1105       break;
1106     case 8 :
1107       if (subcode >= 1 && subcode < sizeof (err8Label)) return err8Label [subcode];
1108       break;
1109     default :
1110       break;
1111   }
1112   return NULL;
1113 }
1114 
1115 //LCOV_EXCL_START
GetValidExplanation(int errcode,int subcode)1116 NLM_EXTERN CharPtr GetValidExplanation (int errcode, int subcode)
1117 
1118 {
1119   return Nlm_GetErrLongText (THIS_MODULE, errcode, subcode);
1120 }
1121 //LCOV_EXCL_STOP
1122 
/*
 * CustValErr
 *
 *   Reports one validator error through the custom callback vsp->errfunc
 *   instead of the standard error posting path.  The function breaks the
 *   error into separate text fields (accession, seqid, featureID, message,
 *   objtype, label, context, location, product) and passes them, together
 *   with the severity, codes and gather item identifiers, to the callback.
 *
 *   vsp       - validator state; nothing happens if it or its errfunc is NULL
 *   severity  - values outside SEV_NONE..SEV_MAX are replaced by SEV_MAX
 *   errcode   - error category, passed through to the callback
 *   subcode   - error subcode, passed through to the callback
 *
 *   The message is expected to be in vsp->errbuf already.  All other
 *   variable-length fields are written into the same buffer, after the
 *   message terminator, as a series of NUL-separated strings; the field
 *   pointers handed to the callback point into vsp->errbuf (or at the local
 *   id / id2 arrays), so they are only valid for the duration of the call.
 *
 *   NOTE(review): the bookkeeping assumes vsp->errbuf is non-NULL and has
 *   room for roughly 4000 characters plus separators - confirm against the
 *   ValidStruct declaration.
 */
static void CustValErr (ValidStructPtr vsp, ErrSev severity, int errcode, int subcode)

{
  CharPtr           accession = NULL, context = NULL, label = NULL, location = NULL,
                    message = NULL, objtype = NULL, product = NULL, featureID = NULL,
                    seqid = NULL;
  BioseqPtr         bsp;
  BioseqSetPtr      bssp;
  Int2              buflen, diff, wrklen;
  CharPtr           ctmp, tmp;
  DbtagPtr          dbt;
  Uint2             entityID = 0, itemtype = 0;
  ValidErrorFunc    errfunc;
  GatherContextPtr  gcp;
  Char              id [64], id2 [64], numbuf [15];
  Uint4             itemID = 0;
  ObjectIdPtr       oip;
  ObjValNodePtr     ovp;
  SeqDescrPtr       sdp;
  SeqEntryPtr       sep;
  SeqFeatPtr        sfp = NULL;
  SeqIdPtr          sip;
  SeqLocPtr         slp;

  if (vsp == NULL) return;
  errfunc = vsp->errfunc;
  if (errfunc == NULL) return;

  /* item identifiers come from the current gather context, if there is one */
  gcp = vsp->gcp;
  if (gcp != NULL) {
    entityID = gcp->entityID;
    itemtype = gcp->thistype;
    itemID = gcp->itemID;
  }

  /* out-of-range severities are reported at the maximum level */
  if (severity < SEV_NONE || severity > SEV_MAX) {
    severity = SEV_MAX;
  }

  /* find a Seq-id to report: the object being validated is taken from the
     first of sfp, descr, bsp, bssp that is set, and SeqIdFindWorst picks the
     id from the associated Bioseq */
  sip = NULL;
  if (vsp->sfp != NULL) {
    /* feature: Bioseq found from the feature location */
    sfp = vsp->sfp;
    bsp = BioseqFindFromSeqLoc (sfp->location);
    if (bsp != NULL) {
      sip = SeqIdFindWorst (bsp->id);
    }
  } else if (vsp->descr != NULL) {
    /* descriptor: only extended descriptors can be cast to ObjValNode to
       reach their parent Bioseq or Bioseq-set */
    sdp = vsp->descr;
    if (sdp != NULL && sdp->extended != 0) {
      ovp = (ObjValNodePtr) sdp;
      if (ovp->idx.parenttype == OBJ_BIOSEQ) {
        bsp = (BioseqPtr) ovp->idx.parentptr;
        if (bsp != NULL) {
          sip = SeqIdFindWorst (bsp->id);
        }
      } else if (ovp->idx.parenttype == OBJ_BIOSEQSET) {
        /* descriptor on a set: use the first Bioseq inside the set */
        bssp = (BioseqSetPtr) ovp->idx.parentptr;
        if (bssp != NULL) {
          sep = bssp->seqentry;
          if (sep != NULL) {
            sep = FindNthBioseq (sep, 1);
            if (sep != NULL) {
              bsp = (BioseqPtr) sep->data.ptrvalue;
              if (bsp != NULL) {
                sip = SeqIdFindWorst (bsp->id);
              }
            }
          }
        }
      }
    }
  } else if (vsp->bsp != NULL) {
    bsp = vsp->bsp;
    sip = SeqIdFindWorst (bsp->id);
  } else if (vsp->bssp != NULL) {
    /* Bioseq-set: use the first Bioseq inside the set */
    bssp = vsp->bssp;
    sep = bssp->seqentry;
    if (sep != NULL) {
      sep = FindNthBioseq (sep, 1);
      if (sep != NULL) {
        bsp = (BioseqPtr) sep->data.ptrvalue;
        if (bsp != NULL) {
          sip = SeqIdFindWorst (bsp->id);
        }
      }
    }
  }
  /* fallback: first Bioseq of the top-level entry being validated */
  if (sip == NULL) {
    sep = FindNthBioseq (vsp->sep, 1);
    if (sep != NULL) {
      bsp = (BioseqPtr) sep->data.ptrvalue;
      if (bsp != NULL) {
        sip = SeqIdFindWorst (bsp->id);
      }
    }
  }
  /* format the id twice: report style for accession, FASTA style for seqid
     (general ids use the FASTA_GENERAL variant) */
  if (sip != NULL) {
    SeqIdWrite (sip, id, PRINTID_REPORT, sizeof (id) - 1);
    accession = id;
    if (sip->choice == SEQID_GENERAL) {
      SeqIdWrite (sip, id2, PRINTID_FASTA_GENERAL, sizeof (id2) - 1);
      seqid = id2;
    } else {
      SeqIdWrite (sip, id2, PRINTID_FASTA_SHORT, sizeof (id2) - 1);
      seqid = id2;
    }
  }

  /* object type name, using the same precedence as the id lookup above */
  if (vsp->sfp != NULL) {
    objtype = "FEATURE";
  } else if (vsp->descr != NULL) {
    objtype = "DESCRIPTOR";
  } else if (vsp->bsp != NULL) {
    objtype = "BIOSEQ";
  } else if (vsp->bssp != NULL) {
    objtype = "BIOSEQ-SET";
  }

  message = vsp->errbuf;

  /* walk to the end of the message, charging its length against a budget of
     4000, then step past the terminator: everything after this point is
     scratch space for the remaining fields */
  tmp = vsp->errbuf;
  buflen = 4000;
  while (*tmp != '\0') {
    buflen--;
    tmp++;
  }
  tmp++;
  *tmp = '\0';

  /* wrklen is the limit used for the label; when more than 2000 remain,
     1000 are held back so later fields still have room */
  wrklen = buflen;
  if (wrklen > 2000) {
     wrklen -= 1000;
  }

  /* label (and, for features, featureID); each field is terminated and tmp
     is advanced one past the terminator to start the next field */
  if (vsp->sfp != NULL) {
    sfp = vsp->sfp;
    label = tmp;
    diff = FeatDefLabel (sfp, tmp, wrklen, OM_LABEL_BOTH);
    buflen -= diff;
    tmp += diff;
    *tmp = '\0';
    tmp++;
    *tmp = '\0';

    /* feature id: choice 3 holds an Object-id directly, choice 4 holds a
       Dbtag whose tag is the Object-id; other choices leave the field empty */
    featureID = tmp;
    dbt = NULL;
    oip = NULL;
    if (sfp->id.choice == 3) {
      oip = (ObjectIdPtr) sfp->id.value.ptrvalue;
    } else if (sfp->id.choice == 4) {
      dbt = (DbtagPtr) sfp->id.value.ptrvalue;
      if (dbt != NULL) {
        oip = dbt->tag;
      }
    }
    if (oip != NULL) {
      /* Dbtag ids are written as "db:" followed by the tag */
      if (dbt != NULL && dbt->db != NULL) {
        diff = LabelCopyExtra (tmp, dbt->db, buflen, NULL, ":");
        buflen -= diff;
        tmp += diff;
        *tmp = '\0';
      }
      /* string tag if present, otherwise the integer tag printed as text */
      if (oip->str != NULL) {
        diff = LabelCopyExtra (tmp, oip->str, buflen, NULL, NULL);
      } else {
        sprintf (numbuf, "%ld", (long) oip->id);
        diff = LabelCopyExtra (tmp, numbuf, buflen, NULL, NULL);
      }
      buflen -= diff;
      tmp += diff;
      *tmp = '\0';
    }

    tmp++;
    *tmp = '\0';
  } else if (vsp->descr != NULL) {
    label = tmp;
    diff = SeqDescLabel (vsp->descr, tmp, wrklen, OM_LABEL_BOTH);

    /* for comment descriptors with error 2/77, cut the label to 100
       characters and end it with "..."
       NOTE(review): 2 and 77 are magic numbers - confirm which named error
       they correspond to */
    if (diff > 100 && vsp->descr->choice == Seq_descr_comment && errcode == 2 && subcode == 77) {
      diff = 100;
      *(tmp + diff - 3) = '.';
      *(tmp + diff - 2) = '.';
      *(tmp + diff - 1) = '.';
    }
    buflen -= diff;
    tmp += diff;
    *tmp = '\0';
    tmp++;
    *tmp = '\0';
  } else if (vsp->bsp != NULL) {
    label = tmp;
    if (vsp->convertGiToAccn) {
      //LCOV_EXCL_START
      // option not used
      diff = WorstBioseqLabel (vsp->bsp, tmp, wrklen, OM_LABEL_CONTENT);
      //LCOV_EXCL_STOP
    } else {
      diff = BioseqLabel (vsp->bsp, tmp, wrklen, OM_LABEL_BOTH);
    }
    buflen -= diff;
    tmp += diff;
    *tmp = '\0';
    tmp++;
    *tmp = '\0';
  } else if (vsp->bssp != NULL) {
    label = tmp;
    diff = BioseqSetLabel (vsp->bssp, tmp, wrklen, OM_LABEL_BOTH);
    buflen -= diff;
    tmp += diff;
    *tmp = '\0';
    tmp++;
    *tmp = '\0';
  }

  /* location / context / product for features, context only for descriptors */
  if (vsp->sfp != NULL) {
    sfp = vsp->sfp;

    if (sfp->location != NULL) {
      ctmp = NULL;
      slp = NULL;
      /*
      if (vsp->suppressContext) {
        slp = AsnIoMemCopy (sfp->location, (AsnReadFunc) SeqLocAsnRead, (AsnWriteFunc) SeqLocAsnWrite);
        ChangeSeqLocToBestID (slp);
        ctmp = SeqLocPrint (slp);
        SeqLocFree (slp);
      } else {
        ctmp = SeqLocPrint (sfp->location);
      }
      */
      /* print a copy of the location after converting its ids, so the
         feature's own location is left untouched */
      slp = AsnIoMemCopy (sfp->location, (AsnReadFunc) SeqLocAsnRead, (AsnWriteFunc) SeqLocAsnWrite);
      ChangeSeqLocToBestID (slp);
      ctmp = SeqLocPrint (slp);
      SeqLocFree (slp);
      if (ctmp != NULL) {
        /* cap the printed location at 800 characters, ending in "..." */
        if (StringLen (ctmp) > 800) {
          StringCpy (ctmp + 797, "...");
        }
        location = tmp;
        diff = LabelCopyExtra (tmp, ctmp, buflen, "[", "]");
        buflen -= diff;
        tmp += diff;
        MemFree (ctmp);
        *tmp = '\0';
        tmp++;
        *tmp = '\0';

        /* context is "[<bioseq label>]" for the Bioseq the location is on;
           it is only set when that Bioseq can be found */
        sip = SeqLocId (sfp->location);
        if (sip != NULL) {
          bsp = BioseqFind (sip);
          if (bsp != NULL) {
            context = tmp;
            diff = LabelCopy (tmp, "[", buflen);
            buflen -= diff;
            tmp += diff;

            diff = BioseqLabel (bsp, tmp, buflen, OM_LABEL_BOTH);
            buflen -= diff;
            tmp += diff;

            diff = LabelCopy (tmp, "]", buflen);
            buflen -= diff;
            tmp += diff;
          }
        }
        *tmp = '\0';
        tmp++;
        *tmp = '\0';
      }
    }

    if (sfp->product != NULL) {
      ctmp = NULL;
      slp = NULL;
      /*
      if (vsp->suppressContext) {
        slp = AsnIoMemCopy (sfp->product, (AsnReadFunc) SeqLocAsnRead, (AsnWriteFunc) SeqLocAsnWrite);
        ChangeSeqLocToBestID (slp);
        ctmp = SeqLocPrint (slp);
        SeqLocFree (slp);
      } else {
        ctmp = SeqLocPrint (sfp->product);
      }
      */
      /* same treatment as the location: print a converted copy, capped at
         800 characters and wrapped in brackets */
      slp = AsnIoMemCopy (sfp->product, (AsnReadFunc) SeqLocAsnRead, (AsnWriteFunc) SeqLocAsnWrite);
      ChangeSeqLocToBestID (slp);
      ctmp = SeqLocPrint (slp);
      SeqLocFree (slp);
      if (ctmp != NULL) {
        if (StringLen (ctmp) > 800) {
          StringCpy (ctmp + 797, "...");
        }
        product = tmp;
        diff = LabelCopyExtra (tmp, ctmp, buflen, "[", "]");
        buflen -= diff;
        tmp += diff;
        *tmp = '\0';
        tmp++;
        *tmp = '\0';
        MemFree (ctmp);
      }
    }
  } else if (vsp->descr != NULL) {
    /* descriptor context names the Bioseq or Bioseq-set it is reported on */
    if (vsp->bsp != NULL) {
      context = tmp;
      diff = LabelCopy (tmp, "BIOSEQ: ", buflen);
      buflen -= diff;
      tmp += diff;
      if (vsp->suppressContext || vsp->convertGiToAccn) {
          //LCOV_EXCL_START
          // option not used
          diff = WorstBioseqLabel(vsp->bsp, tmp, buflen, OM_LABEL_CONTENT);
          //LCOV_EXCL_STOP
      } else {
        diff = BioseqLabel (vsp->bsp, tmp, buflen, OM_LABEL_BOTH);
      }
      buflen -= diff;
      tmp += diff;
      *tmp = '\0';
      tmp++;
      *tmp = '\0';
    } else if (vsp->bssp != NULL) {
      context = tmp;
      diff = LabelCopy (tmp, "BIOSEQ-SET: ", buflen);
      buflen -= diff;
      tmp += diff;

      if (vsp->suppressContext || vsp->convertGiToAccn) {
        diff = BioseqSetLabel (vsp->bssp, tmp, buflen, OM_LABEL_CONTENT);
      } else {
        diff = BioseqSetLabel (vsp->bssp, tmp, buflen, OM_LABEL_BOTH);
      }
      buflen -= diff;
      tmp += diff;
      *tmp = '\0';
      tmp++;
      *tmp = '\0';
    }
  }

  /* hand everything to the callback; fields that were not filled in are NULL */
  (*errfunc) (severity, errcode, subcode, entityID, itemtype, itemID, accession, seqid,
              featureID, message, objtype, label, context, location, product, vsp->userdata);
}
1467 
1468 
1469 /* framework for suppressing validator errors using a list-based strategy */
/* Predicate attached to a suppression rule: it receives the validator state
   and returns TRUE when the rule's extra condition holds (it is called from
   ShouldSuppressValidErr after the codes and search phrase have matched). */
typedef Boolean (*ValidErrSuppressFunc) PROTO ((ValidStructPtr));
1471 
IsGenomicPipeline(ValidStructPtr vsp)1472 static Boolean IsGenomicPipeline (ValidStructPtr vsp)
1473 {
1474   if (vsp == NULL) {
1475     return FALSE;
1476   } else if (vsp->bsp_genomic_in_sep && vsp->is_gpipe_in_sep) {
1477     return TRUE;
1478   } else {
1479     return FALSE;
1480   }
1481 }
1482 
1483 
IsUnclassifiedExcept(ValidStructPtr vsp)1484 static Boolean IsUnclassifiedExcept (ValidStructPtr vsp)
1485 {
1486   Boolean rval = FALSE;
1487   if (vsp == NULL || vsp->sfp == NULL) {
1488     return FALSE;
1489   }
1490   if (vsp->sfp->excpt && (! vsp->ignoreExceptions)) {
1491     if (vsp->sfp->data.choice == SEQFEAT_CDREGION) {
1492       if (StringStr (vsp->sfp->except_text, "unclassified translation discrepancy") != NULL) {
1493         //LCOV_EXCL_START
1494         // if text was present, error to be suppressed  would never have been calculated
1495         rval = TRUE;
1496         //LCOV_EXCL_STOP
1497       }
1498       //LCOV_EXCL_START
1499       //errors suppressed with this function are never for mRNA features
1500     } else if (vsp->sfp->idx.subtype == FEATDEF_mRNA) {
1501       if (StringStr (vsp->sfp->except_text, "unclassified transcription discrepancy") != NULL) {
1502         rval = TRUE;
1503       }
1504       //LCOV_EXCL_STOP
1505     }
1506   }
1507   return rval;
1508 }
1509 
1510 
IsNotUnclassifiedExcept(ValidStructPtr vsp)1511 static Boolean IsNotUnclassifiedExcept (ValidStructPtr vsp)
1512 {
1513   return !IsUnclassifiedExcept(vsp);
1514 }
1515 
1516 
IsUnclassifedExceptAndGenomicPipeline(ValidStructPtr vsp)1517 static Boolean IsUnclassifedExceptAndGenomicPipeline (ValidStructPtr vsp)
1518 {
1519   if (IsGenomicPipeline(vsp) && IsUnclassifiedExcept(vsp)) {
1520     return TRUE;
1521   } else {
1522     return FALSE;
1523   }
1524 }
1525 
1526 
NonconsensusExcept(ValidStructPtr vsp)1527 static Boolean NonconsensusExcept (ValidStructPtr vsp)
1528 {
1529   Boolean rval = FALSE;
1530   if (vsp == NULL || vsp->sfp == NULL) {
1531     return FALSE;
1532   }
1533   if (vsp->sfp->excpt && (! vsp->ignoreExceptions)) {
1534     if (StringISearch (vsp->sfp->except_text, "nonconsensus splice site") != NULL ||
1535         StringISearch (vsp->sfp->except_text, "heterogeneous population sequenced") != NULL ||
1536         StringISearch (vsp->sfp->except_text, "low-quality sequence region") != NULL ||
1537         StringISearch (vsp->sfp->except_text, "artificial location") != NULL) {
1538       rval = TRUE;
1539     }
1540   }
1541   return rval;
1542 }
1543 
1544 
/* One suppression rule, as evaluated by ShouldSuppressValidErr.  A rule
   matches when the error codes are equal and every non-NULL condition holds. */
typedef struct validerrsuppression {
  int code1;                  /* error category that must match */
  int code2;                  /* error subcode that must match */
  CharPtr search_phrase;      /* if not NULL, must occur in the message format (case-insensitive) */
  CharPtr exclude_phrase;     /* if not NULL, must NOT occur in the message format (case-insensitive) */
  ValidErrSuppressFunc func;  /* if not NULL, must return TRUE for the validator state */
} ValidErrSuppressionData, PNTR ValidErrSuppressionPtr;
1552 
/* Suppression rules scanned in order by ShouldSuppressValidErr; the first
   matching rule suppresses the error.  Each row supplies, in struct order,
   the error codes, a search phrase, an exclude phrase and a predicate.
   NOTE(review): each ERR_* macro appears to expand to the "code1, code2"
   pair, which is why rows list only four initializers for five fields -
   confirm against the macro definitions. */
static ValidErrSuppressionData valid_suppress[] = {
  {ERR_SEQ_FEAT_PartialProblem, "When SeqFeat.product is a partial Bioseq, SeqFeat.location should also be partial", NULL, IsGenomicPipeline },
  {ERR_SEQ_FEAT_PartialProblem, "End of location should probably be partial", NULL, IsGenomicPipeline},
  {ERR_SEQ_FEAT_PartialProblem, "This SeqFeat should not be partial", NULL, IsGenomicPipeline},
  {ERR_SEQ_FEAT_PartialProblem, "AND is not at consensus splice site", NULL, IsGenomicPipeline},
  {ERR_SEQ_FEAT_PartialProblem, "PartialLocation: Internal partial intervals do not include first/last residue of sequence", NULL, IsGenomicPipeline},
  {ERR_SEQ_FEAT_PartialProblem, "AND is not at consensus splice site", NULL, NonconsensusExcept},
  {ERR_SEQ_FEAT_PartialProblem, "(but is at consensus splice site)", NULL, IsGenomicPipeline},
  {ERR_SEQ_FEAT_PartialProblem, "PartialLocation: Start does not include first/last residue of sequence", NULL, IsGenomicPipeline},
  {ERR_SEQ_FEAT_PartialProblem, "PartialLocation: Stop does not include first/last residue of sequence", NULL, IsGenomicPipeline},
  {ERR_SEQ_FEAT_PartialsInconsistent, NULL, NULL, IsGenomicPipeline },
  {ERR_SEQ_FEAT_PolyATail, NULL, NULL, IsGenomicPipeline },
  {ERR_SEQ_FEAT_InternalStop, NULL, NULL, IsUnclassifedExceptAndGenomicPipeline},
  {ERR_SEQ_FEAT_StartCodon , NULL, NULL, IsUnclassifiedExcept}

};

/* number of rows in valid_suppress, computed from the array size */
const Int4 kNumSuppressionRules = sizeof (valid_suppress) / sizeof (ValidErrSuppressionData);
1571 
ShouldSuppressValidErr(ValidStructPtr vsp,int code1,int code2,const char * fmt)1572 static Boolean ShouldSuppressValidErr (ValidStructPtr vsp, int code1, int code2, const char *fmt)
1573 {
1574   Int4 i;
1575   Boolean rval = FALSE;
1576 
1577   if (vsp->disableSuppression) return FALSE;
1578 
1579   for (i = 0; i < kNumSuppressionRules && !rval; i++) {
1580     if (code1 == valid_suppress[i].code1 && code2 == valid_suppress[i].code2
1581         && (valid_suppress[i].search_phrase == NULL || StringISearch (fmt, valid_suppress[i].search_phrase) != NULL)
1582         && (valid_suppress[i].func == NULL || valid_suppress[i].func(vsp))
1583         && (valid_suppress[i].exclude_phrase == NULL || StringISearch (fmt, valid_suppress[i].exclude_phrase) == NULL)) {
1584         // note: all exclude phrases are NULL
1585       rval = TRUE;
1586     }
1587   }
1588 
1589   return rval;
1590 }
1591 
1592 
1593 /* framework for changing validator warnings using a list-based strategy */
/* Severity adjuster attached to a severity-change rule: it receives the
   current severity and the validator state and returns the severity to use
   (called from AdjustSeverity for every matching rule). */
typedef int (*ValidErrSevChangeFunc) PROTO ((int, ValidStructPtr));

/* One severity-change rule, as evaluated by AdjustSeverity.  A rule applies
   when the codes are equal, every non-NULL phrase condition holds, and func
   is not NULL. */
typedef struct validerrsevchange {
  int code1;                   /* error category that must match */
  int code2;                   /* error subcode that must match */
  CharPtr search_phrase;       /* if not NULL, must occur in the message format (case-insensitive) */
  CharPtr exclude_phrase;      /* if not NULL, must NOT occur in the message format (case-insensitive) */
  ValidErrSevChangeFunc func;  /* computes the new severity; rules with NULL func are ignored */
} ValidErrSevChangeData, PNTR ValidErrSevChangePtr;
1603 
1604 
/* Severity adjuster: yields SEV_INFO when IsGenomicPipeline holds for vsp,
   otherwise hands back the severity it was given. */
static int LowerToInfoForGenomic (int severity, ValidStructPtr vsp)
{
  return IsGenomicPipeline (vsp) ? SEV_INFO : severity;
}
1613 
1614 
/*
 * WarnForGPSOrRefSeq
 *
 *   Severity adjuster that caps the severity at SEV_WARNING when either
 *     - the top-level entry (vsp->sep) is a Bioseq-set of class
 *       gen-prod-set, or
 *     - some part of the current feature's location lies on a Bioseq that
 *       has a SEQID_OTHER id with a non-NULL accession.
 *
 *   severity  - incoming severity
 *   vsp       - validator state; a NULL vsp leaves the severity unchanged,
 *               and a NULL vsp->sfp simply skips the location scan
 *
 *   Returns the (possibly lowered) severity; it is never raised.
 */
static int WarnForGPSOrRefSeq (int severity, ValidStructPtr vsp)
{
  Boolean         gpsOrRefSeq = FALSE;
  SeqEntryPtr     sep;
  SeqFeatPtr      sfp;
  BioseqSetPtr    bssp;
  SeqLocPtr       head, slp = NULL, nxt;
  SeqIdPtr        sip, id;
  BioseqPtr       bsp;
  TextSeqIdPtr    tsip;

  if (vsp == NULL) {
    return severity;
  }

  sep = vsp->sep;
  if (sep != NULL && IS_Bioseq_set (sep)) {
    bssp = (BioseqSetPtr) sep->data.ptrvalue;
    if (bssp != NULL && bssp->_class == BioseqseqSet_class_gen_prod_set) {
      gpsOrRefSeq = TRUE;
    }
  }

  /* the location scan needs a feature; without one there is nothing to check */
  sfp = vsp->sfp;
  if (!gpsOrRefSeq && sfp != NULL) {
    head = sfp->location;
    slp = SeqLocFindPart (head, slp, EQUIV_IS_ONE);
    while (slp != NULL && !gpsOrRefSeq) {
      sip = SeqLocId (slp);
      if (sip == NULL)
        break;
      nxt = SeqLocFindPart (head, slp, EQUIV_IS_ONE);

      /* genomic product set or NT_ contig always relaxes to SEV_WARNING */
      bsp = BioseqFind (sip);
      if (bsp != NULL) {
        /* one qualifying id is enough, so stop at the first match */
        for (id = bsp->id; id != NULL && !gpsOrRefSeq; id = id->next) {
          if (id->choice == SEQID_OTHER) {
            tsip = (TextSeqIdPtr) id->data.ptrvalue;
            if (tsip != NULL && tsip->accession != NULL) {
              gpsOrRefSeq = TRUE;
            }
          }
        }
      }

      slp = nxt;
    }
  }
  if (gpsOrRefSeq) {
    if (severity > SEV_WARNING) {
      severity = SEV_WARNING;
    }
  }
  return severity;
}
1667 
1668 
/* Severity-change rules applied by AdjustSeverity.  Every matching rule is
   applied, in table order, each one receiving the severity produced by the
   previous match - so where two rows share codes and phrase, the later
   adjuster sees the earlier adjuster's result. */
static ValidErrSevChangeData valid_sevchange[] = {
  {ERR_SEQ_FEAT_NotSpliceConsensusDonor, "Splice donor consensus (GT) not found at start of intron, position", NULL, LowerToInfoForGenomic},
  {ERR_SEQ_FEAT_NotSpliceConsensusAcceptor, "Splice acceptor consensus (AG) not found at end of intron, position", NULL, LowerToInfoForGenomic},
  {ERR_SEQ_FEAT_NotSpliceConsensusDonor, "Splice donor consensus (GT) not found after exon", NULL, LowerToInfoForGenomic},
  {ERR_SEQ_FEAT_NotSpliceConsensusDonor, "Splice donor consensus (GT) not found after exon", NULL, WarnForGPSOrRefSeq},
  {ERR_SEQ_FEAT_NotSpliceConsensusAcceptor, "Splice acceptor consensus (AG) not found before exon", NULL, LowerToInfoForGenomic},
  {ERR_SEQ_FEAT_NotSpliceConsensusAcceptor, "Splice acceptor consensus (AG) not found before exon", NULL, WarnForGPSOrRefSeq},
};

/* number of rows in valid_sevchange, computed from the array size */
const Int4 kNumSevChangeRules = sizeof (valid_sevchange) / sizeof (ValidErrSevChangeData);
1679 
/* Runs the error through the valid_sevchange table and returns the
 * resulting severity.
 *
 *   severity  - starting severity
 *   vsp       - validator state handed to each rule's adjuster
 *   code1     - error category
 *   code2     - error subcode
 *   fmt       - message format string, searched case-insensitively for the
 *               rule phrases
 *
 * Unlike suppression, the scan does not stop at the first match: every
 * applicable rule adjusts the running value in table order.
 */
static int AdjustSeverity (int severity, ValidStructPtr vsp, int code1, int code2, const char *fmt)
{
  ValidErrSevChangePtr  rule;
  Int4                  idx;
  int                   current = severity;

  for (idx = 0; idx < kNumSevChangeRules; idx++) {
    rule = &(valid_sevchange [idx]);

    if (code1 != rule->code1 || code2 != rule->code2) {
      continue;
    }
    if (rule->search_phrase != NULL && StringISearch (fmt, rule->search_phrase) == NULL) {
      continue;
    }
    if (rule->exclude_phrase != NULL && StringISearch (fmt, rule->exclude_phrase) != NULL) {
      continue;
    }
    if (rule->func == NULL) {
      continue;
    }
    current = (rule->func) (current, vsp);
  }

  return current;
}
1696 
/* An (error category, error subcode) pair; element type of the
   valid_genome_raise lookup table. */
typedef struct validerrraise {
  int code1;   /* error category */
  int code2;   /* error subcode within the category */
} ValidErrRaiseData, PNTR ValidErrSevRaisePtr;
1701 
/* Set of error codes recognized by RaiseGenomeSeverity.
   RaiseGenomeSeverity finds entries with a binary search that orders them
   with CompareRaiseCode (code1 first, then code2), so the rows must stay in
   ascending (code1, code2) order or lookups will silently miss.
   NOTE(review): the ordering depends on the numeric values behind the ERR_*
   macros, which are not visible here - verify when adding or moving rows. */
static ValidErrRaiseData valid_genome_raise [] = {
  {ERR_SEQ_INST_ShortSeq},
  {ERR_SEQ_INST_ConflictingBiomolTech},
  {ERR_SEQ_INST_DuplicateSegmentReferences},
  {ERR_SEQ_INST_TrailingX},
  {ERR_SEQ_INST_BadSeqIdFormat},
  {ERR_SEQ_INST_TerminalNs},
  {ERR_SEQ_INST_UnexpectedIdentifierChange},
  {ERR_SEQ_INST_TpaAssmeblyProblem},
  {ERR_SEQ_INST_SeqLocLength},
  {ERR_SEQ_INST_CompleteTitleProblem},
  {ERR_SEQ_INST_BadHTGSeq},
  {ERR_SEQ_INST_OverlappingDeltaRange},
  {ERR_SEQ_INST_LeadingX},
  {ERR_SEQ_INST_InternalNsInSeqRaw},
  {ERR_SEQ_INST_FarFetchFailure},
  {ERR_SEQ_INST_InternalGapsInSeqRaw},
  {ERR_SEQ_INST_HighNContentStretch},
  {ERR_SEQ_INST_HighNContentPercent},
  {ERR_SEQ_INST_SeqLitGapFuzzNot100},
  {ERR_SEQ_DESCR_BioSourceMissing},
  {ERR_SEQ_DESCR_InvalidForType},
  {ERR_SEQ_DESCR_InconsistentBioSources},
  {ERR_SEQ_DESCR_BadOrganelle},
  {ERR_SEQ_DESCR_MultipleChromosomes},
  {ERR_SEQ_DESCR_BadOrgMod},
  {ERR_SEQ_DESCR_Inconsistent},
  {ERR_SEQ_DESCR_ObsoleteSourceLocation},
  {ERR_SEQ_DESCR_ObsoleteSourceQual},
  {ERR_SEQ_DESCR_UnwantedCompleteFlag},
  {ERR_SEQ_DESCR_CollidingPublications},
  {ERR_SEQ_DESCR_TransgenicProblem},
  {ERR_SEQ_DESCR_BioSourceInconsistency},
  {ERR_SEQ_DESCR_BadCollectionDate},
  {ERR_SEQ_DESCR_BadPCRPrimerSequence},
  {ERR_SEQ_DESCR_BioSourceOnProtein},
  {ERR_SEQ_DESCR_BioSourceDbTagConflict},
  {ERR_SEQ_DESCR_DuplicatePCRPrimerSequence},
  {ERR_SEQ_DESCR_MultipleNames},
  {ERR_SEQ_DESCR_LatLonProblem},
  {ERR_SEQ_DESCR_LatLonRange},
  {ERR_SEQ_DESCR_LatLonValue},
  {ERR_SEQ_DESCR_LatLonCountry},
  {ERR_SEQ_DESCR_BadInstitutionCode},
  {ERR_SEQ_DESCR_BadCollectionCode},
  {ERR_SEQ_DESCR_BadVoucherID},
  {ERR_SEQ_DESCR_MultipleSourceQualifiers},
  {ERR_SEQ_DESCR_MultipleSourceVouchers},
  {ERR_SEQ_DESCR_WrongVoucherType},
  {ERR_SEQ_DESCR_UserObjectProblem},
  {ERR_SEQ_DESCR_BadKeyword},
  {ERR_SEQ_DESCR_BioSourceNeedsChromosome},
  {ERR_SEQ_DESCR_MolInfoConflictsWithBioSource},
  {ERR_SEQ_DESCR_OrganismIsUndefinedSpecies},
  {ERR_SEQ_DESCR_WrongBiomolForTechnique},
  {ERR_GENERIC_MissingPubInfo},
  {ERR_GENERIC_UnnecessaryPubEquiv},
  {ERR_GENERIC_CollidingSerialNumbers},
  {ERR_GENERIC_SgmlPresentInText},
  {ERR_SEQ_PKG_EmptySet},
  {ERR_SEQ_PKG_FeaturePackagingProblem},
  {ERR_SEQ_PKG_GenomicProductPackagingProblem},
  {ERR_SEQ_PKG_InconsistentMolInfoBiomols},
  {ERR_SEQ_PKG_ArchaicFeatureLocation},
  {ERR_SEQ_PKG_ArchaicFeatureProduct},
  {ERR_SEQ_PKG_InternalGenBankSet},
  {ERR_SEQ_PKG_BioseqSetClassNotSet},
  {ERR_SEQ_PKG_MissingSetTitle},
  {ERR_SEQ_PKG_NucProtSetHasTitle},
  {ERR_SEQ_PKG_ComponentMissingTitle},
  {ERR_SEQ_PKG_SingleItemSet},
  {ERR_SEQ_PKG_MisplacedMolInfo},
  {ERR_SEQ_PKG_ImproperlyNestedSets},
  {ERR_SEQ_PKG_SeqSubmitWithWgsSet},
  {ERR_SEQ_FEAT_Range},
  {ERR_SEQ_FEAT_MixedStrand},
  {ERR_SEQ_FEAT_SeqLocOrder},
  {ERR_SEQ_FEAT_TransLen},
  {ERR_SEQ_FEAT_TranslExcept},
  {ERR_SEQ_FEAT_OrfCdsHasProduct},
  {ERR_SEQ_FEAT_GeneRefHasNoData},
  {ERR_SEQ_FEAT_ProtRefHasNoData},
  {ERR_SEQ_FEAT_RNAtype0},
  {ERR_SEQ_FEAT_UnknownImpFeatKey},
  {ERR_SEQ_FEAT_UnknownImpFeatQual},
  {ERR_SEQ_FEAT_WrongQualOnImpFeat},
  {ERR_SEQ_FEAT_MissingQualOnImpFeat},
  {ERR_SEQ_FEAT_IllegalDbXref},
  {ERR_SEQ_FEAT_FarLocation},
  {ERR_SEQ_FEAT_TranslExceptPhase},
  {ERR_SEQ_FEAT_PeptideFeatOutOfFrame},
  {ERR_SEQ_FEAT_InvalidQualifierValue},
  {ERR_SEQ_FEAT_CDSproductPackagingProblem},
  {ERR_SEQ_FEAT_DuplicateInterval},
  {ERR_SEQ_FEAT_AbuttingIntervals},
  {ERR_SEQ_FEAT_MissingCDSproduct},
  {ERR_SEQ_FEAT_OnlyGeneXrefs},
  {ERR_SEQ_FEAT_UTRdoesNotAbutCDS},
  {ERR_SEQ_FEAT_ConflictFlagSet},
  {ERR_SEQ_FEAT_LocusTagProblem},
  {ERR_SEQ_FEAT_GenesInconsistent},
  {ERR_SEQ_FEAT_TranslExceptAndRnaEditing},
  {ERR_SEQ_FEAT_NoNameForProtein},
  {ERR_SEQ_FEAT_MissingGeneXref},
  {ERR_SEQ_FEAT_FeatureCitationProblem},
  {ERR_SEQ_FEAT_WrongQualOnFeature},
  {ERR_SEQ_FEAT_UnknownFeatureQual},
  {ERR_SEQ_FEAT_BadCharInAuthorName},
  {ERR_SEQ_FEAT_CDSwithMultipleMRNAs},
  {ERR_SEQ_FEAT_MultipleEquivBioSources},
  {ERR_SEQ_FEAT_MultipleEquivPublications},
  {ERR_SEQ_FEAT_BadFullLengthFeature},
  {ERR_SEQ_FEAT_RedundantFields},
  {ERR_SEQ_FEAT_CDSwithNoMRNAOverlap},
  {ERR_SEQ_FEAT_ImproperBondLocation},
  {ERR_SEQ_FEAT_GeneXrefWithoutGene},
  {ERR_SEQ_FEAT_MissingTrnaAA},
  {ERR_SEQ_FEAT_OldLocusTagMismtach},
  {ERR_SEQ_FEAT_InvalidInferenceValue},
  {ERR_SEQ_FEAT_HpotheticalProteinMismatch},
  {ERR_SEQ_FEAT_WholeLocation},
  {ERR_SEQ_FEAT_BadEcNumberFormat},
  {ERR_SEQ_FEAT_EcNumberProblem},
  {ERR_SEQ_FEAT_VectorContamination},
  {ERR_SEQ_FEAT_MinusStrandProtein},
  {ERR_SEQ_FEAT_BadProteinName},
  {ERR_SEQ_FEAT_GeneXrefWithoutLocus},
  {ERR_SEQ_FEAT_CDShasTooManyXs},
  {ERR_SEQ_FEAT_TerminalXDiscrepancy},
  {ERR_SEQ_FEAT_UnnecessaryTranslExcept},
  {ERR_SEQ_FEAT_FeatureInsideGap},
  {ERR_SEQ_FEAT_BadAnticodonAA},
  {ERR_SEQ_FEAT_BadAnticodonCodon},
  {ERR_SEQ_FEAT_FeatureBeginsOrEndsInGap},
  {ERR_SEQ_FEAT_GeneOntologyTermMissingGOID},
  {ERR_SEQ_FEAT_PseudoRnaHasProduct},
  {ERR_SEQ_FEAT_PseudoRnaViaGeneHasProduct},
  {ERR_SEQ_FEAT_BadRRNAcomponentOrder},
  {ERR_SEQ_FEAT_BadRRNAcomponentOverlap},
  {ERR_SEQ_FEAT_MultipleProtRefs},
  {ERR_SEQ_FEAT_BadInternalCharacter},
  {ERR_SEQ_FEAT_BadTrailingCharacter},
  {ERR_SEQ_FEAT_BadTrailingHyphen},
  {ERR_SEQ_FEAT_BadCharInAuthorLastName},
  {ERR_SEQ_FEAT_GeneXrefNeeded},
  {ERR_SEQ_FEAT_ProteinNameHasPMID},
  {ERR_SEQ_FEAT_BadGeneOntologyFormat},
  {ERR_SEQ_FEAT_InconsistentGeneOntologyTermAndId},
  {ERR_SEQ_FEAT_ShortIntron},
  {ERR_SEQ_FEAT_GeneXrefStrandProblem},
  {ERR_SEQ_FEAT_CDSmRNAXrefLocationProblem},
  {ERR_SEQ_FEAT_LocusCollidesWithLocusTag},
  {ERR_SEQ_FEAT_NeedsNote},
  {ERR_SEQ_FEAT_RptUnitRangeProblem},
  {ERR_SEQ_FEAT_InconsistentRRNAstrands},
  {ERR_SEQ_FEAT_PeptideFeatureLacksCDS},
  {ERR_SEQ_GRAPH_GraphAbove},
  {ERR_SEQ_GRAPH_GraphOutOfOrder},
  {ERR_SEQ_GRAPH_GraphSeqLocLen},
  {ERR_SEQ_GRAPH_GraphBioseqId},
};
1863 
CompareRaiseCode(int a1,int a2,int b1,int b2)1864 static Int2 CompareRaiseCode (int a1, int a2, int b1, int b2)
1865 
1866 {
1867   if (a1 == b1) return (a2 - b2);
1868   return (a1 - b1);
1869 }
1870 
RaiseGenomeSeverity(int code1,int code2)1871 static Boolean RaiseGenomeSeverity (int code1, int code2)
1872 {
1873   Int2 L, R, mid;
1874 
1875   L = 0;
1876   R = sizeof (valid_genome_raise) / sizeof (ValidErrRaiseData) - 1;
1877   while (L < R) {
1878     mid = (L + R) / 2;
1879     if (CompareRaiseCode (valid_genome_raise [mid].code1, valid_genome_raise [mid].code2, code1, code2) < 0)
1880       L = mid + 1;
1881     else
1882       R = mid;
1883   }
1884   if (CompareRaiseCode (valid_genome_raise [R].code1, valid_genome_raise [R].code2, code1, code2) == 0) {
1885     return TRUE;
1886   }
1887 
1888   return FALSE;
1889 }
1890 
1891 /*
1892 static Boolean genome_raise_order_tested = FALSE;
1893 */
1894 
1895 #ifdef VAR_ARGS
ValidErr(vsp,severity,code1,code2,fmt,va_alist)1896 NLM_EXTERN void CDECL ValidErr (vsp, severity, code1, code2, fmt, va_alist)
1897      ValidStructPtr vsp;
1898      int severity;
1899      int code1;
1900      int code2;
1901      const char     *fmt;
1902      va_dcl
1903 #else
1904 NLM_EXTERN void CDECL ValidErr (ValidStructPtr vsp, int severity, int code1, int code2, const char *fmt, ...)
1905 #endif
1906 {
1907   va_list           args;
1908   BioseqPtr         bsp;
1909   BioseqSetPtr      bssp;
1910   Int2              buflen, diff;
1911   CharPtr           ctmp, tmp;
1912   GatherContextPtr  gcp;
1913   Char              id [64];
1914   SeqLocPtr         loc = NULL;
1915   ObjValNodePtr     ovp;
1916   SeqDescrPtr       sdp;
1917   SeqEntryPtr       sep;
1918   SeqFeatPtr        sfp;
1919   SeqIdPtr          sip;
1920 
1921   if (vsp == NULL || severity < vsp->cutoff || ShouldSuppressValidErr(vsp, code1, code2, fmt))
1922     return;
1923 
1924   severity = AdjustSeverity(severity, vsp, code1, code2, fmt);
1925 
1926   /*
1927   if (! genome_raise_order_tested) {
1928     Int2  i;
1929     Boolean bad_order = FALSE;
1930     for (i = 0; i < sizeof (valid_genome_raise) / sizeof (ValidErrRaiseData) - 1; i++) {
1931       if (CompareRaiseCode (valid_genome_raise [i].code1, valid_genome_raise [i].code2,
1932           valid_genome_raise [i + 1].code1, valid_genome_raise [i + 1].code2) < 0) continue;
1933         bad_order = TRUE;
1934     }
1935     genome_raise_order_tested = TRUE;
1936     if (bad_order) {
1937       Beep ();
1938     }
1939   }
1940   */
1941 
1942   if (vsp->genomeSubmission && severity < SEV_ERROR) {
1943     if (RaiseGenomeSeverity (code1, code2)) {
1944       severity = SEV_ERROR;
1945     }
1946   }
1947 
1948   if (vsp->errbuf == NULL) {
1949     vsp->errbuf = MemNew (8192);
1950     if (vsp->errbuf == NULL)
1951       AbnormalExit (1);
1952   }
1953   tmp = vsp->errbuf;
1954 
1955   vsp->errors[severity]++;
1956 
1957 #ifdef VAR_ARGS
1958   va_start (args);
1959 #else
1960   va_start (args, fmt);
1961 #endif
1962 
1963   gcp = vsp->gcp;
1964   buflen = 1023;
1965   vsprintf (tmp, fmt, args);
1966   if (tmp != NULL) {
1967     while (*tmp != '\0') {
1968       buflen--;
1969       tmp++;
1970     }
1971   }
1972 
1973   va_end (args);
1974 
1975   if (vsp->errfunc != NULL) {
1976     CustValErr (vsp, (ErrSev) (severity), code1, code2);
1977     vsp->errbuf[0] = '\0';
1978     return;
1979   }
1980 
1981   //LCOV_EXCL_START
1982   //commandline tool always uses CustValErr
1983 
1984   if (vsp->justShowAccession) {
1985     if (vsp->errbuf != NULL) {
1986       vsp->errbuf[0] = '\0';
1987     }
1988     tmp = vsp->errbuf;
1989     sip = NULL;
1990 
1991     if (vsp->sfp != NULL) {
1992       sfp = vsp->sfp;
1993       bsp = BioseqFindFromSeqLoc (sfp->location);
1994       if (bsp != NULL) {
1995         sip = SeqIdFindWorst (bsp->id);
1996       }
1997     } else if (vsp->descr != NULL) {
1998       sdp = vsp->descr;
1999       if (sdp != NULL && sdp->extended != 0) {
2000         ovp = (ObjValNodePtr) sdp;
2001         if (ovp->idx.parenttype == OBJ_BIOSEQ) {
2002           bsp = (BioseqPtr) ovp->idx.parentptr;
2003           if (bsp != NULL) {
2004             sip = SeqIdFindWorst (bsp->id);
2005           }
2006         } else if (ovp->idx.parenttype == OBJ_BIOSEQSET) {
2007           bssp = (BioseqSetPtr) ovp->idx.parentptr;
2008           if (bssp != NULL) {
2009             sep = bssp->seqentry;
2010             if (sep != NULL) {
2011               sep = FindNthBioseq (sep, 1);
2012               if (sep != NULL) {
2013                 bsp = (BioseqPtr) sep->data.ptrvalue;
2014                 if (bsp != NULL) {
2015                   sip = SeqIdFindWorst (bsp->id);
2016                 }
2017               }
2018             }
2019           }
2020         }
2021       }
2022     } else if (vsp->bsp != NULL) {
2023       bsp = vsp->bsp;
2024       sip = SeqIdFindWorst (bsp->id);
2025     } else if (vsp->bssp != NULL) {
2026       bssp = vsp->bssp;
2027       sep = bssp->seqentry;
2028       if (sep != NULL) {
2029         sep = FindNthBioseq (sep, 1);
2030         if (sep != NULL) {
2031           bsp = (BioseqPtr) sep->data.ptrvalue;
2032           if (bsp != NULL) {
2033             sip = SeqIdFindWorst (bsp->id);
2034           }
2035         }
2036       }
2037     }
2038 
2039     if (sip != NULL) {
2040       SeqIdWrite (sip, id, PRINTID_REPORT, sizeof (id) - 1);
2041       diff = LabelCopy (tmp, id, buflen);
2042       buflen -= diff;
2043       tmp += diff;
2044     }
2045 
2046     ErrPostItem ((ErrSev) (severity), code1, code2, "%s", vsp->errbuf);
2047     if (vsp->errbuf != NULL) {
2048       vsp->errbuf[0] = '\0';
2049     }
2050     return;
2051   }
2052 
2053   if (vsp->sfp != NULL) {
2054     diff = LabelCopy (tmp, " FEATURE: ", buflen);
2055     buflen -= diff;
2056     tmp += diff;
2057 
2058     diff = FeatDefLabel (vsp->sfp, tmp, buflen, OM_LABEL_BOTH);
2059     buflen -= diff;
2060     tmp += diff;
2061 
2062     if (vsp->suppressContext) {
2063       loc = AsnIoMemCopy (vsp->sfp->location, (AsnReadFunc) SeqLocAsnRead, (AsnWriteFunc) SeqLocAsnWrite);
2064       ChangeSeqLocToBestID (loc);
2065       ctmp = SeqLocPrint (loc);
2066       SeqLocFree (loc);
2067     } else {
2068       ctmp = SeqLocPrint (vsp->sfp->location);
2069     }
2070     if (ctmp != NULL && StringLen (ctmp) > 800) {
2071       StringCpy (ctmp + 797, "...");
2072     }
2073     if (ctmp != NULL) {
2074       diff = LabelCopyExtra (tmp, ctmp, buflen, " [", "]");
2075       buflen -= diff;
2076       tmp += diff;
2077       MemFree (ctmp);
2078     }
2079 
2080     if (!vsp->suppressContext) {
2081       sip = SeqLocId (vsp->sfp->location);
2082       if (sip != NULL) {
2083         bsp = BioseqFind (sip);
2084         if (bsp != NULL) {
2085           diff = LabelCopy (tmp, " [", buflen);
2086           buflen -= diff;
2087           tmp += diff;
2088 
2089           diff = BioseqLabel (bsp, tmp, buflen, OM_LABEL_BOTH);
2090           buflen -= diff;
2091           tmp += diff;
2092 
2093           diff = LabelCopy (tmp, "]", buflen);
2094           buflen -= diff;
2095           tmp += diff;
2096         }
2097       }
2098     }
2099     if (vsp->sfp->product != NULL) {
2100       if (vsp->suppressContext) {
2101         loc = AsnIoMemCopy (vsp->sfp->product, (AsnReadFunc) SeqLocAsnRead, (AsnWriteFunc) SeqLocAsnWrite);
2102         ChangeSeqLocToBestID (loc);
2103         ctmp = SeqLocPrint (loc);
2104         SeqLocFree (loc);
2105       } else {
2106         ctmp = SeqLocPrint (vsp->sfp->product);
2107       }
2108       if (ctmp != NULL && StringLen (ctmp) > 800) {
2109         StringCpy (ctmp + 797, "...");
2110       }
2111       if (ctmp != NULL) {
2112         diff = LabelCopyExtra (tmp, ctmp, buflen, " -> [", "]");
2113         buflen -= diff;
2114         tmp += diff;
2115         MemFree (ctmp);
2116       }
2117     }
2118   } else if (vsp->descr != NULL) {
2119     diff = LabelCopy (tmp, " DESCRIPTOR: ", buflen);
2120     buflen -= diff;
2121     tmp += diff;
2122 
2123     if (vsp->descr->choice == Seq_descr_comment) {
2124       diff = SeqDescLabel (vsp->descr, tmp, buflen, OM_LABEL_BOTH);
2125       if (diff > 100) {
2126         /* truncate long comment in message */
2127         tmp [94] = ' ';
2128         tmp [95] = '.';
2129         tmp [96] = '.';
2130         tmp [97] = '.';
2131         tmp [98] = '\0';
2132         diff = 98;
2133         buflen -= diff;
2134         tmp += diff;
2135       } else {
2136         buflen -= diff;
2137         tmp += diff;
2138       }
2139     } else {
2140       diff = SeqDescLabel (vsp->descr, tmp, buflen, OM_LABEL_BOTH);
2141       buflen -= diff;
2142       tmp += diff;
2143     }
2144   }
2145 
2146   /*
2147      if (vsp->suppressContext)
2148      {
2149      }
2150      else */
2151   if (vsp->sfp == NULL) {       /* sfp adds its own context */
2152     if (vsp->bsp != NULL) {
2153       diff = LabelCopy (tmp, " BIOSEQ: ", buflen);
2154       buflen -= diff;
2155       tmp += diff;
2156 
2157       if (vsp->bsp == NULL) {
2158         diff = LabelCopy (tmp, "??", buflen);
2159       } else if (vsp->suppressContext) {
2160           diff = WorstBioseqLabel(vsp->bsp, tmp, buflen, OM_LABEL_CONTENT);
2161       } else {
2162         diff = BioseqLabel (vsp->bsp, tmp, buflen, OM_LABEL_BOTH);
2163       }
2164       buflen -= diff;
2165       tmp += diff;
2166     } else if (vsp->bssp != NULL) {
2167       diff = LabelCopy (tmp, " BIOSEQ-SET: ", buflen);
2168       buflen -= diff;
2169       tmp += diff;
2170 
2171       if (vsp->suppressContext) {
2172         diff = BioseqSetLabel (vsp->bssp, tmp, buflen, OM_LABEL_CONTENT);
2173       } else {
2174         diff = BioseqSetLabel (vsp->bssp, tmp, buflen, OM_LABEL_BOTH);
2175       }
2176       buflen -= diff;
2177       tmp += diff;
2178     }
2179   }
2180 
2181   if (vsp->errbuf == NULL) return;
2182   ErrPostItem ((ErrSev) (severity), code1, code2, "%s", vsp->errbuf);
2183   vsp->errbuf[0] = '\0';
2184   //LCOV_EXCL_STOP
2185 }
2186 
2187 
GetUserFieldLabelString(UserFieldPtr ufp)2188 static CharPtr GetUserFieldLabelString (UserFieldPtr ufp)
2189 {
2190   Char buf[15];
2191 
2192   if (ufp == NULL || ufp->label == NULL) {
2193     return StringSave ("Unlabeled field");
2194   } else if (ufp->label->id > 0) {
2195     sprintf (buf, "%d", ufp->label->id);
2196     return StringSave (buf);
2197   } else {
2198     return StringSave (ufp->label->str);
2199   }
2200 }
2201 
2202 
GetUserFieldValueString(UserFieldPtr ufp)2203 static CharPtr GetUserFieldValueString (UserFieldPtr ufp)
2204 {
2205   Char buf[15];
2206 
2207   if (ufp == NULL) {
2208     return StringSave ("Value is missing");
2209   } else if (ufp->choice == 1) {
2210     return StringSave (ufp->data.ptrvalue);
2211   } else if (ufp->choice == 2) {
2212     sprintf (buf, "%d", ufp->data.intvalue);
2213     return StringSave (buf);
2214   } else {
2215     return StringSave ("Bad format for value");
2216   }
2217 }
2218 
2219 
ErrorLevelFromFieldRuleSev(Uint2 severity)2220 static ErrSev ErrorLevelFromFieldRuleSev (Uint2 severity)
2221 {
2222   ErrSev sev = SEV_ERROR;
2223   switch (severity) {
2224     case Severity_level_none:
2225       sev = SEV_NONE;
2226       break;
2227     case Severity_level_info:
2228       sev = SEV_INFO;
2229       break;
2230     case Severity_level_warning:
2231       sev = SEV_WARNING;
2232       break;
2233     case Severity_level_error:
2234       sev = SEV_ERROR;
2235       break;
2236     case Severity_level_reject:
2237       sev = SEV_REJECT;
2238       break;
2239     case Severity_level_fatal:
2240       sev = SEV_FATAL;
2241       break;
2242   }
2243   return sev;
2244 }
2245 
IsGenomeAssembly(UserObjectPtr uop)2246 static Boolean IsGenomeAssembly (UserObjectPtr uop)
2247 
2248 {
2249   UserFieldPtr  curr;
2250   CharPtr       field;
2251   ObjectIdPtr   oip;
2252 
2253   if (uop == NULL) return FALSE;
2254 
2255   for (curr = uop->data; curr != NULL; curr = curr->next) {
2256     if (curr->choice != 1) continue;
2257     oip = curr->label;
2258     if (oip == NULL) continue;
2259     field = oip->str;
2260     if (StringHasNoText (field)) continue;
2261     if (StringCmp (field, "StructuredCommentPrefix") != 0) continue;
2262     if (StringCmp ((CharPtr) curr->data.ptrvalue, "##Genome-Assembly-Data-START##") == 0) return TRUE;
2263   }
2264 
2265   return FALSE;
2266 }
2267 
/*
 * Error callback for structured-comment validation; ValidateUserObject
 * hands it to IsStructuredCommentValid together with the ValidStructPtr.
 *
 *   err_code    kind of problem found
 *   field_rule  rule that was violated, may be NULL; when present its
 *               severity is translated with ErrorLevelFromFieldRuleSev,
 *               otherwise SEV_ERROR is used
 *   ufp         offending field (used for the label / value text)
 *   depend_ufp  field the rule depends on, may be NULL; when present a
 *               " when <label> has value '<value>'" clause is appended to
 *               most messages
 *   data        the ValidStructPtr; nothing is reported when it is NULL
 *   uop         user object being validated (consulted by IsGenomeAssembly)
 *
 * Each problem is reported through ValidErr.
 */
static void StructuredCommentError (EFieldValid err_code, FieldRulePtr field_rule, UserFieldPtr ufp, UserFieldPtr depend_ufp, Pointer data, UserObjectPtr uop)
{
  ValidStructPtr     vsp;
  CharPtr            label, val;
  CharPtr            depend_label, depend_val, depend_str = NULL;
  CharPtr            depend_fmt = " when %s has value '%s'";
  CharPtr            depend_txt;
  ErrSev             sev = SEV_ERROR;

  if ((vsp = (ValidStructPtr) data) == NULL) {
    return;
  }

  if (field_rule != NULL) {
    sev = ErrorLevelFromFieldRuleSev(field_rule->severity);
  }

  if (depend_ufp != NULL) {
    depend_label = GetUserFieldLabelString  (depend_ufp);
    depend_val = GetUserFieldValueString (depend_ufp);
    /* format + both arguments bounds the formatted text; the extra byte
       reserves the terminator explicitly instead of relying on the two
       "%s" being replaced */
    depend_str = (CharPtr) MemNew (sizeof (Char) * (StringLen (depend_fmt) + StringLen (depend_label) + StringLen (depend_val) + 1));
    if (depend_str != NULL) {
      /* a NULL argument would be sized as empty above but is not a valid
         "%s" argument, so substitute an empty string */
      sprintf (depend_str, depend_fmt,
               depend_label == NULL ? "" : depend_label,
               depend_val == NULL ? "" : depend_val);
    }
    depend_val = MemFree (depend_val);
    depend_label = MemFree (depend_label);
  }
  /* dependency clause, empty when there is none or it could not be built */
  depend_txt = (depend_str == NULL) ? "" : depend_str;

  switch (err_code) {
    case eFieldValid_Invalid:
      label = GetUserFieldLabelString  (ufp);
      if (field_rule == NULL && StringCmp (label, "StructuredCommentPrefix") != 0 && StringCmp (label, "StructuredCommentSuffix") != 0) {
        ValidErr (vsp, sev, ERR_SEQ_DESCR_BadStrucCommInvalidFieldName, "%s is not a valid field name%s", label, depend_txt);
      } else {
        val = GetUserFieldValueString (ufp);
        /* these two fields are always errors in genome assembly comments */
        if (StringICmp (label, "Finishing Goal") == 0 && IsGenomeAssembly (uop)) {
          sev = SEV_ERROR;
        } else if (StringICmp (label, "Current Finishing Status") == 0 && IsGenomeAssembly (uop)) {
          sev = SEV_ERROR;
        }
        ValidErr (vsp, sev, ERR_SEQ_DESCR_BadStrucCommInvalidFieldValue, "%s is not a valid value for %s%s", val, label, depend_txt);
        val = MemFree (val);
      }
      label = MemFree (label);
      break;
    case eFieldValid_MissingRequiredField:
      ValidErr (vsp, sev, ERR_SEQ_DESCR_BadStrucCommMissingField, "Required field %s is missing%s", field_rule == NULL ? "" : field_rule->field_name, depend_txt);
      break;
    case eFieldValid_FieldOutOfOrder:
      ValidErr (vsp, sev, ERR_SEQ_DESCR_BadStrucCommFieldOutOfOrder, "%s field is out of order%s", field_rule == NULL ? "" : field_rule->field_name, depend_txt);
      break;
    case eFieldValid_DuplicateField:
      ValidErr (vsp, sev, ERR_SEQ_DESCR_BadStrucCommMultipleFields, "Multiple values for %s field%s", field_rule == NULL ? "" : field_rule->field_name, depend_txt);
      break;
    case eFieldValid_Disallowed:
        //LCOV_EXCL_START
        //no rules currently have disallowed fields
      label = GetUserFieldLabelString  (ufp);
      ValidErr (vsp, sev, ERR_SEQ_DESCR_BadStrucCommInvalidFieldName, "%s is not a valid field name%s", label, depend_txt);
      label = MemFree (label);
      break;
      //LCOV_EXCL_STOP
    case eFieldValid_Inappropriate:
        //LCOV_EXCL_START
        // this code is not used
      val = GetUserFieldValueString (ufp);
      ValidErr(vsp, sev, ERR_SEQ_DESCR_BadStrucCommInvalidFieldValue, "'%s' is inappropriate for a GenBank submission", val);
      val = MemFree (val);
      break;
      //LCOV_EXCL_STOP
    default:
      /* do nothing */
      break;
  }
  depend_str = MemFree (depend_str);
}
2341 
2342 
StringLooksLikeFakeStructuredComment(CharPtr str)2343 static Boolean StringLooksLikeFakeStructuredComment (CharPtr str)
2344 {
2345   if (StringHasNoText (str)) {
2346     return FALSE;
2347   }
2348   if (StringSearch (str, "::") != NULL) {
2349     return TRUE;
2350   }
2351   return FALSE;
2352 }
2353 
2354 
/*
 * Checks one user object and reports problems through ValidErr:
 *   - a missing type;
 *   - missing data, unless the type string is "NcbiAutofix" or
 *     "Unverified" (compared without regard to case);
 *   - for "StructuredComment" objects, the outcome of
 *     IsStructuredCommentValid (which reports details through
 *     StructuredCommentError) and any string field whose label or value
 *     contains "::".
 * Does nothing when vsp or uop is NULL.
 */
static void ValidateUserObject(ValidStructPtr vsp, UserObjectPtr uop)
{
  ObjectIdPtr        type_oip;
  ObjectIdPtr        label_oip;
  UserFieldPtr       ufp;
  CharPtr            prefix;
  CharPtr            value;
  Boolean            may_be_empty;

  if (uop == NULL || vsp == NULL) {
    return;
  }

  type_oip = uop->type;
  if (type_oip == NULL) {
      //LCOV_EXCL_START
      //can't test with valid ASN.1
    ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_UserObjectProblem, "User object with no type");
    //LCOV_EXCL_STOP
  }

  if (uop->data == NULL) {
    /* only these two object types are allowed to carry no data */
    may_be_empty = (Boolean) (type_oip != NULL && type_oip->str != NULL
                              && (StringICmp (type_oip->str, "NcbiAutofix") == 0
                                  || StringICmp (type_oip->str, "Unverified") == 0));
    if (! may_be_empty) {
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_UserObjectProblem, "User object with no data");
    }
  }

  /* everything below applies to structured comments only */
  if (uop->type == NULL || StringICmp (uop->type->str, "StructuredComment") != 0) {
    return;
  }

  if (IsStructuredCommentValid (uop, StructuredCommentError, vsp) != eFieldValid_Valid) {
    /* the summary message is posted only when the comment has a prefix */
    prefix = GetStructuredCommentPrefix(uop);
    if (!StringHasNoText(prefix)) {
      ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_BadStrucCommInvalidFieldValue, "Structured Comment invalid");
    }
  }

  for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
    label_oip = ufp->label;
    if (ufp->choice != 1 || label_oip == NULL) {
      continue;
    }
    if (StringStr (label_oip->str, "::") != NULL) {
      ValidErr (vsp, SEV_REJECT, ERR_SEQ_DESCR_BadStrucCommInvalidFieldName, "Structured comment field '%s' contains double colons", label_oip->str);
    }
    value = (CharPtr) ufp->data.ptrvalue;
    if (StringStr (value, "::") != NULL) {
      ValidErr (vsp, SEV_REJECT, ERR_SEQ_DESCR_BadStrucCommInvalidFieldValue, "Structured comment value '%s' contains double colons", value);
    }
  }
}
2406 
2407 
2408 /*****************************************************************************
2409 *
2410 *   Valid1GatherProc(gcp)
2411 *     top level gather callback
2412 *     dispatches to other levels
2413 *
2414 *****************************************************************************/
Valid1GatherProc(GatherContextPtr gcp)2415 static Boolean Valid1GatherProc (GatherContextPtr gcp)
2416 {
2417   ValidStructPtr     vsp;
2418   AnnotDescrPtr      desc;
2419   SeqAnnotPtr        sap;
2420   Boolean            is_blast_align;
2421   Int2               limit;
2422   SeqFeatPtr         sfp;
2423   ValNodePtr         sdp;
2424   SeqGraphPtr        sgp;
2425   BioSourcePtr       biop;
2426   ObjectIdPtr        oip;
2427   PubdescPtr         pdp;
2428   CharPtr            ptr;
2429   BioseqPtr          bsp;
2430   SeqIdPtr           sip;
2431   CharPtr            str;
2432   Char               buf [128];
2433   Char               tmp [128];
2434   ValNodePtr         vnp2;
2435   SeqMgrFeatContext  context;
2436 
2437   vsp = (ValidStructPtr) (gcp->userdata);
2438   vsp->gcp = gcp;               /* needed for ValidErr */
2439 
2440   limit = vsp->validationLimit;
2441 
2442   switch (gcp->thistype) {
2443   case OBJ_BIOSEQ:
2444     if (!vsp->onlyspell) {
2445       if (limit == VALIDATE_ALL || limit == VALIDATE_INST) {
2446         ValidateBioseqInst (gcp);
2447       }
2448       if (limit == VALIDATE_ALL || limit == VALIDATE_CONTEXT) {
2449         ValidateBioseqContext (gcp);
2450       }
2451       if (limit == VALIDATE_ALL || limit == VALIDATE_INST) {
2452         ValidateBioseqHist (gcp);
2453       }
2454       if (limit == VALIDATE_ALL || limit == VALIDATE_GRAPH) {
2455         ValidateGraphsOnBioseq (gcp);
2456       }
2457     }
2458     break;
2459   case OBJ_BIOSEQSET:
2460     if (!vsp->onlyspell) {
2461       if (limit == VALIDATE_ALL || limit == VALIDATE_SET) {
2462         ValidateBioseqSet (gcp);
2463       }
2464     }
2465     break;
2466   case OBJ_SEQANNOT:
2467     if (!vsp->onlyspell) {
2468       if (limit == VALIDATE_ALL) {
2469         sap = (SeqAnnotPtr) gcp->thisitem;
2470         if (sap != NULL) {
2471           if (sap->type == 2) {
2472             is_blast_align = FALSE;
2473             desc = NULL;
2474             while ((desc = ValNodeFindNext (sap->desc, desc, Annot_descr_user)) != NULL) {
2475               if (desc->data.ptrvalue != NULL) {
2476                 oip = ((UserObjectPtr) desc->data.ptrvalue)->type;
2477                 if (oip != NULL && StringCmp (oip->str, "Blast Type") == 0) {
2478                   is_blast_align = TRUE;
2479                 }
2480               }
2481             }
2482             if (is_blast_align) {
2483               ValidErr (vsp, SEV_ERROR, ERR_SEQ_ALIGN_BlastAligns, "Record contains BLAST alignments");
2484             }
2485           }
2486           if (sap->type == 4) {
2487             vsp->bssp = NULL;
2488             vsp->bsp = NULL;
2489             vsp->descr = NULL;
2490             vsp->sfp = NULL;
2491             ValidErr (vsp, SEV_ERROR, ERR_SEQ_ANNOT_AnnotIDs, "Record contains Seq-annot.data.ids");
2492           }
2493           if (sap->type == 5) {
2494             vsp->bssp = NULL;
2495             vsp->bsp = NULL;
2496             vsp->descr = NULL;
2497             vsp->sfp = NULL;
2498             ValidErr (vsp, SEV_ERROR, ERR_SEQ_ANNOT_AnnotLOCs, "Record contains Seq-annot.data.locs");
2499           }
2500         }
2501       }
2502     }
2503     break;
2504   case OBJ_SEQFEAT:
2505     if (!vsp->onlyspell) {
2506       if (limit == VALIDATE_ALL || limit == VALIDATE_FEAT) {
2507         ValidateSeqFeat (gcp);
2508         sfp = (SeqFeatPtr) (gcp->thisitem);
2509         if (sfp != NULL) {
2510           if (sfp->data.choice == SEQFEAT_BIOSRC) {
2511             biop = (BioSourcePtr) sfp->data.value.ptrvalue;
2512             ValidateBioSource (vsp, gcp, biop, sfp, NULL);
2513           }
2514           if (sfp->data.choice == SEQFEAT_PUB) {
2515             pdp = (PubdescPtr) sfp->data.value.ptrvalue;
2516             ValidatePubdesc (vsp, gcp, pdp);
2517           }
2518           if (sfp->cit != NULL) {
2519             ValidateSfpCit (vsp, gcp, sfp);
2520           }
2521           if (vsp->useSeqMgrIndexes) {
2522             if (SeqMgrGetDesiredFeature (gcp->entityID, NULL, 0, 0, sfp, &context) == NULL) {
2523               StringCpy (buf, "?");
2524               bsp = vsp->bsp;
2525               if (bsp != NULL) {
2526                 SeqIdWrite (bsp->id, buf, PRINTID_FASTA_LONG, sizeof (buf) - 1);
2527               }
2528               ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_UnindexedFeature, "Feature is not indexed on Bioseq %s", buf);
2529             } else {
2530               bsp = BioseqFindFromSeqLoc (sfp->location);
2531               if (bsp != NULL) {
2532                 sip = SeqLocId (sfp->location);
2533                 if (sip != NULL && sip->choice != SEQID_GI && sip->choice != SEQID_GIBBSQ && sip->choice != SEQID_GIBBMT) {
2534                   SeqIdWrite (sip, buf, PRINTID_FASTA_SHORT, sizeof (buf) - 1);
2535                   for (sip = bsp->id; sip != NULL; sip = sip->next) {
2536                     if (sip->choice == SEQID_GI || sip->choice == SEQID_GIBBSQ || sip->choice == SEQID_GIBBMT) continue;
2537                     SeqIdWrite (sip, tmp, PRINTID_FASTA_SHORT, sizeof (tmp) - 1);
2538                     if (StringICmp (buf, tmp) != 0) continue;
2539                     if (StringCmp (buf, tmp) == 0) continue;
2540                     ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_FeatureSeqIDCaseDifference,
2541                               "Sequence identifier in feature location differs in capitalization with identifier on Bioseq");
2542                   }
2543                 }
2544               }
2545             }
2546           }
2547         }
2548       }
2549     }
2550     if (limit == VALIDATE_ALL || limit == VALIDATE_FEAT) {
2551       SpellCheckSeqFeat (gcp);
2552     }
2553     break;
2554   case OBJ_SEQGRAPH :
2555     if (!vsp->onlyspell) {
2556       if (limit == VALIDATE_ALL || limit == VALIDATE_GRAPH) {
2557         sgp = (SeqGraphPtr) gcp->thisitem;
2558         if (sgp != NULL) {
2559           if (StringICmp (sgp->title, "Phrap Quality") == 0 ||
2560               StringICmp (sgp->title, "Phred Quality") == 0 ||
2561               StringICmp (sgp->title, "Gap4") == 0) {
2562             if (sgp->flags[2] == 3) {
2563               sip = SeqLocId (sgp->loc);
2564               if (sip != NULL) {
2565                 if (BioseqFindCore (sip) == NULL) {
2566                   SeqIdWrite (sip, buf, PRINTID_FASTA_LONG, sizeof (buf) - 1);
2567                   ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphBioseqId, "Bioseq not found for Graph location %s", buf);
2568                 }
2569               }
2570             }
2571           }
2572         }
2573       }
2574     }
2575     break;
2576   case OBJ_SEQDESC:
2577     if (limit == VALIDATE_ALL || limit == VALIDATE_DESC) {
2578       SpellCheckSeqDescr (gcp);
2579                           /**
2580               ValidateSeqDescr (gcp);
2581               **/
2582       sdp = (ValNodePtr) (gcp->thisitem);
2583       if (sdp != NULL) {
2584         if (sdp->choice == Seq_descr_source) {
2585           biop = (BioSourcePtr) sdp->data.ptrvalue;
2586           ValidateBioSource (vsp, gcp, biop, NULL, sdp);
2587         }
2588         if (sdp->choice == Seq_descr_pub) {
2589           pdp = (PubdescPtr) sdp->data.ptrvalue;
2590           ValidatePubdesc (vsp, gcp, pdp);
2591           LookForMultiplePubs (vsp, gcp, sdp);
2592         }
2593         if (sdp->choice == Seq_descr_user) {
2594           ValidateUserObject(vsp, (UserObjectPtr) sdp->data.ptrvalue);
2595         }
2596         if (sdp->choice == Seq_descr_comment) {
2597           str = (CharPtr) sdp->data.ptrvalue;
2598           if (StringHasNoText (str)) {
2599             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_MissingText, "Comment descriptor needs text");
2600           }
2601           if (SerialNumberInString (str)) {
2602             ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_SerialInComment,
2603                       "Comment may refer to reference by serial number - attach reference specific comments to the reference REMARK instead.");
2604           }
2605           if (StringLooksLikeFakeStructuredComment (str)) {
2606             ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_FakeStructuredComment,
2607                       "Comment may be formatted to look like a structured comment.");
2608           }
2609           for (vnp2 = sdp->next; vnp2 != NULL; vnp2 = vnp2->next) {
2610             if (vnp2->choice == Seq_descr_comment) {
2611               ptr = (CharPtr) vnp2->data.ptrvalue;
2612               if (StringDoesHaveText (ptr) && StringICmp (str, ptr) == 0) {
2613                 ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleComments, "Undesired multiple comment descriptors, identical text");
2614               }
2615             }
2616           }
2617         }
2618         if (sdp->choice == Seq_descr_mol_type) {
2619           ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "MolType descriptor is obsolete");
2620         }
2621         if (sdp->choice == Seq_descr_modif) {
2622           ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Modif descriptor is obsolete");
2623         }
2624         if (sdp->choice == Seq_descr_method) {
2625           ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Method descriptor is obsolete");
2626         }
2627         if (sdp->choice == Seq_descr_org) {
2628           ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "OrgRef descriptor is obsolete");
2629         }
2630       }
2631     }
2632     break;
2633   default:
2634     break;
2635 
2636   }
2637   return TRUE;
2638 }
2639 
2640 
/*
 * Turns the features listed in a discrepancy report into validator
 * messages.
 *
 * Every ClickableItem of type item_type is visited.  An item without an
 * item_list is followed into its subcategories; otherwise each OBJ_SEQFEAT
 * entry of the item_list becomes the current feature (vsp->sfp and the
 * entityID / thistype / itemID of vsp->gcp are set) and is reported through
 * ValidErr.  For DISC_SHORT_INTRON, features already present in vsp->sisfp
 * are not reported.
 *
 * The bssp / bsp / sfp / descr pointers of vsp are cleared on entry and
 * vsp->sfp is NULL again on return.
 *
 * msg is handed to ValidErr as the format string itself, so it must not
 * contain conversion specifiers.  It is not wrapped in "%s" because ValidErr
 * forwards its format argument to ShouldSuppressValidErr and AdjustSeverity.
 */
static void DiscrepanciesToValidationErrs (ValNodePtr discrepancy_list, Uint4 item_type, ValidStructPtr vsp, int severity, int code1, int code2, char *msg)
{
  ValNodePtr        entry;
  ValNodePtr        item;
  ValNodePtr        seen;
  ClickableItemPtr  cip;
  SeqFeatPtr        feat;
  Boolean           already_listed;

  if (discrepancy_list == NULL || vsp == NULL) {
    return;
  }

  vsp->bssp = NULL;
  vsp->bsp = NULL;
  vsp->sfp = NULL;
  vsp->descr = NULL;

  for (entry = discrepancy_list; entry != NULL; entry = entry->next) {
    cip = (ClickableItemPtr) entry->data.ptrvalue;
    if (cip == NULL || cip->clickable_item_type != item_type) {
      continue;
    }

    if (cip->item_list == NULL) {
      DiscrepanciesToValidationErrs (cip->subcategories, item_type, vsp, severity, code1, code2, msg);
      continue;
    }

    for (item = cip->item_list; item != NULL; item = item->next) {
      if (item->choice != OBJ_SEQFEAT) {
        continue;
      }

      /* make this feature the current context for ValidErr */
      feat = item->data.ptrvalue;
      vsp->sfp = feat;
      vsp->gcp->entityID = feat->idx.entityID;
      vsp->gcp->thistype = OBJ_SEQFEAT;
      vsp->gcp->itemID = feat->idx.itemID;

      already_listed = FALSE;
      if (item_type == DISC_SHORT_INTRON) {
        for (seen = vsp->sisfp; seen != NULL && ! already_listed; seen = seen->next) {
          if (seen->data.ptrvalue == feat) {
            already_listed = TRUE;
          }
        }
      }

      if (! already_listed) {
        ValidErr (vsp, severity, code1, code2, msg);
      }
      vsp->sfp = NULL;
    }
  }

  vsp->sfp = NULL;
}
2687 
2688 
/*
 * Runs the missing / non-unique gene locus tag discrepancy test on one
 * Seq-entry and reports DISC_GENE_MISSING_LOCUS_TAG results as
 * ERR_SEQ_FEAT_MissingGeneLocusTag warnings.
 * Does nothing when sep or vsp is NULL.
 */
static void ValidateGeneLocusTags (SeqEntryPtr sep, ValidStructPtr vsp)
{
  ValNode     sep_node;
  ValNodePtr  found = NULL;

  if (sep == NULL || vsp == NULL) {
    return;
  }

  /* the discrepancy code takes a list, so wrap sep in a single stack node */
  sep_node.next = NULL;
  sep_node.choice = 0;
  sep_node.data.ptrvalue = sep;

  AddDiscrepanciesForMissingOrNonUniqueGeneLocusTagsEx (&found, &sep_node, TRUE);

  DiscrepanciesToValidationErrs (found, DISC_GENE_MISSING_LOCUS_TAG, vsp, SEV_WARNING, ERR_SEQ_FEAT_MissingGeneLocusTag, "Missing gene locus tag");

  FreeClickableList (found);
}
2708 
2709 
/*
 * Runs the short intron discrepancy test on one Seq-entry (passing along
 * vsp->indexerVersion) and reports DISC_SHORT_INTRON results as
 * ERR_SEQ_FEAT_ShortIntron warnings.
 * Does nothing when sep or vsp is NULL.
 */
static void ValidateShortIntrons (SeqEntryPtr sep, ValidStructPtr vsp)
{
  ValNode     sep_node;
  ValNodePtr  found = NULL;

  if (sep == NULL || vsp == NULL) {
    return;
  }

  /* the discrepancy code takes a list, so wrap sep in a single stack node */
  sep_node.next = NULL;
  sep_node.choice = 0;
  sep_node.data.ptrvalue = sep;

  FindShortIntronsEx (&found, &sep_node, vsp->indexerVersion);

  DiscrepanciesToValidationErrs (found, DISC_SHORT_INTRON, vsp, SEV_WARNING, ERR_SEQ_FEAT_ShortIntron, "Introns should be at least 10 nt long");

  FreeClickableList (found);
}
2729 
2730 
/*
 * Walks a Seq-entry (recursing into the members of a Bioseq-set first) and
 * clears the caller's flags for what it finds:
 *   *no_pub      set to FALSE by any pub feature or pub descriptor
 *   *no_cit_sub  set to FALSE by a pub descriptor containing a PUB_Sub
 *   *no_biosrc   set to FALSE by any biosource feature or source descriptor
 * The flags are only ever cleared, never set.  Does nothing when any
 * argument is NULL or the entry is neither a Bioseq nor a Bioseq-set.
 */
static void LookForAnyPubAndOrg (SeqEntryPtr sep, BoolPtr no_pub, BoolPtr no_cit_sub, BoolPtr no_biosrc)
{
  BioseqPtr       bsp;
  BioseqSetPtr    bssp;
  PubdescPtr      pdp;
  SeqAnnotPtr     annot = NULL;
  ValNodePtr      descr = NULL;
  SeqFeatPtr      feat;
  SeqEntryPtr     member;
  ValNodePtr      pub;

  if (sep == NULL || no_pub == NULL || no_cit_sub == NULL || no_biosrc == NULL) {
    return;
  }

  if (IS_Bioseq (sep)) {
    bsp = (BioseqPtr) sep->data.ptrvalue;
    if (bsp == NULL) {
      return;
    }
    annot = bsp->annot;
    descr = bsp->descr;
  } else if (IS_Bioseq_set (sep)) {
    bssp = (BioseqSetPtr) sep->data.ptrvalue;
    if (bssp == NULL) {
      return;
    }
    for (member = bssp->seq_set; member != NULL; member = member->next) {
      LookForAnyPubAndOrg (member, no_pub, no_cit_sub, no_biosrc);
    }
    annot = bssp->annot;
    descr = bssp->descr;
  } else {
    return;
  }

  /* features: only annotations of type 1 are scanned */
  for (; annot != NULL; annot = annot->next) {
    if (annot->type != 1) {
      continue;
    }
    for (feat = (SeqFeatPtr) annot->data; feat != NULL; feat = feat->next) {
      if (feat->data.choice == SEQFEAT_PUB) {
        *no_pub = FALSE;
      } else if (feat->data.choice == SEQFEAT_BIOSRC) {
        *no_biosrc = FALSE;
      }
    }
  }

  /* descriptors */
  for (; descr != NULL; descr = descr->next) {
    if (descr->choice == Seq_descr_pub) {
      *no_pub = FALSE;
      pdp = (PubdescPtr) descr->data.ptrvalue;
      if (pdp == NULL) {
        continue;
      }
      for (pub = pdp->pub; pub != NULL; pub = pub->next) {
        if (pub->choice == PUB_Sub) {
          *no_cit_sub = FALSE;
        }
      }
    } else if (descr->choice == Seq_descr_source) {
      *no_biosrc = FALSE;
    }
  }
}
2792 
2793 typedef struct ftprob {
2794   Uint4    num_misplaced_features;
2795   Uint4    num_small_genome_set_misplaced;
2796   Uint4    num_archaic_locations;
2797   Uint4    num_archaic_products;
2798   Uint4    num_misplaced_graphs;
2799   Uint4    num_gene_feats;
2800   Uint4    num_gene_xrefs;
2801   Uint4    num_tpa_with_hist;
2802   Uint4    num_tpa_without_hist;
2803   Uint4    num_pseudo;
2804   Uint4    num_pseudogene;
2805   Uint4    first_taxid;
2806   Int2     num_super_kingdom;
2807   Boolean  has_gi;
2808   Boolean  loc_has_gi;
2809   Boolean  loc_has_just_accn;
2810   Boolean  loc_has_accn_ver;
2811   Boolean  prod_has_gi;
2812   Boolean  prod_has_just_accn;
2813   Boolean  prod_has_accn_ver;
2814   Boolean  mult_taxids;
2815   Boolean  super_kingdoms_different;
2816   CharPtr  super_kingdom_name;
2817 } FeatProb, PNTR FeatProbPtr;
2818 
/*
 * CheckFeatPacking
 *
 * Decides whether feature sfp, found while indexing bsp, is packaged in a
 * Seq-annot that belongs with bsp, and bumps one of the two counters if not.
 *
 *   - annot on a Bioseq: that Bioseq must be bsp, or a part whose parent
 *     (per SeqMgrGetParentOfPart) is bsp; otherwise
 *     *num_misplaced_features is incremented unless the owning Bioseq is
 *     non-NULL with a NULL id list.
 *   - annot on a Bioseq-set: the set must be an ancestor of bsp.  Walking
 *     up from bsp, reaching a small_genome_set first increments
 *     *num_small_genome_set_misplaced; running out of set ancestors
 *     increments *num_misplaced_features.
 *
 * Features whose parent is not a Seq-annot are ignored.  No argument is
 * NULL-checked here.
 */
static void CheckFeatPacking (BioseqPtr bsp, SeqFeatPtr sfp, Uint4Ptr num_misplaced_features, Uint4Ptr num_small_genome_set_misplaced)
{
  BioseqSetPtr  ancestor;
  SeqAnnotPtr   annot;
  BioseqSetPtr  holder;
  BioseqPtr     owner;

  if (sfp->idx.parenttype != OBJ_SEQANNOT) return;
  annot = (SeqAnnotPtr) sfp->idx.parentptr;
  if (annot == NULL) return;

  if (annot->idx.parenttype == OBJ_BIOSEQ) {
    /* if feature packaged on bioseq, must be target bioseq */
    owner = (BioseqPtr) annot->idx.parentptr;
    if (owner == bsp) return;
    if (SeqMgrGetParentOfPart (owner, NULL) == bsp) return;
    /* generated gap feature is an exception */
    if (owner == NULL || owner->id != NULL) {
      (*num_misplaced_features)++;
    }
    return;
  }

  if (annot->idx.parenttype != OBJ_BIOSEQSET) return;

  /* if feature packaged on set, set must contain bioseq */
  holder = (BioseqSetPtr) annot->idx.parentptr;
  if (holder == NULL) return;
  if (bsp->idx.parenttype != OBJ_BIOSEQSET) return;

  for (ancestor = (BioseqSetPtr) bsp->idx.parentptr;
       ancestor != NULL;
       ancestor = (BioseqSetPtr) ancestor->idx.parentptr) {
    if (ancestor == holder) return;
    if (ancestor->idx.parenttype != OBJ_BIOSEQSET) {
      /* top of the set hierarchy reached without finding holder */
      (*num_misplaced_features)++;
      return;
    }
    if (ancestor->_class == BioseqseqSet_class_small_genome_set) {
      (*num_small_genome_set_misplaced)++;
      return;
    }
  }
  (*num_misplaced_features)++;
}
2864 
IdIsArchaic(SeqIdPtr sip)2865 static Boolean IdIsArchaic (SeqIdPtr sip)
2866 
2867 {
2868   BioseqPtr  bsp;
2869   DbtagPtr   dbt;
2870   SeqIdPtr   id;
2871 
2872   if (sip == NULL) return FALSE;
2873   if (sip->choice != SEQID_LOCAL && sip->choice != SEQID_GENERAL) return FALSE;
2874   bsp = BioseqFind (sip);
2875   if (bsp == NULL) return FALSE;
2876   for (id = bsp->id; id != NULL; id = id->next) {
2877     switch (id->choice) {
2878       case SEQID_GENERAL :
2879         if (sip->choice == SEQID_LOCAL) {
2880           dbt = (DbtagPtr) id->data.ptrvalue;
2881           if (dbt != NULL && !IsSkippableDbtag(dbt)) {
2882             return TRUE;
2883           }
2884         }
2885         break;
2886       case SEQID_GI :
2887       case SEQID_GENBANK :
2888       case SEQID_EMBL :
2889       case SEQID_PATENT :
2890       case SEQID_OTHER :
2891       case SEQID_DDBJ :
2892       case SEQID_TPG :
2893       case SEQID_TPE :
2894       case SEQID_TPD :
2895       case SEQID_GPIPE :
2896         return TRUE;
2897       default :
2898         break;
2899     }
2900   }
2901   return FALSE;
2902 }
2903 
/*
 * CheckFeatLocAndProd
 *
 * Adds to fpp->num_archaic_products if the feature's product id is archaic
 * (see IdIsArchaic), and to fpp->num_archaic_locations if any interval of
 * the feature's location has an archaic id.  Each counter goes up by at
 * most one per feature.
 */
static void CheckFeatLocAndProd (SeqFeatPtr sfp, FeatProbPtr fpp)

{
  SeqLocPtr  part;

  if (sfp == NULL || fpp == NULL) return;

  if (sfp->product != NULL) {
    if (IdIsArchaic (SeqLocId (sfp->product))) {
      fpp->num_archaic_products++;
    }
  }

  for (part = SeqLocFindNext (sfp->location, NULL);
       part != NULL;
       part = SeqLocFindNext (sfp->location, part)) {
    if (! IdIsArchaic (SeqLocId (part))) continue;
    /* one hit is enough for this feature */
    fpp->num_archaic_locations++;
    break;
  }
}
2922 
/*
 * CheckGraphPacking
 *
 * Graph visitor callback; userdata is a FeatProbPtr.  For a graph whose
 * parent is a Seq-annot, increments num_misplaced_graphs unless that annot
 * sits on a Bioseq that is the graph's target (found from sgp->loc) or a
 * part whose parent is the target.  An annot with any other kind of parent
 * always counts as misplaced.
 */
static void CheckGraphPacking (SeqGraphPtr sgp, Pointer userdata)

{
  SeqAnnotPtr  annot;
  FeatProbPtr  fpp;
  BioseqPtr    owner;
  BioseqPtr    target;

  if (sgp == NULL || userdata == NULL) return;
  fpp = (FeatProbPtr) userdata;

  target = BioseqFindFromSeqLoc (sgp->loc);

  if (sgp->idx.parenttype != OBJ_SEQANNOT) return;
  annot = (SeqAnnotPtr) sgp->idx.parentptr;
  if (annot == NULL) return;

  if (annot->idx.parenttype == OBJ_BIOSEQ) {
    /* if graph packaged on bioseq, must be target bioseq */
    owner = (BioseqPtr) annot->idx.parentptr;
    if (owner == target || SeqMgrGetParentOfPart (owner, NULL) == target) {
      return;
    }
  }

  fpp->num_misplaced_graphs++;
}
2948 
CountMisplacedFeatures(BioseqPtr bsp,SeqMgrBioseqContextPtr bcontext)2949 static Boolean LIBCALLBACK CountMisplacedFeatures (BioseqPtr bsp, SeqMgrBioseqContextPtr bcontext)
2950 
2951 {
2952   SeqMgrFeatContext  fcontext;
2953   FeatProbPtr        fpp;
2954   SeqFeatPtr         sfp;
2955 
2956   fpp = (FeatProbPtr) bcontext->userdata;
2957   sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
2958   while (sfp != NULL) {
2959     if (! fcontext.ts_image) {
2960       CheckFeatPacking (bsp, sfp, &(fpp->num_misplaced_features), &(fpp->num_small_genome_set_misplaced));
2961       CheckFeatLocAndProd (sfp, fpp);
2962     }
2963     sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
2964   }
2965 
2966   return TRUE;
2967 }
2968 
/*
 * CountGeneXrefs
 *
 * Feature visitor callback; userdata is a FeatProbPtr.  Counts gene
 * features in num_gene_feats and, independently, features that carry a
 * gene xref which is not a suppressing one in num_gene_xrefs.
 */
static void CountGeneXrefs (SeqFeatPtr sfp, Pointer userdata)

{
  FeatProbPtr  fpp;
  GeneRefPtr   xref;

  if (sfp == NULL || userdata == NULL) return;
  fpp = (FeatProbPtr) userdata;

  if (sfp->data.choice == SEQFEAT_GENE) {
    fpp->num_gene_feats++;
  }

  xref = SeqMgrGetGeneXref (sfp);
  if (xref != NULL && ! SeqMgrGeneIsSuppressed (xref)) {
    fpp->num_gene_xrefs++;
  }
}
2987 
/*
 * CountSfpLocIdTypes
 *
 * Seq-id visitor callback for feature locations; userdata is a FeatProbPtr.
 * Records which id style was seen:
 *   gi                                   -> loc_has_gi
 *   text accession with version < 1      -> loc_has_just_accn
 *   text accession with version >= 1     -> loc_has_accn_ver
 * Text ids with no accession text, and all other id types, are ignored.
 */
static void CountSfpLocIdTypes (SeqIdPtr sip, Pointer userdata)

{
  FeatProbPtr   fpp;
  TextSeqIdPtr  tsip;

  if (sip == NULL || userdata == NULL) return;
  fpp = (FeatProbPtr) userdata;

  switch (sip->choice) {
    case SEQID_GI :
      fpp->loc_has_gi = TRUE;
      return;
    case SEQID_GENBANK :
    case SEQID_EMBL :
    case SEQID_DDBJ :
    case SEQID_TPG :
    case SEQID_TPE :
    case SEQID_TPD :
    case SEQID_OTHER :
      /* text-style id, examined below */
      break;
    default :
      return;
  }

  tsip = (TextSeqIdPtr) sip->data.ptrvalue;
  if (tsip == NULL) return;
  if (! StringDoesHaveText (tsip->accession)) return;

  if (tsip->version >= 1) {
    //LCOV_EXCL_START
    //value not actually used anywhere
    fpp->loc_has_accn_ver = TRUE;
    //LCOV_EXCL_STOP
  } else {
    fpp->loc_has_just_accn = TRUE;
  }
}
3026 
/*
 * CountSfpProdIdTypes
 *
 * Seq-id visitor callback for feature products; userdata is a FeatProbPtr.
 * Records which id style was seen:
 *   gi                                   -> prod_has_gi
 *   text accession with version < 1      -> prod_has_just_accn
 *   text accession with version >= 1     -> prod_has_accn_ver
 * Text ids with no accession text, and all other id types, are ignored.
 */
static void CountSfpProdIdTypes (SeqIdPtr sip, Pointer userdata)

{
  FeatProbPtr   fpp;
  TextSeqIdPtr  tsip;

  if (sip == NULL || userdata == NULL) return;
  fpp = (FeatProbPtr) userdata;

  switch (sip->choice) {
    case SEQID_GI :
      fpp->prod_has_gi = TRUE;
      return;
    case SEQID_GENBANK :
    case SEQID_EMBL :
    case SEQID_DDBJ :
    case SEQID_TPG :
    case SEQID_TPE :
    case SEQID_TPD :
    case SEQID_OTHER :
      /* text-style id, examined below */
      break;
    default :
      return;
  }

  tsip = (TextSeqIdPtr) sip->data.ptrvalue;
  if (tsip == NULL) return;
  if (! StringDoesHaveText (tsip->accession)) return;

  if (tsip->version >= 1) {
    fpp->prod_has_accn_ver = TRUE;
  } else {
    fpp->prod_has_just_accn = TRUE;
  }
}
3062 
/*
 * CountFeatLocIdTypes
 *
 * Feature visitor callback.  Visits every Seq-id in the feature's location
 * with CountSfpLocIdTypes and every Seq-id in its product with
 * CountSfpProdIdTypes, forwarding userdata unchanged to both.
 */
static void CountFeatLocIdTypes (SeqFeatPtr sfp, Pointer userdata)

{
  if (sfp != NULL && userdata != NULL) {
    VisitSeqIdsInSeqLoc (sfp->location, userdata, CountSfpLocIdTypes);
    VisitSeqIdsInSeqLoc (sfp->product, userdata, CountSfpProdIdTypes);
  }
}
3071 
HasTpaUserObject(BioseqPtr bsp)3072 NLM_EXTERN Boolean HasTpaUserObject (BioseqPtr bsp)
3073 
3074 {
3075   SeqMgrDescContext  context;
3076   UserObjectPtr      uop;
3077   ObjectIdPtr        oip;
3078   ValNodePtr         vnp;
3079 
3080   if (bsp == NULL) return FALSE;
3081   vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &context);
3082   while (vnp != NULL) {
3083     uop = (UserObjectPtr) vnp->data.ptrvalue;
3084     if (uop != NULL) {
3085       oip = uop->type;
3086       if (oip != NULL && StringICmp (oip->str, "TpaAssembly") == 0) return TRUE;
3087     }
3088     vnp = SeqMgrGetNextDescriptor (bsp, vnp, Seq_descr_user, &context);
3089   }
3090   return FALSE;
3091 }
3092 
/*
 * CheckTpaHist
 *
 * Bioseq visitor callback; userdata is a FeatProbPtr.  Sets has_gi when the
 * Bioseq carries a gi id.  For Bioseqs with a TpaAssembly user object,
 * increments num_tpa_with_hist when bsp->hist has an assembly and
 * num_tpa_without_hist otherwise.
 */
static void CheckTpaHist (BioseqPtr bsp, Pointer userdata)

{
  FeatProbPtr  fpp;
  SeqHistPtr   hist;
  SeqIdPtr     id;

  if (bsp == NULL || userdata == NULL) return;
  fpp = (FeatProbPtr) userdata;

  id = bsp->id;
  while (id != NULL) {
    if (id->choice == SEQID_GI) {
      fpp->has_gi = TRUE;
    }
    id = id->next;
  }

  if (! HasTpaUserObject (bsp)) return;

  hist = bsp->hist;
  if (hist == NULL || hist->assembly == NULL) {
    fpp->num_tpa_without_hist++;
  } else {
    fpp->num_tpa_with_hist++;
  }
}
3115 
IsNoncuratedRefSeq(BioseqPtr bsp,ErrSev * sev)3116 static Boolean IsNoncuratedRefSeq (BioseqPtr bsp, ErrSev *sev)
3117 
3118 {
3119   SeqIdPtr      sip;
3120   TextSeqIdPtr  tsip;
3121 
3122   if (bsp == NULL) return FALSE;
3123   for (sip = bsp->id; sip != NULL; sip = sip->next) {
3124     if (sip->choice == SEQID_OTHER) {
3125       tsip = (TextSeqIdPtr) sip->data.ptrvalue;
3126       if (tsip != NULL && tsip->accession != NULL) {
3127         if (StringNCmp (tsip->accession, "NM_", 3) == 0 ||
3128             StringNCmp (tsip->accession, "NP_", 3) == 0 ||
3129             StringNCmp (tsip->accession, "NG_", 3) == 0 ||
3130             StringNCmp (tsip->accession, "NR_", 3) == 0) {
3131           *sev = SEV_WARNING;
3132           return FALSE;
3133         }
3134         return TRUE;
3135       }
3136     }
3137   }
3138   return FALSE;
3139 }
3140 
IsGpipe(BioseqPtr bsp)3141 static Boolean IsGpipe (BioseqPtr bsp)
3142 
3143 {
3144   SeqIdPtr  sip;
3145 
3146   if (bsp == NULL) return FALSE;
3147   for (sip = bsp->id; sip != NULL; sip = sip->next) {
3148     if (sip->choice == SEQID_GPIPE) return TRUE;
3149   }
3150   return FALSE;
3151 }
3152 
IsWgsContig(BioseqPtr bsp)3153 static Boolean IsWgsContig (BioseqPtr bsp)
3154 
3155 {
3156   MolInfoPtr   mip;
3157   SeqDescrPtr  sdp;
3158 
3159   if (bsp == NULL) return FALSE;
3160   if (bsp->repr == Seq_repr_virtual) return FALSE;
3161   sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_molinfo, NULL);
3162   if (sdp == NULL) return FALSE;
3163   mip = (MolInfoPtr) sdp->data.ptrvalue;
3164   if (mip == NULL) return FALSE;
3165   if (mip->tech == MI_TECH_wgs) return TRUE;
3166   return FALSE;
3167 }
3168 
IsTsaContig(BioseqPtr bsp)3169 static Boolean IsTsaContig (BioseqPtr bsp)
3170 
3171 {
3172   MolInfoPtr   mip;
3173   SeqDescrPtr  sdp;
3174 
3175   if (bsp == NULL) return FALSE;
3176   /* if (bsp->repr == Seq_repr_virtual) return FALSE; */
3177   sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_molinfo, NULL);
3178   if (sdp == NULL) return FALSE;
3179   mip = (MolInfoPtr) sdp->data.ptrvalue;
3180   if (mip == NULL) return FALSE;
3181   if (mip->tech == MI_TECH_tsa) return TRUE;
3182   return FALSE;
3183 }
3184 
3185 typedef struct vfcdata {
3186   ValNodePtr      uids;
3187   ValNodePtr      unpub;
3188   ValNodePtr      publshd;
3189   ValNodePtr      serial;
3190   ValidStructPtr  vsp;
3191 } VfcData, PNTR VfcPtr;
3192 
SkipSerialOrUIDPub(ValNodePtr vnp)3193 static Boolean SkipSerialOrUIDPub (ValNodePtr vnp)
3194 
3195 {
3196   CitGenPtr  cgp;
3197 
3198   if (vnp == NULL || vnp->next == NULL) return FALSE;
3199   if (vnp->choice == PUB_Muid || vnp->choice == PUB_Muid) return TRUE;
3200   if (vnp->choice != PUB_Gen) return FALSE;
3201   cgp = (CitGenPtr) vnp->data.ptrvalue;
3202   if (cgp == NULL) return FALSE;
3203   if (StringNICmp ("BackBone id_pub", cgp->cit, 15) == 0) return FALSE;
3204   if (cgp->cit == NULL && cgp->journal == NULL && cgp->date == NULL && cgp->serial_number) return TRUE;
3205   return FALSE;
3206 }
3207 
/*
 * MakePubTags
 *
 * Pubdesc visitor callback; userdata is a VfcPtr.  For one Pubdesc it
 *   1. remembers the last muid and pmid seen in the pub chain and pushes
 *      every positive Cit-gen serial number onto vfp->serial;
 *   2. pushes the pmid (choice 1) and then the muid (choice 2), when
 *      non-zero, onto vfp->uids;
 *   3. builds a label with PubLabelUnique from the first pub node that
 *      SkipSerialOrUIDPub does not skip and pushes a copy onto
 *      vfp->publshd if a pmid or muid was present, else onto vfp->unpub.
 * All pushes are at the head of the respective list.
 */
static void MakePubTags (PubdescPtr pdp, Pointer userdata)

{
  Char            buf [1024];
  CitGenPtr       cgp;
  ValNodePtr      label;
  Int4            muid = 0, pmid = 0;
  ValNodePtr      node;
  ValNodePtr      pub;
  ValNodePtr PNTR target;
  VfcPtr          vfp;

  if (pdp == NULL || userdata == NULL) return;
  vfp = (VfcPtr) userdata;

  /* pass 1: collect uids and serial numbers */
  pub = pdp->pub;
  while (pub != NULL) {
    if (pub->choice == PUB_Muid) {
      muid = pub->data.intvalue;
    } else if (pub->choice == PUB_PMid) {
      pmid = pub->data.intvalue;
    } else if (pub->choice == PUB_Gen) {
      cgp = (CitGenPtr) pub->data.ptrvalue;
      if (cgp != NULL && cgp->serial_number > 0) {
        //LCOV_EXCL_START
        //serial numbers stripped by basic cleanup
        node = ValNodeNew (NULL);
        if (node != NULL) {
          node->data.intvalue = (Int4) cgp->serial_number;
          node->next = vfp->serial;
          vfp->serial = node;
        }
        //LCOV_EXCL_STOP
      }
    }
    pub = pub->next;
  }

  if (pmid != 0) {
    node = ValNodeNew (NULL);
    if (node != NULL) {
      node->choice = 1;
      node->data.intvalue = pmid;
      node->next = vfp->uids;
      vfp->uids = node;
    }
  }
  if (muid != 0) {
    node = ValNodeNew (NULL);
    if (node != NULL) {
      node->choice = 2;
      node->data.intvalue = muid;
      node->next = vfp->uids;
      vfp->uids = node;
    }
  }

  /* pass 2: label the first pub that is not a bare uid / serial-only Cit-gen */
  for (pub = pdp->pub; pub != NULL; pub = pub->next) {
    if (! SkipSerialOrUIDPub (pub)) break;
  }
  if (pub == NULL) return;
  if (PubLabelUnique (pub, buf, sizeof (buf) - 1, OM_LABEL_CONTENT, TRUE) <= 0) return;

  label = ValNodeCopyStr (NULL, 0, buf);
  if (label == NULL) return;

  if (pmid != 0 || muid != 0) {
    target = &(vfp->publshd);
  } else {
    target = &(vfp->unpub);
  }
  label->next = *target;
  *target = label;
}
3277 
/*
 * CheckOneCit
 *
 * Checks one citation (ppr) on feature sfp against the publication tags
 * collected in vfp by MakePubTags and reports problems through vfp->vsp as
 * SEV_WARNING ERR_SEQ_FEAT_FeatureCitationProblem:
 *
 *   - PMid / Muid citation: warns if its uid is not in vfp->uids (the
 *     comparison does not distinguish pmid from muid entries).
 *   - PUB_Equiv citation: ignored.
 *   - anything else: a label is built with PubLabelUnique (a trailing '>'
 *     is dropped) and compared, case-insensitively over the shorter of the
 *     two lengths, first against vfp->unpub (match: OK), then against
 *     vfp->publshd (match: "needs to be updated to published uid");
 *     no match at all: "refers to a publication not in the record".
 *
 * While checking, the gather context (if any) and vsp->sfp are pointed at
 * sfp so ValidErr can attribute the message.
 *
 * Changes from the previous version:
 *   - vsp->sfp is now cleared on every exit after it has been set; before,
 *     the "match found" and PUB_Equiv returns left it pointing at sfp while
 *     the error paths cleared it.
 *   - buf is initialised to the empty string before PubLabelUnique, so
 *     StringLen never reads an uninitialised buffer should PubLabelUnique
 *     write nothing (its return value is not checked here).
 */
static void CheckOneCit (SeqFeatPtr sfp, ValNodePtr ppr, VfcPtr vfp)

{
  Char              buf [1024];
  GatherContextPtr  gcp;
  size_t            len, lgth;
  CharPtr           str;
  Int4              uid;
  ValNodePtr        vnp;
  ValidStructPtr    vsp;

  if (sfp == NULL || ppr == NULL || vfp == NULL) return;
  vsp = vfp->vsp;
  if (vsp == NULL) return;
  gcp = vsp->gcp;

  if (gcp != NULL) {
    gcp->entityID = sfp->idx.entityID;
    gcp->itemID = sfp->idx.itemID;
    gcp->thistype = OBJ_SEQFEAT;
  }
  vsp->sfp = sfp;

  if (ppr->choice == PUB_PMid || ppr->choice == PUB_Muid) {
    uid = ppr->data.intvalue;
    for (vnp = vfp->uids; vnp != NULL; vnp = vnp->next) {
      if (uid == vnp->data.intvalue) goto done;
    }
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureCitationProblem,
              "Citation on feature refers to uid [%ld] not on a publication in the record", (long) uid);

  } else if (ppr->choice != PUB_Equiv) {
    buf [0] = '\0';
    PubLabelUnique (ppr, buf, sizeof (buf) - 1, OM_LABEL_CONTENT, TRUE);
    lgth = StringLen (buf);
    if (lgth > 0 && buf [lgth - 1] == '>') {
      buf [lgth - 1] = '\0';
      lgth--;
    }
    /* a matching unpublished label means the citation is fine as is */
    for (vnp = vfp->unpub; vnp != NULL; vnp = vnp->next) {
      str = (CharPtr) vnp->data.ptrvalue;
      if (StringHasNoText (str)) continue;
      len = MIN (lgth, StringLen (str));
      if (StringNICmp (str, buf, len) == 0) goto done;
    }
    /* a matching published label means the citation should use the uid */
    for (vnp = vfp->publshd; vnp != NULL; vnp = vnp->next) {
      str = (CharPtr) vnp->data.ptrvalue;
      if (StringHasNoText (str)) continue;
      len = MIN (lgth, StringLen (str));
      if (StringNICmp (str, buf, len) == 0) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureCitationProblem,
                  "Citation on feature needs to be updated to published uid");
        goto done;
      }
    }
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureCitationProblem,
              "Citation on feature refers to a publication not in the record");
  }

done:
  vsp->sfp = NULL;
}
3342 
/*
 * CheckFeatCits
 *
 * Feature visitor callback; userdata is a VfcPtr.  Runs CheckOneCit on
 * every pub in the chain stored in the data.ptrvalue of the feature's
 * first cit node.  Features without a cit are ignored.
 */
static void CheckFeatCits (SeqFeatPtr sfp, Pointer userdata)

{
  ValNodePtr  pub;
  VfcPtr      vfp;

  if (sfp == NULL || sfp->cit == NULL || userdata == NULL) return;
  vfp = (VfcPtr) userdata;

  pub = sfp->cit->data.ptrvalue;
  while (pub != NULL) {
    CheckOneCit (sfp, pub, vfp);
    pub = pub->next;
  }
}
3357 
3358 //LCOV_EXCL_START
3359 //serial numbers are removed during basic cleanup
/*
 * CheckForCollidingSerials
 *
 * Scans list (intvalue nodes; adjacent equal values are treated as
 * collisions, so the caller is expected to pass it sorted) and emits one
 * SEV_WARNING ERR_GENERIC_CollidingSerialNumbers per run of equal serial
 * numbers.  The gather context's itemID and thistype are zeroed for the
 * duration of the scan and restored afterwards.
 *
 * Does nothing if any argument is NULL.
 */
static void CheckForCollidingSerials (ValidStructPtr vsp, GatherContextPtr gcp, ValNodePtr list)

{
  ValNodePtr  cursor;
  Int4        prev_serial, this_serial;
  Uint4       saved_itemid = 0;
  Uint2       saved_itemtype = 0;

  if (vsp == NULL || gcp == NULL || list == NULL) return;

  saved_itemid = gcp->itemID;
  saved_itemtype = gcp->thistype;
  gcp->itemID = 0;
  gcp->thistype = 0;

  prev_serial = (Int4) list->data.intvalue;
  cursor = list->next;
  while (cursor != NULL) {
    this_serial = (Int4) cursor->data.intvalue;
    if (this_serial != prev_serial) {
      prev_serial = this_serial;
      cursor = cursor->next;
      continue;
    }

    ValidErr (vsp, SEV_WARNING, ERR_GENERIC_CollidingSerialNumbers,
              "Multiple publications have serial number %ld", (long) this_serial);

    /* skip the rest of this run so it is reported only once */
    while (cursor != NULL && cursor->data.intvalue == prev_serial) {
      cursor = cursor->next;
    }
    if (cursor == NULL) break;

    /* cursor is the first node of the next run; resume after it */
    prev_serial = cursor->data.intvalue;
    cursor = cursor->next;
  }

  gcp->itemID = saved_itemid;
  gcp->thistype = saved_itemtype;
}
3403 //LCOV_EXCL_STOP
3404 
/*
 * ValidateFeatCits
 *
 * Validates feature citations across sep:
 *   1. MakePubTags is run over every Pubdesc to collect uids, labels and
 *      serial numbers;
 *   2. CheckFeatCits is run over every feature to compare its citations
 *      against those tags;
 *   3. the collected serial numbers are sorted and checked for collisions.
 * All collected lists are freed before returning.
 *
 * vsp's context fields are reset for the run, with vsp->bsp set to the
 * first Bioseq in sep (or NULL).
 *
 * NOTE(review): vsp->gcp is left pointing at this function's local
 * GatherContext on return; callers presumably reassign it before the next
 * use - confirm.
 */
static void ValidateFeatCits (SeqEntryPtr sep, ValidStructPtr vsp)

{
  BioseqPtr      bsp = NULL;
  SeqEntryPtr    first;
  GatherContext  gc;
  VfcData        tags;

  if (sep == NULL || vsp == NULL) return;

  MemSet ((Pointer) &gc, 0, sizeof (GatherContext));
  MemSet ((Pointer) &tags, 0, sizeof (VfcData));
  tags.vsp = vsp;

  first = FindNthBioseq (sep, 1);
  if (first != NULL && IS_Bioseq (first)) {
    bsp = (BioseqPtr) first->data.ptrvalue;
  }

  vsp->gcp = &gc;
  vsp->bssp = NULL;
  vsp->bsp = bsp;
  vsp->sfp = NULL;
  vsp->descr = NULL;

  VisitPubdescsInSep (sep, (Pointer) &tags, MakePubTags);
  VisitFeaturesInSep (sep, (Pointer) &tags, CheckFeatCits);

  /* restore a clean context before the serial-number report */
  vsp->bssp = NULL;
  vsp->bsp = bsp;
  vsp->sfp = NULL;
  vsp->descr = NULL;

  tags.serial = ValNodeSort (tags.serial, SortByIntvalue);
  CheckForCollidingSerials (vsp, vsp->gcp, tags.serial);

  ValNodeFree (tags.uids);
  ValNodeFreeData (tags.unpub);
  ValNodeFreeData (tags.publshd);
  ValNodeFree (tags.serial);
}
3445 
/*
 * ValidateFeatIDs
 *
 * Walks the entity's featsByFeatID index (taken from the object manager's
 * extradata for entityID) and reports SEV_REJECT
 * ERR_SEQ_FEAT_CollidingFeatureIDs for every entry whose fid equals,
 * case-insensitively, the fid of the entry before it.  The error is
 * attributed to the later feature of each colliding pair.
 *
 * Returns silently if sep or vsp is NULL, entityID is 0, or no feature-id
 * index is available.
 *
 * NOTE(review): vsp->gcp is left pointing at this function's local
 * GatherContext, and vsp->sfp at the last reported feature, on return;
 * callers presumably reassign them - confirm.
 */
static void ValidateFeatIDs (SeqEntryPtr sep, Uint2 entityID, ValidStructPtr vsp)

{
  BioseqPtr          bsp = NULL;
  BioseqExtraPtr     bspextra;
  Int4               count;
  SMFidItemPtr       entry;
  SMFeatItemPtr      feat;
  SMFidItemPtr PNTR  fids;
  SeqEntryPtr        first;
  GatherContext      gc;
  Int4               idx;
  ObjMgrDataPtr      omdp;
  CharPtr            previd = NULL;
  SeqFeatPtr         sfp;

  if (sep == NULL || entityID < 1 || vsp == NULL) return;

  omdp = ObjMgrGetData (entityID);
  if (omdp == NULL) return;
  bspextra = (BioseqExtraPtr) omdp->extradata;
  if (bspextra == NULL) return;

  fids = bspextra->featsByFeatID;
  count = bspextra->numfids;
  if (fids == NULL || count < 1) return;

  first = FindNthBioseq (sep, 1);
  if (first != NULL && IS_Bioseq (first)) {
    bsp = (BioseqPtr) first->data.ptrvalue;
  }

  MemSet ((Pointer) &gc, 0, sizeof (GatherContext));
  vsp->gcp = &gc;
  vsp->bssp = NULL;
  vsp->bsp = bsp;
  vsp->sfp = NULL;
  vsp->descr = NULL;

  for (idx = 0; idx < count; idx++) {
    entry = fids [idx];
    if (entry == NULL) continue;

    if (StringDoesHaveText (previd) && StringICmp (entry->fid, previd) == 0) {
      feat = entry->feat;
      if (feat == NULL) continue;
      sfp = feat->sfp;
      if (sfp == NULL) continue;

      /* point the error context at the colliding feature */
      gc.entityID = sfp->idx.entityID;
      gc.itemID = sfp->idx.itemID;
      gc.thistype = OBJ_SEQFEAT;
      vsp->sfp = sfp;
      ValidErr (vsp, SEV_REJECT, ERR_SEQ_FEAT_CollidingFeatureIDs,
                "Colliding feature ID %s", previd);
    }

    previd = entry->fid;
  }
}
3505 
3506 typedef struct vsicdata {
3507   ValidStructPtr  vsp;
3508   ValNodePtr      headid;
3509   ValNodePtr      tailid;
3510 } VsicData, PNTR VsicDataPtr;
3511 
/*
 * CaptureTextSeqIDs
 *
 * Bioseq visitor callback; userdata is a VsicDataPtr.  Appends the
 * PRINTID_FASTA_SHORT text of every id on bsp to the headid/tailid list,
 * skipping gi, gibbsq and gibbmt ids and anything IsNCBIFileID accepts.
 *
 * Change from the previous version: when ValNodeCopyStr returns NULL the
 * id is now skipped.  Previously the NULL was stored into tailid, which
 * detached the tail pointer from the list already hanging off headid.
 */
static void CaptureTextSeqIDs (BioseqPtr bsp, Pointer userdata)

{
  Char         buf [200];
  SeqIdPtr     sip;
  VsicDataPtr  vdp;
  ValNodePtr   vnp;

  if (bsp == NULL || userdata == NULL) return;
  vdp = (VsicDataPtr) userdata;

  for (sip = bsp->id; sip != NULL; sip = sip->next) {
    if (sip->choice == SEQID_GI || sip->choice == SEQID_GIBBSQ || sip->choice == SEQID_GIBBMT) continue;
    if (IsNCBIFileID (sip)) continue;
    SeqIdWrite (sip, buf, PRINTID_FASTA_SHORT, sizeof (buf) - 1);
    /* append after the current tail rather than walking from the head */
    vnp = ValNodeCopyStr (&(vdp->tailid), 0, buf);
    if (vnp == NULL) continue;
    if (vdp->headid == NULL) {
      vdp->headid = vnp;
    }
    vdp->tailid = vnp;
  }
}
3534 
/*
 * UniqueValNodeCaseSensitive
 *
 * Removes, in place, every node whose string (data.ptrvalue) compares equal
 * under StringCmp to the string of the nearest preceding node that was
 * kept.  Removed nodes are released with ValNodeFreeData.  Only adjacent
 * duplicates are detected.  Returns list (the head is never removed), or
 * NULL for a NULL list.
 */
static ValNodePtr UniqueValNodeCaseSensitive (ValNodePtr list)

{
  ValNodePtr  following;
  ValNodePtr  kept;
  ValNodePtr  vnp;

  if (list == NULL) return NULL;

  kept = list;
  for (vnp = list->next; vnp != NULL; vnp = following) {
    following = vnp->next;
    if (StringCmp ((CharPtr) kept->data.ptrvalue, (CharPtr) vnp->data.ptrvalue) == 0) {
      /* duplicate of the last kept node: unlink and free it */
      vnp->next = NULL;
      kept->next = following;
      ValNodeFreeData (vnp);
    } else {
      kept = vnp;
    }
  }

  return list;
}
3564 
3565 //LCOV_EXCL_START
3566 //C++ Toolkit automatically resolves Seq-ids that match, even without case
/*
 * ValidateSeqIdCase
 *
 * Collects the text form of the Seq-ids in sep (via CaptureTextSeqIDs),
 * sorts them with SortVnpByString, drops exact adjacent duplicates, and
 * reports SEV_REJECT ERR_SEQ_INST_CaseDifferenceInSeqID for each adjacent
 * pair that is equal ignoring case but not equal exactly.
 *
 * vsp's context fields are reset for the run (vsp->bsp = first Bioseq in
 * sep) and bssp / bsp / sfp / descr are cleared again before returning.
 *
 * NOTE(review): vsp->gcp is left pointing at this function's local
 * GatherContext on return; callers presumably reassign it - confirm.
 */
static void ValidateSeqIdCase (SeqEntryPtr sep, ValidStructPtr vsp)

{
  BioseqPtr      bsp = NULL;
  CharPtr        curr;
  SeqEntryPtr    first;
  GatherContext  gc;
  CharPtr        prev;
  VsicData       vd;
  ValNodePtr     vnp;

  if (sep == NULL || vsp == NULL) return;

  first = FindNthBioseq (sep, 1);
  if (first != NULL && IS_Bioseq (first)) {
    bsp = (BioseqPtr) first->data.ptrvalue;
  }

  MemSet ((Pointer) &gc, 0, sizeof (GatherContext));
  MemSet ((Pointer) &vd, 0, sizeof (VsicData));
  vd.vsp = vsp;

  vsp->gcp = &gc;
  vsp->bssp = NULL;
  vsp->bsp = bsp;
  vsp->sfp = NULL;
  vsp->descr = NULL;

  VisitBioseqsInSep (sep, (Pointer) &vd, CaptureTextSeqIDs);
  vd.headid = ValNodeSort (vd.headid, SortVnpByString);
  vd.headid = UniqueValNodeCaseSensitive (vd.headid);

  /* compare each id string with the one immediately before it */
  prev = NULL;
  for (vnp = vd.headid; vnp != NULL; vnp = vnp->next) {
    curr = (CharPtr) vnp->data.ptrvalue;
    if (! StringHasNoText (curr) &&
        ! StringHasNoText (prev) &&
        StringICmp (curr, prev) == 0 &&
        StringCmp (curr, prev) != 0) {
      ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_CaseDifferenceInSeqID,
                "Sequence identifier differs only by case - %s and %s", curr, prev);
    }
    prev = curr;
  }

  vsp->bssp = NULL;
  vsp->bsp = NULL;
  vsp->sfp = NULL;
  vsp->descr = NULL;

  ValNodeFreeData (vd.headid);
}
3620 //LCOV_EXCL_STOP
3621 
/*
 * LookForBioseqFields
 *
 * Bioseq visitor callback; userdata is a ValidStructPtr.  Inspects every
 * Seq-id on bsp and raises the matching "..._in_sep" flags on vsp:
 *
 *   EMBL            is_embl_tpe_in_sep, is_embl_ddbj_in_sep, is_insd_in_sep
 *   DDBJ            is_embl_ddbj_in_sep, is_insd_in_sep
 *   GenBank, TPG    is_insd_in_sep
 *     (for the four above, a 6-character accession also sets
 *      is_old_gb_in_sep)
 *   TPE             is_embl_tpe_in_sep, is_insd_in_sep
 *   TPD             is_insd_in_sep
 *   patent          is_patent_in_sep
 *   other (RefSeq)  is_refseq_in_sep; accession starting "WP_"
 *                   (case-insensitive) also sets is_wp_in_sep
 *   gpipe           is_gpipe_in_sep
 *   general         has_gnl_prot_sep, only on protein Bioseqs and only for
 *                   non-skippable Dbtags
 *   PDB             is_pdb_in_sep
 *   gi              has_gi_or_accn_ver
 *
 * In addition has_gi_or_accn_ver is set for a text id with accession text
 * and version >= 1, and only_lcl_gnl_in_sep is cleared by any id that is
 * neither local nor general.
 */
static void LookForBioseqFields (BioseqPtr bsp, Pointer userdata)

{
  DbtagPtr        dbt;
  Boolean         insd_accn_id;
  SeqIdPtr        sip;
  TextSeqIdPtr    tsip = NULL;
  ValidStructPtr  vsp;

  if (bsp == NULL || userdata == NULL) return;
  vsp = (ValidStructPtr) userdata;

  for (sip = bsp->id; sip != NULL; sip = sip->next) {
    insd_accn_id = FALSE;

    switch (sip->choice) {
      case SEQID_EMBL :
        vsp->is_embl_tpe_in_sep = TRUE;
        vsp->is_embl_ddbj_in_sep = TRUE;
        insd_accn_id = TRUE;
        break;
      case SEQID_DDBJ :
        vsp->is_embl_ddbj_in_sep = TRUE;
        insd_accn_id = TRUE;
        break;
      case SEQID_GENBANK :
      case SEQID_TPG :
        insd_accn_id = TRUE;
        break;
      case SEQID_TPE :
        vsp->is_embl_tpe_in_sep = TRUE;
        vsp->is_insd_in_sep = TRUE;
        tsip = (TextSeqIdPtr) sip->data.ptrvalue;
        break;
      case SEQID_TPD :
        vsp->is_insd_in_sep = TRUE;
        tsip = (TextSeqIdPtr) sip->data.ptrvalue;
        break;
      case SEQID_PATENT :
        vsp->is_patent_in_sep = TRUE;
        break;
      case SEQID_OTHER :
        vsp->is_refseq_in_sep = TRUE;
        tsip = (TextSeqIdPtr) sip->data.ptrvalue;
        if (tsip != NULL && tsip->accession != NULL &&
            StringNICmp (tsip->accession, "WP_", 3) == 0) {
          vsp->is_wp_in_sep = TRUE;
        }
        break;
      case SEQID_GPIPE :
        vsp->is_gpipe_in_sep = TRUE;
        break;
      case SEQID_GENERAL :
        if (! ISA_aa (bsp->mol)) break;
        dbt = (DbtagPtr) sip->data.ptrvalue;
        if (dbt == NULL) break;
        if (IsSkippableDbtag (dbt)) break;
        vsp->has_gnl_prot_sep = TRUE;
        break;
      case SEQID_PDB :
        vsp->is_pdb_in_sep = TRUE;
        break;
      case SEQID_GI :
        vsp->has_gi_or_accn_ver = TRUE;
        break;
      default :
        break;
    }

    /* shared tail for EMBL / DDBJ / GenBank / TPG */
    if (insd_accn_id) {
      vsp->is_insd_in_sep = TRUE;
      tsip = (TextSeqIdPtr) sip->data.ptrvalue;
      if (tsip != NULL && StringLen (tsip->accession) == 6) {
        vsp->is_old_gb_in_sep = TRUE;
      }
    }

    /* tsip is not cleared between ids, so this may re-test the text id
       seen on an earlier pass of the loop */
    if (tsip != NULL && StringDoesHaveText (tsip->accession) && tsip->version >= 1) {
      vsp->has_gi_or_accn_ver = TRUE;
    }

    if (sip->choice != SEQID_LOCAL && sip->choice != SEQID_GENERAL) {
      vsp->only_lcl_gnl_in_sep = FALSE;
    }
  }
}
3700 
/*
 * LookForBioseqSetFields
 *
 * Bioseq-set visitor callback; userdata is a ValidStructPtr.  Classifies
 * the set by its _class:
 *   gen_prod_set                      -> is_gps_in_sep
 *   mut / pop / phy / eco / wgs set   -> other_sets_in_sep
 *   small_genome_set                  -> is_small_genome_set
 * Other classes leave vsp untouched.
 *
 * An if-chain is used on purpose: an earlier switch-based version was
 * noted as erroneously reporting gen_prod_set for a pop_set under Xcode.
 */
static void LookForBioseqSetFields (BioseqSetPtr bssp, Pointer userdata)

{
  ValidStructPtr  vsp;

  if (bssp == NULL || userdata == NULL) return;
  vsp = (ValidStructPtr) userdata;

  if (bssp->_class == BioseqseqSet_class_gen_prod_set) {
    vsp->is_gps_in_sep = TRUE;
    return;
  }

  if (bssp->_class == BioseqseqSet_class_mut_set ||
      bssp->_class == BioseqseqSet_class_pop_set ||
      bssp->_class == BioseqseqSet_class_phy_set ||
      bssp->_class == BioseqseqSet_class_eco_set ||
      bssp->_class == BioseqseqSet_class_wgs_set) {
    vsp->other_sets_in_sep = TRUE;
    return;
  }

  if (bssp->_class == BioseqseqSet_class_small_genome_set) {
    vsp->is_small_genome_set = TRUE;
  }
}
3740 
/*
 * LookForSeqDescrFields
 *
 * Callback that examines one descriptor and raises flags on the
 * ValidStruct passed through userdata:
 *
 *   user object with class "SMART_V1.0"            is_smupd_in_sep
 *   user object of type "GenomeBuild"              is_gpipe_in_sep
 *   "StructuredComment" user object whose
 *     "Annotation Pipeline" field (choice 1) is
 *     "NCBI eukaryotic genome annotation pipeline" is_gpipe_in_sep
 *   BioSource with genome == GENOME_genomic        bsp_genomic_in_sep
 *   MolInfo tech htgs_0 .. htgs_3                  is_htg_in_sep
 *   MolInfo tech barcode                           is_barcode_sep
 *
 * Other descriptor types are ignored, as are NULL arguments.
 */
static void LookForSeqDescrFields (SeqDescrPtr sdp, Pointer userdata)

{
  UserFieldPtr    field;
  ValidStructPtr  flags;
  ObjectIdPtr     label;
  MolInfoPtr      molinfo;
  BioSourcePtr    source;
  ObjectIdPtr     type;
  UserObjectPtr   user;

  if (sdp == NULL || userdata == NULL) return;
  flags = (ValidStructPtr) userdata;

  if (sdp->choice == Seq_descr_user) {
    user = (UserObjectPtr) sdp->data.ptrvalue;
    if (user == NULL) return;

    if (StringICmp (user->_class, "SMART_V1.0") == 0) {
      flags->is_smupd_in_sep = TRUE;
    }

    type = user->type;
    if (type == NULL) return;

    if (StringICmp (type->str, "GenomeBuild") == 0) {
      flags->is_gpipe_in_sep = TRUE;
      return;
    }
    if (StringICmp (type->str, "StructuredComment") != 0) return;

    /* scan the structured comment for the annotation pipeline field */
    for (field = user->data; field != NULL; field = field->next) {
      label = field->label;
      if (label == NULL || label->str == NULL) continue;
      if (StringICmp (label->str, "Annotation Pipeline") != 0) continue;
      if (field->choice != 1) continue;
      if (StringCmp ((CharPtr) field->data.ptrvalue, "NCBI eukaryotic genome annotation pipeline") != 0) continue;
      flags->is_gpipe_in_sep = TRUE;
    }
    return;
  }

  if (sdp->choice == Seq_descr_source) {
    source = (BioSourcePtr) sdp->data.ptrvalue;
    if (source != NULL && source->genome == GENOME_genomic) {
      flags->bsp_genomic_in_sep = TRUE;
    }
    return;
  }

  if (sdp->choice == Seq_descr_molinfo) {
    molinfo = (MolInfoPtr) sdp->data.ptrvalue;
    if (molinfo == NULL) return;

    if (molinfo->tech == MI_TECH_htgs_0 ||
        molinfo->tech == MI_TECH_htgs_1 ||
        molinfo->tech == MI_TECH_htgs_2 ||
        molinfo->tech == MI_TECH_htgs_3) {
      flags->is_htg_in_sep = TRUE;
    } else if (molinfo->tech == MI_TECH_barcode) {
      flags->is_barcode_sep = TRUE;
    }
  }
}
3807 
/*
 * FindMultiIntervalGenes
 *
 * Feature callback.  userdata points at a Boolean that is set to TRUE when
 * sfp is a gene feature whose location is a packed-int, packed-point, mix
 * or equiv Seq-loc.  The Boolean is never cleared here.
 */
static void FindMultiIntervalGenes (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  BoolPtr    foundP = (BoolPtr) userdata;
  SeqLocPtr  loc;

  if (sfp == NULL || foundP == NULL) return;
  if (sfp->data.choice != SEQFEAT_GENE) return;

  loc = sfp->location;
  if (loc == NULL) return;

  if (loc->choice == SEQLOC_PACKED_INT ||
      loc->choice == SEQLOC_PACKED_PNT ||
      loc->choice == SEQLOC_MIX ||
      loc->choice == SEQLOC_EQUIV) {
    *foundP = TRUE;
  }
}
3834 
3835 //LCOV_EXCL_START
3836 // Only for SegSets
/*
 * FindSegmentedBioseqs
 *
 * Bioseq callback.  Sets the Boolean that userdata points at to TRUE when
 * bsp has a segmented representation (Seq_repr_seg); otherwise leaves it
 * alone.
 */
static void FindSegmentedBioseqs (
  BioseqPtr bsp,
  Pointer userdata
)

{
  BoolPtr  foundP = (BoolPtr) userdata;

  if (bsp == NULL || foundP == NULL) return;

  if (bsp->repr == Seq_repr_seg) {
    *foundP = TRUE;
  }
}
3850 //LCOV_EXCL_STOP
3851 
/*
 * SetPubScratchData
 *
 * Descriptor callback for extended pub descriptors.  Builds a text key of
 * the form
 *
 *     <unique pub label>[; <authors>][; <consortium>]
 *
 * and stores the newly allocated string in the descriptor's idx.scratch
 * slot.  ClearPubScratchData releases it again.
 *
 * A leading Cit-gen that carries nothing but a serial number (and is not a
 * "BackBone id_pub" citation) is skipped when another pub follows it.
 * Nothing is stored when no label can be made, when the pub has no author
 * list, or when the allocation fails.  userdata is not used.
 */
static void SetPubScratchData (SeqDescrPtr sdp, Pointer userdata)

{
  AuthListPtr    authors;
  CharPtr        combined;
  CharPtr        consortium;
  CitGenPtr      gen;
  Char           label [2048];
  CharPtr        names;
  ObjValNodePtr  node;
  PubdescPtr     pubdesc;
  ValNodePtr     pub;

  if (sdp == NULL) return;
  if (sdp->choice != Seq_descr_pub || sdp->extended == 0) return;

  node = (ObjValNodePtr) sdp;
  pubdesc = (PubdescPtr) sdp->data.ptrvalue;
  if (pubdesc == NULL) return;

  pub = pubdesc->pub;

  /* skip over just serial number */

  if (pub != NULL && pub->choice == PUB_Gen && pub->next != NULL) {
    gen = (CitGenPtr) pub->data.ptrvalue;
    if (gen != NULL &&
        StringNICmp ("BackBone id_pub", gen->cit, 15) != 0 &&
        gen->cit == NULL && gen->journal == NULL &&
        gen->date == NULL && gen->serial_number) {
      //LCOV_EXCL_START
      //serial numbers are stripped by basic cleanup
      pub = pub->next;
      //LCOV_EXCL_STOP
    }
  }

  if (PubLabelUnique (pub, label, sizeof (label) - 1, OM_LABEL_CONTENT, TRUE) <= 0) return;

  authors = GetAuthListPtr (pubdesc, NULL);
  if (authors == NULL) return;

  consortium = NULL;
  names = GetAuthorsString (GENBANK_FMT, authors, &consortium, NULL, NULL);

  /* the extra 10 bytes leave room for both "; " separators and the terminator */
  combined = MemNew (StringLen (label) + StringLen (names) + StringLen (consortium) + 10);
  if (combined != NULL) {
    StringCpy (combined, label);
    if (StringDoesHaveText (names)) {
      StringCat (combined, "; ");
      StringCat (combined, names);
    }
    if (StringDoesHaveText (consortium)) {
      StringCat (combined, "; ");
      StringCat (combined, consortium);
    }
    node->idx.scratch = combined;
  }

  MemFree (names);
  MemFree (consortium);
}
3909 
/*
 * ClearPubScratchData
 *
 * Descriptor callback that frees and resets the idx.scratch slot of an
 * extended pub descriptor.  Descriptors of any other kind are left alone.
 * userdata is not used.
 */
static void ClearPubScratchData (SeqDescrPtr sdp, Pointer userdata)

{
  ObjValNodePtr  node;

  if (sdp == NULL) return;
  if (sdp->choice != Seq_descr_pub) return;
  if (sdp->extended == 0) return;

  node = (ObjValNodePtr) sdp;
  node->idx.scratch = MemFree (node->idx.scratch);
}
3919 
SetUpValidateGeneticCodes(void)3920 static ValNodePtr SetUpValidateGeneticCodes (void)
3921 
3922 {
3923   Char            ch;
3924   GeneticCodePtr  codes;
3925   GeneticCodePtr  gcp;
3926   ValNodePtr      gencodelist = NULL;
3927   Int2            i;
3928   Int4            id;
3929   Int2            j;
3930   Char            name [64];
3931   CharPtr         ptr;
3932   Char            str [256];
3933   ValNodePtr      tmp;
3934 
3935   codes = GeneticCodeTableLoad ();
3936   if (codes != NULL) {
3937     for (gcp = codes; gcp != NULL; gcp = gcp->next) {
3938       id = 0;
3939       str [0] = '\0';
3940       for (tmp = (ValNodePtr) gcp->data.ptrvalue; tmp != NULL; tmp = tmp->next) {
3941         switch (tmp->choice) {
3942           case 1 :
3943             if (StringLen (str) < 1) {
3944               StringNCpy_0 (str, (CharPtr) tmp->data.ptrvalue, sizeof (str));
3945               ptr = str;
3946               ch = *ptr;
3947               while (ch != '\0') {
3948                 if (ch == '/') {
3949                   *ptr = '-';
3950                 }
3951                 ptr++;
3952                 ch = *ptr;
3953               }
3954             }
3955             break;
3956           case 2 :
3957             id = tmp->data.intvalue;
3958             break;
3959           default :
3960             break;
3961         }
3962       }
3963       if (id != 7 && id != 8) {
3964         if (id > 0 /* && id < 30 */ ) {
3965           i = 0;
3966           if (StringLen (str + i) > 0) {
3967             ch = str [i];
3968             while (ch == ' ' || ch == ';') {
3969               i++;
3970               ch = str [i];
3971             }
3972             j = 0;
3973             ch = str [i + j];
3974             while (ch != '\0' && ch != ';') {
3975               name [j] = ch;
3976               j++;
3977               ch = str [i + j];
3978             }
3979             name [j] = '\0';
3980             i += j;
3981             if (ch == ';') {
3982               StringCat (name, ", etc.");
3983             }
3984             ValNodeCopyStr (&gencodelist, (Uint1) id, name);
3985           }
3986         }
3987       }
3988     }
3989   }
3990   return gencodelist;
3991 }
3992 
/*
 * Context passed as userdata to FindRepValidate: the validator state that
 * the error is reported against, and the gather context whose entityID,
 * itemID and thistype fields FindRepValidate overwrites before reporting.
 */
3993 typedef struct frd {
3994   ValidStructPtr    vsp;
3995   GatherContextPtr  gcp;
3996   /*
3997   CharPtr           string;
3998   */
3999 } FindRepData, PNTR FindRepPtr;
4000 
/*
 * FindRepValidate
 *
 * Callback invoked for an item in which one of the embedded-script strings
 * was found.  userdata is a FindRepPtr.  The item's identity is copied
 * into the gather context and an ERR_GENERIC_EmbeddedScript error is then
 * reported through ValidErr.
 *
 * Does nothing when userdata or its gather context is NULL, in line with
 * the other callbacks in this file, which all tolerate missing user data.
 */
static void FindRepValidate (Uint2 entityID, Uint4 itemID, Uint2 itemtype, Pointer userdata)

{
  FindRepPtr        frp;
  GatherContextPtr  gcp;
  ValidStructPtr    vsp;

  frp = (FindRepPtr) userdata;
  if (frp == NULL) return;

  vsp = frp->vsp;
  gcp = frp->gcp;
  if (gcp == NULL) return;

  /* point the gather context at the offending item before reporting */
  gcp->entityID = entityID;
  gcp->itemID = itemID;
  gcp->thistype = itemtype;

  ValidErr (vsp, SEV_ERROR, ERR_GENERIC_EmbeddedScript, "Script tag found in item");
}
4018 
/*
 * NULL-terminated list of HTML/script fragments.
 * NOTE(review): presumably these are the search strings whose hits are
 * reported by FindRepValidate ("Script tag found in item"); the code that
 * uses the table is outside this section -- confirm at the call site.
 */
4019 static CharPtr findrepstrs [] = {
4020   "<script", "<object", "<applet", "<embed", "<form", "javascript:", "vbscript:", NULL
4021 };
4022 
/*
 * Per-feature scratch record.  AddScratchToFeatures allocates one for each
 * feature and stores it in sfp->idx.scratch; ClearScratchOnFeatures frees
 * it.  SetupFeatureScratchData fills in nearbygene, nearbycds and
 * nearbymrna.
 * NOTE(review): num_mrnas, accounted_for, products_unique and
 * featid_matched are not touched in this section of the file; their
 * meaning has to be confirmed where they are used.
 */
4023 typedef struct vvmdata {
4024   Int2        num_mrnas;
4025   Boolean     accounted_for;
4026   Boolean     products_unique;
4027   Boolean     featid_matched;
4028   SeqFeatPtr  nearbygene;
4029   SeqFeatPtr  nearbycds;
4030   SeqFeatPtr  nearbymrna;
4031 } VvmData, PNTR VvmDataPtr;
4032 
/*
 * AddScratchToFeatures
 *
 * Feature callback that attaches a freshly allocated VvmData record to the
 * feature's idx.scratch slot.  ClearScratchOnFeatures is the matching
 * release.  userdata is not used.
 *
 * A NULL feature is ignored, as in the other feature callbacks in this
 * file.  Whatever idx.scratch held before is overwritten without being
 * freed.  SetupFeatureScratchData treats NULL fields of the record as
 * "not yet set", so this presumably relies on MemNew returning zeroed
 * memory -- TODO confirm.
 */
static void AddScratchToFeatures (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  if (sfp == NULL) return;

  sfp->idx.scratch = (Pointer) MemNew (sizeof (VvmData));
}
4041 
/*
 * ClearScratchOnFeatures
 *
 * Feature callback that frees the record stored in the feature's
 * idx.scratch slot and stores MemFree's return value back into the slot.
 * Counterpart of AddScratchToFeatures.  userdata is not used.
 *
 * A NULL feature is ignored, as in the other feature callbacks in this
 * file.
 */
static void ClearScratchOnFeatures (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  if (sfp == NULL) return;

  sfp->idx.scratch = MemFree (sfp->idx.scratch);
}
4050 
/*
 * SetupFeatureScratchData
 *
 * Bioseq callback.  Walks the features of bsp in the order returned by
 * SeqMgrGetNextFeature and cross-links genes, mRNAs and CDSs through the
 * VvmData records stored in idx.scratch, remembering the most recent gene,
 * mRNA and CDS seen so far:
 *
 *   gene   becomes the current gene; its own record is not modified here
 *   CDS    becomes the current CDS; gets the current gene and mRNA, and is
 *          recorded as nearbycds on the current gene and mRNA
 *   mRNA   becomes the current mRNA; gets the current gene, and is
 *          recorded as nearbymrna on the current gene
 *   other  gets the current gene, mRNA and CDS
 *
 * A link is only written while it is still NULL, so the first assignment
 * wins.  Features without a scratch record are skipped.  userdata is not
 * used.
 */
static void SetupFeatureScratchData (
  BioseqPtr bsp,
  Pointer userdata
)

{
  SeqMgrFeatContext  context;
  SeqFeatPtr         feat;
  VvmDataPtr         geneData;
  SeqFeatPtr         lastCds = NULL;
  SeqFeatPtr         lastGene = NULL;
  SeqFeatPtr         lastMrna = NULL;
  VvmDataPtr         mrnaData;
  VvmDataPtr         own;

  for (feat = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &context);
       feat != NULL;
       feat = SeqMgrGetNextFeature (bsp, feat, 0, 0, &context)) {

    if (feat->idx.subtype == FEATDEF_GENE) {
      lastGene = feat;
      continue;
    }

    own = (VvmDataPtr) feat->idx.scratch;

    if (feat->idx.subtype == FEATDEF_CDS) {
      lastCds = feat;
      if (own != NULL) {
        if (own->nearbygene == NULL) own->nearbygene = lastGene;
        if (own->nearbymrna == NULL) own->nearbymrna = lastMrna;
      }
      /* tell the enclosing gene and mRNA about this CDS */
      geneData = (lastGene != NULL) ? (VvmDataPtr) lastGene->idx.scratch : NULL;
      if (geneData != NULL && geneData->nearbycds == NULL) {
        geneData->nearbycds = lastCds;
      }
      mrnaData = (lastMrna != NULL) ? (VvmDataPtr) lastMrna->idx.scratch : NULL;
      if (mrnaData != NULL && mrnaData->nearbycds == NULL) {
        mrnaData->nearbycds = lastCds;
      }
      continue;
    }

    if (feat->idx.subtype == FEATDEF_mRNA) {
      lastMrna = feat;
      if (own != NULL && own->nearbygene == NULL) {
        own->nearbygene = lastGene;
      }
      /* tell the enclosing gene about this mRNA */
      geneData = (lastGene != NULL) ? (VvmDataPtr) lastGene->idx.scratch : NULL;
      if (geneData != NULL && geneData->nearbymrna == NULL) {
        geneData->nearbymrna = lastMrna;
      }
      continue;
    }

    /* every other feature type just records its current neighbors */
    if (own != NULL) {
      if (own->nearbygene == NULL) own->nearbygene = lastGene;
      if (own->nearbymrna == NULL) own->nearbymrna = lastMrna;
      if (own->nearbycds == NULL) own->nearbycds = lastCds;
    }
  }
}
4131 
/*
 * When TRUE, TestDeletedOrReplacedECnumbers reads the ecnum_*.txt data
 * files; when FALSE it uses the compiled-in kECNum_* tables.  The code that
 * changes this flag is outside this section of the file.
 */
4132 static Boolean using_ec_from_file = FALSE;
4133 
4134 //LCOV_EXCL_START
4135 //internal check of data
/*
 * TestDeletedOrReplacedECnumbers
 *
 * Internal consistency check of the EC number lists.  Returns at once when
 * the application property "ReplacedEECNumberFSA" is already set;
 * otherwise it calls the four Get...ECNumberFSA functions and then
 * cross-checks the lists:
 *
 *   - a replaced EC number must not still be in the live list and must not
 *     be in the deleted list;
 *   - every replacement for it must be in the live list and must not be in
 *     the deleted list;
 *   - a deleted EC number must not still be in the live list.
 *
 * Each entry is a tab-separated line whose first column is the EC number.
 * In the replaced list the remaining columns are the replacements.  Lines
 * without a tab are ignored.
 *
 * using_ec_from_file selects between the data files ecnum_replaced.txt and
 * ecnum_deleted.txt in the ncbi data directory and the compiled-in
 * kECNum_replaced / kECNum_deleted tables.  The flag is re-read before
 * each of the four passes.  Problems are reported through ValidErr at
 * SEV_WARNING.
 */
static void TestDeletedOrReplacedECnumbers (ValidStructPtr vsp)

{
  FileCache     cache;
  Char          copy [256];
  CharPtr       ec;
  size_t        idx;
  Char          line [512];
  CharPtr       next;
  Char          path [PATH_MAX];
  TextFsaPtr    probe;
  CharPtr       repl;
  ErrSev        saved;
  FILE          *stream = NULL;
  CharPtr PNTR  table;
  size_t        total;

  /* only check first time program runs validator */

  probe = (TextFsaPtr) GetAppProperty ("ReplacedEECNumberFSA");
  if (probe != NULL) return;

  GetSpecificECNumberFSA (vsp);
  GetAmbiguousECNumberFSA (vsp);
  GetDeletedECNumberFSA (vsp);
  GetReplacedECNumberFSA (vsp);

  /* replaced EC numbers, read from the data file */

  if (using_ec_from_file && FindPath ("ncbi", "ncbi", "data", path, sizeof (path))) {
    FileBuildPath (path, NULL, "ecnum_replaced.txt");
    /* message level is raised only around the open and then restored */
    saved = ErrSetMessageLevel (SEV_ERROR);
    stream = FileOpen (path, "r");
    ErrSetMessageLevel (saved);
    if (stream != NULL) {
      FileCacheSetup (&cache, stream);

      for (ec = FileCacheReadLine (&cache, line, sizeof (line), NULL);
           ec != NULL;
           ec = FileCacheReadLine (&cache, line, sizeof (line), NULL)) {
        if (! StringDoesHaveText (ec)) continue;
        repl = StringChr (ec, '\t');
        if (repl == NULL) continue;
        *repl = '\0';
        repl++;

        if (! ECnumberNotInList (ec)) {
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_EcNumberProblem, "Replaced EC number %s still in live data file list", ec);
        }
        if (ECnumberWasDeleted (ec)) {
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_DeletedEcNumber, "Replaced EC number %s in deleted data file list", ec);
        }

        /* remaining tab-separated columns are the replacement numbers */
        while (StringDoesHaveText (repl)) {
          next = StringChr (repl, '\t');
          if (next != NULL) {
            *next = '\0';
            next++;
          }
          if (ECnumberNotInList (repl)) {
            ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_EcNumberProblem, "Replacement EC number %s not in data file live list", repl);
          }
          if (ECnumberWasDeleted (repl)) {
            ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_DeletedEcNumber, "Replacement EC number %s in deleted data file list", repl);
          }
          repl = next;
        }
      }

      FileClose (stream);
    }
  }

  /* replaced EC numbers, compiled-in table */

  if (! using_ec_from_file) {
    table = (CharPtr PNTR) kECNum_replaced;
    total = sizeof (kECNum_replaced) / sizeof (char*);

    for (idx = 0; idx < total; idx++) {
      if (StringHasNoText (table [idx])) continue;
      /* work on a private copy, since the entry is split in place */
      StringNCpy_0 (copy, table [idx], sizeof (copy));
      ec = copy;
      repl = StringChr (ec, '\t');
      if (repl == NULL) continue;
      *repl = '\0';
      repl++;

      if (! ECnumberNotInList (ec)) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_EcNumberProblem, "Replaced EC number %s still in live internal list", ec);
      }
      if (ECnumberWasDeleted (ec)) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_DeletedEcNumber, "Replaced EC number %s in deleted internal list", ec);
      }

      while (StringDoesHaveText (repl)) {
        next = StringChr (repl, '\t');
        if (next != NULL) {
          *next = '\0';
          next++;
        }
        if (ECnumberNotInList (repl)) {
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_EcNumberProblem, "Replacement EC number %s not in live internal list", repl);
        }
        if (ECnumberWasDeleted (repl)) {
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_DeletedEcNumber, "Replacement EC number %s in deleted internal list", repl);
        }
        repl = next;
      }
    }
  }

  /* deleted EC numbers, read from the data file */

  if (using_ec_from_file && FindPath ("ncbi", "ncbi", "data", path, sizeof (path))) {
    FileBuildPath (path, NULL, "ecnum_deleted.txt");
    saved = ErrSetMessageLevel (SEV_ERROR);
    stream = FileOpen (path, "r");
    ErrSetMessageLevel (saved);
    if (stream != NULL) {
      FileCacheSetup (&cache, stream);

      for (ec = FileCacheReadLine (&cache, line, sizeof (line), NULL);
           ec != NULL;
           ec = FileCacheReadLine (&cache, line, sizeof (line), NULL)) {
        if (! StringDoesHaveText (ec)) continue;
        repl = StringChr (ec, '\t');
        if (repl == NULL) continue;
        /* only the first column is needed */
        *repl = '\0';

        if (! ECnumberNotInList (ec)) {
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_EcNumberProblem, "Deleted EC number %s still in live data file list", ec);
        }
      }

      FileClose (stream);
    }
  }

  /* deleted EC numbers, compiled-in table */

  if (! using_ec_from_file) {
    table = (CharPtr PNTR) kECNum_deleted;
    total = sizeof (kECNum_deleted) / sizeof (char*);

    for (idx = 0; idx < total; idx++) {
      if (StringHasNoText (table [idx])) continue;
      StringNCpy_0 (copy, table [idx], sizeof (copy));
      ec = copy;
      repl = StringChr (ec, '\t');
      if (repl == NULL) continue;
      *repl = '\0';

      if (! ECnumberNotInList (ec)) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_EcNumberProblem, "Deleted EC number %s still in live internal list", ec);
      }
    }
  }
}
4295 
4296 //LCOV_EXCL_STOP
4297 
4298 
/*
 * One Seq-id of one Bioseq, as collected by LongCollisionCallback.
 *   str  label produced by SeqIdWholeLabel; freed by CollisionInfoFree
 *   sip  the Seq-id itself (stored as given, not freed by CollisionInfoFree)
 *   bsp  the Bioseq that carries the id (likewise not freed)
 */
4299 typedef struct collisioninfo {
4300   CharPtr str;
4301   SeqIdPtr sip;
4302   BioseqPtr bsp;
4303 } CollisionInfoData, PNTR CollisionInfoPtr;
4304 
4305 
CollisionInfoNew(SeqIdPtr sip,BioseqPtr bsp)4306 static CollisionInfoPtr CollisionInfoNew (SeqIdPtr sip, BioseqPtr bsp)
4307 {
4308   CollisionInfoPtr cip = (CollisionInfoPtr) MemNew (sizeof (CollisionInfoData));
4309   cip->sip = sip;
4310   cip->bsp = bsp;
4311   cip->str = SeqIdWholeLabel (sip, PRINTID_FASTA_SHORT);
4312   return cip;
4313 }
4314 
4315 
/*
 * CollisionInfoFree
 *
 * Frees the label string and then the record itself.  The Seq-id and
 * Bioseq pointers are not freed.  Accepts NULL.  Returns NULL for a NULL
 * argument and otherwise whatever MemFree returns for the record, so the
 * result can be assigned back to the caller's pointer.
 */
static CollisionInfoPtr CollisionInfoFree (CollisionInfoPtr cip)
{
  if (cip == NULL) return NULL;

  cip->str = MemFree (cip->str);
  return MemFree (cip);
}
4324 
4325 
/*
 * LongCollisionCallback
 *
 * Bioseq callback.  data is the address of a ValNode list head.  One
 * CollisionInfo record is appended for every Seq-id of bsp that
 * IsNCBIFileID does not accept.
 */
static void LongCollisionCallback (BioseqPtr bsp, Pointer data)
{
  ValNodePtr PNTR listP = (ValNodePtr PNTR) data;
  SeqIdPtr id;

  if (bsp == NULL || listP == NULL) return;

  for (id = bsp->id; id != NULL; id = id->next) {
    if (IsNCBIFileID (id)) continue;
    ValNodeAddPointer (listP, 0, CollisionInfoNew (id, bsp));
  }
}
4340 
4341 
SortVnpByCollisionInfo(VoidPtr ptr1,VoidPtr ptr2)4342 static int LIBCALLBACK SortVnpByCollisionInfo (VoidPtr ptr1, VoidPtr ptr2)
4343 
4344 {
4345   CollisionInfoPtr  cip1, cip2;
4346   ValNodePtr    vnp1, vnp2;
4347 
4348   if (ptr1 == NULL || ptr2 == NULL) return 0;
4349   vnp1 = *((ValNodePtr PNTR) ptr1);
4350   vnp2 = *((ValNodePtr PNTR) ptr2);
4351   if (vnp1 == NULL || vnp2 == NULL) return 0;
4352   cip1 = (CollisionInfoPtr) vnp1->data.ptrvalue;
4353   cip2 = (CollisionInfoPtr) vnp2->data.ptrvalue;
4354 
4355   if (cip1 == NULL || cip2 == NULL) return 0;
4356   return StringCmp (cip1->str, cip2->str);
4357 }
4358 
4359 
/*
 * FindLongIdsThatCollideWhenTruncated
 *
 * Reports Seq-ids in sep whose labels become identical when cut to their
 * first trunc_len characters.
 *
 * The labels of all ids that IsNCBIFileID does not accept are collected
 * and sorted, so ids sharing a prefix end up next to each other.  Every
 * following id that shares the first trunc_len characters with the first
 * id of such a run is reported with ERR_SEQ_INST_BadSeqIdFormat at
 * SEV_WARNING.  vsp->bsp is set to the colliding id's Bioseq for the
 * report and restored before returning.
 *
 * Changes from the previous version: a NULL vsp is rejected up front
 * instead of being dereferenced, list entries without a record are skipped
 * when they head a run (they were already tolerated as followers), and
 * trunc_len is cast to match the %d conversion.
 */
static void FindLongIdsThatCollideWhenTruncated (SeqEntryPtr sep, ValidStructPtr vsp, Int4 trunc_len)
{
  ValNodePtr id_list = NULL, vnp, vnp_c;
  CollisionInfoPtr  cip1, cip2;
  BioseqPtr         oldbsp;

  if (vsp == NULL) return;

  VisitBioseqsInSep (sep, &id_list, LongCollisionCallback);
  id_list = ValNodeSort (id_list, SortVnpByCollisionInfo);
  oldbsp = vsp->bsp;

  for (vnp = id_list; vnp != NULL; vnp = vnp->next) {
    cip1 = (CollisionInfoPtr) vnp->data.ptrvalue;
    if (cip1 == NULL) continue;
    vnp_c = vnp->next;
    while (vnp_c != NULL && (cip2 = (CollisionInfoPtr) vnp_c->data.ptrvalue) != NULL
          && StringNCmp (cip1->str, cip2->str, trunc_len) == 0) {
      vsp->bsp = cip2->bsp;
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_BadSeqIdFormat,
                "First %d characters of %s and %s are identical", (int) trunc_len, cip1->str, cip2->str);
      /* advance the outer cursor so the run is not reported again */
      vnp = vnp_c;
      vnp_c = vnp_c->next;
    }
  }

  vsp->bsp = oldbsp;

  /* release the records and the list, one node at a time */
  vnp = id_list;
  while (vnp != NULL) {
    vnp_c = vnp->next;
    vnp->data.ptrvalue = CollisionInfoFree (vnp->data.ptrvalue);
    vnp->next = NULL;
    vnp = ValNodeFree (vnp);
    vnp = vnp_c;
  }
}
4393 
4394 
4395 //LCOV_EXCL_START
4396 //used for locking procedures specific to C Toolkit
/*
 * ValLookForBigFarSeqs
 *
 * Bioseq callback.  userdata points at a Boolean that is set to TRUE when
 * bsp is a delta sequence (seq_ext_type 4) carrying a DDBJ Seq-id and its
 * delta chain has more than 10000 elements with choice 1.  The Boolean is
 * never cleared here.
 */
static void ValLookForBigFarSeqs (
  BioseqPtr bsp,
  Pointer userdata
)

{
  DeltaSeqPtr  delta;
  Boolean      hasDdbj = FALSE;
  SeqIdPtr     id;
  Int4         numChoice1 = 0;
  BoolPtr      resultP = (BoolPtr) userdata;

  if (bsp == NULL || resultP == NULL) return;
  if (bsp->repr != Seq_repr_delta || bsp->seq_ext_type != 4) return;

  for (id = bsp->id; id != NULL; id = id->next) {
    if (id->choice == SEQID_DDBJ) {
      hasDdbj = TRUE;
      break;
    }
  }
  if (! hasDdbj) return;

  for (delta = (DeltaSeqPtr) bsp->seq_ext; delta != NULL; delta = delta->next) {
    if (delta->choice == 1) {
      numChoice1++;
    }
  }

  if (numChoice1 > 10000) {
    *resultP = TRUE;
  }
}
4433 //LCOV_EXCL_STOP
4434 
ValTooManyFarComponents(SeqEntryPtr sep)4435 static Boolean ValTooManyFarComponents (
4436   SeqEntryPtr sep
4437 )
4438 
4439 {
4440   Boolean  toomanyfar = FALSE;
4441 
4442   if (sep == NULL) return FALSE;
4443 
4444   VisitBioseqsInSep (sep, (Pointer) &toomanyfar, ValLookForBigFarSeqs);
4445 
4446   return toomanyfar;
4447 }
4448 
/*
 * NULL-terminated table of inference qualifier prefixes.
 *
 * ValCountInfAccnVer compares a qualifier value against every entry
 * (case-insensitive, prefix length only) and keeps the LAST entry that
 * matches, so a longer, more specific prefix must come after the shorter
 * one it extends.  The first entry is the empty string; with a compare
 * length of 0 it presumably matches every value.
 *
 * ValCountInfAccnVer tests for index 12 ("alignment") by number, so the
 * entries must not be reordered without updating that code.
 */
4449 static CharPtr inferencePrefix [] = {
4450   "",
4451   "similar to sequence",
4452   "similar to AA sequence",
4453   "similar to DNA sequence",
4454   "similar to RNA sequence",
4455   "similar to RNA sequence, mRNA",
4456   "similar to RNA sequence, EST",
4457   "similar to RNA sequence, other RNA",
4458   "profile",
4459   "nucleotide motif",
4460   "protein motif",
4461   "ab initio prediction",
4462   "alignment",
4463   NULL
4464 };
4465 
4466 
/*
 * NextColonOrVerticalBarPtr
 *
 * Returns a pointer to the first ':' or '|' in the string, or NULL when
 * the string contains neither or ptr itself is NULL.
 */
static CharPtr NextColonOrVerticalBarPtr (CharPtr ptr)

{
  CharPtr  scan;

  if (ptr == NULL) return NULL;

  for (scan = ptr; *scan != '\0'; scan++) {
    if (*scan == ':' || *scan == '|') return scan;
  }

  return NULL;
}
4483 
/*
 * Running totals filled in by ValCountInfAccnVer:
 *   numInferences  inference qualifiers that have text
 *   numAccessions  INSD / RefSeq entries found inside those qualifiers
 */
4484 typedef struct valcountdata {
4485   Int4  numInferences;
4486   Int4  numAccessions;
4487 } ValCountData, PNTR ValCountPtr;
4488 
/*
 * ValCountInfAccnVer
 *
 * Feature callback.  userdata is a ValCountPtr.  Every "inference"
 * qualifier with text adds one to numInferences.  The qualifier value is
 * then taken apart:
 *
 *   1. the last matching entry of inferencePrefix is stripped from the
 *      front, followed by white space, an optional "(same species)" and
 *      any further white space or colons;
 *   2. for the "alignment" prefix (index 12) only the text after the last
 *      ':' is examined; nothing is examined if there is no ':';
 *   3. the rest is split at commas, and each piece at its first ':' or '|'
 *      into a database name and a remainder.
 *
 * Each piece whose database name is "INSD" or "RefSeq" (case-insensitive)
 * and whose remainder has text adds one to numAccessions.
 */
static void ValCountInfAccnVer (SeqFeatPtr sfp, Pointer userdata)

{
  CharPtr      accn;
  Char         ch;
  CharPtr      copy;
  CharPtr      db;
  Int2         idx;
  Int2         matched;
  CharPtr      more;
  size_t       plen;
  GBQualPtr    qual;
  CharPtr      tail;
  ValCountPtr  totals;

  if (sfp == NULL || userdata == NULL) return;
  totals = (ValCountPtr) userdata;

  for (qual = sfp->qual; qual != NULL; qual = qual->next) {
    if (StringICmp (qual->qual, "inference") != 0) continue;
    if (StringHasNoText (qual->val)) continue;

    totals->numInferences += 1;

    /* the last prefix that matches wins */
    tail = NULL;
    matched = -1;
    for (idx = 0; inferencePrefix [idx] != NULL; idx++) {
      plen = StringLen (inferencePrefix [idx]);
      if (StringNICmp (qual->val, inferencePrefix [idx], plen) == 0) {
        tail = qual->val + plen;
        matched = idx;
      }
    }
    if (matched < 0 || inferencePrefix [matched] == NULL) continue;
    if (tail == NULL) continue;

    for (ch = *tail; IS_WHITESP (ch); ch = *tail) {
      tail++;
    }
    if (StringNICmp (tail, "(same species)", 14) == 0) {
      tail += 14;
    }
    for (ch = *tail; IS_WHITESP (ch) || ch == ':'; ch = *tail) {
      tail++;
    }
    if (StringHasNoText (tail)) continue;

    /* split a private copy in place */
    copy = StringSave (tail);

    db = copy;
    if (matched == 12) {
      db = StringRChr (copy, ':');
      if (db != NULL) {
        *db = '\0';
        db++;
      }
    }

    for (; db != NULL; db = more) {
      more = StringChr (db, ',');
      if (more != NULL) {
        *more = '\0';
        more++;
      }
      accn = NextColonOrVerticalBarPtr (db);
      if (accn == NULL) continue;
      *accn = '\0';
      accn++;
      TrimSpacesAroundString (db);
      TrimSpacesAroundString (accn);
      if (! StringDoesHaveText (accn)) continue;
      if (StringICmp (db, "INSD") == 0 || StringICmp (db, "RefSeq") == 0) {
        totals->numAccessions += 1;
      }
    }

    MemFree (copy);
  }
}
4573 
TooManyInferenceAccessions(SeqEntryPtr sep,Int4Ptr numInferences,Int4Ptr numAccessions)4574 NLM_EXTERN Boolean TooManyInferenceAccessions (
4575   SeqEntryPtr sep,
4576   Int4Ptr numInferences,
4577   Int4Ptr numAccessions
4578 )
4579 
4580 {
4581   ValCountData  vcd;
4582 
4583   if (numInferences != NULL) {
4584     *numInferences = 0;
4585   }
4586   if (numAccessions != NULL) {
4587     *numAccessions = 0;
4588   }
4589   if (sep == NULL) return FALSE;
4590 
4591   vcd.numInferences = 0;
4592   vcd.numAccessions = 0;
4593 
4594   VisitFeaturesInSep (sep, (Pointer) &vcd, ValCountInfAccnVer);
4595 
4596   if (numInferences != NULL) {
4597     *numInferences = vcd.numInferences;
4598   }
4599   if (numAccessions != NULL) {
4600     *numAccessions = vcd.numAccessions;
4601   }
4602 
4603   if (vcd.numInferences > 1000 || vcd.numAccessions > 1000) return TRUE;
4604 
4605   return FALSE;
4606 }
4607 
4608 /*
4609 static void CountPseudogenes (SeqFeatPtr sfp, Pointer userdata)
4610 
4611 {
4612   FeatProbPtr  fpp;
4613   GBQualPtr    gbq;
4614 
4615   if (sfp == NULL || userdata == NULL) return;
4616   fpp = (FeatProbPtr) userdata;
4617 
4618   if (sfp->pseudo) {
4619     (fpp->num_pseudo)++;
4620   }
4621 
4622   for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
4623     if (StringICmp (gbq->qual, "pseudogene") != 0) continue;
4624     if (StringHasNoText (gbq->val)) continue;
4625 
4626     (fpp->num_pseudogene)++;
4627   }
4628 }
4629 */
4630 
/*
 * CheckTaxIDs
 *
 * BioSource callback.  userdata is a FeatProbPtr that accumulates:
 *
 *   first_taxid / mult_taxids
 *       the first numeric "taxon" db_xref id seen, and whether a later one
 *       differs from it (tags that carry a string are ignored);
 *   num_super_kingdom / super_kingdom_name / super_kingdoms_different
 *       for orgnames with choice 5, the number of "superkingdom" elements
 *       with fixed_level 0, the first such name (the pointer is stored,
 *       not a copy), and whether a later name differs from it.
 */
static void CheckTaxIDs (BioSourcePtr biop, Pointer userdata)

{
  TaxElementPtr  elem;
  ValNodePtr     node;
  OrgNamePtr     orgname;
  OrgRefPtr      orgref;
  FeatProbPtr    stats;
  ObjectIdPtr    tag;
  DbtagPtr       xref;

  if (biop == NULL || userdata == NULL) return;
  stats = (FeatProbPtr) userdata;

  orgref = biop->org;
  if (orgref == NULL) return;

  for (node = orgref->db; node != NULL; node = node->next) {
    xref = (DbtagPtr) node->data.ptrvalue;
    if (xref == NULL) continue;
    if (StringICmp (xref->db, "taxon") != 0) continue;
    tag = xref->tag;
    if (tag == NULL || tag->str != NULL) continue;

    if (stats->first_taxid == 0) {
      stats->first_taxid = tag->id;
    } else if (stats->first_taxid != tag->id) {
      stats->mult_taxids = TRUE;
    }
  }

  orgname = orgref->orgname;
  if (orgname == NULL || orgname->choice != 5) return;

  for (elem = (TaxElementPtr) orgname->data; elem != NULL; elem = elem->next) {
    if (elem->fixed_level != 0) continue;
    if (StringICmp (elem->level, "superkingdom") != 0) continue;

    (stats->num_super_kingdom)++;
    if (stats->super_kingdom_name == NULL) {
      stats->super_kingdom_name = elem->name;
    } else if (StringICmp (stats->super_kingdom_name, elem->name) != 0) {
      stats->super_kingdoms_different = TRUE;
    }
  }
}
4677 
4678 
4679 //LCOV_EXCL_START
Heartbeat(ValidStructPtr vsp,CharPtr msg)4680 NLM_EXTERN void Heartbeat(ValidStructPtr vsp, CharPtr msg)
4681 {
4682   Char id_buf[255];
4683 
4684   if (vsp->use_heartbeat) {
4685     if (msg == NULL) {
4686       if (vsp->bsp == NULL || vsp->bsp->id == NULL) {
4687         ValidErr (vsp, SEV_INFO, 0, 0, "Processing");
4688       } else {
4689         SeqIdWrite (SeqIdFindBest (vsp->bsp->id, 0), id_buf, PRINTID_FASTA_SHORT, sizeof (id_buf) - 1);
4690         ValidErr (vsp, SEV_INFO, 0, 0, "Processing %s", id_buf);
4691       }
4692     } else {
4693       ValidErr (vsp, SEV_INFO, 0, 0, msg);
4694     }
4695   }
4696 }
4697 //LCOV_EXCL_STOP
4698 
4699 
IsWgsIntermediate(SeqEntryPtr sep)4700 static Boolean IsWgsIntermediate (SeqEntryPtr sep)
4701 
4702 {
4703   BioseqPtr    bsp;
4704   Boolean      has_gi = FALSE, is_other = FALSE, is_wgs = FALSE;
4705   MolInfoPtr   mip;
4706   SeqDescrPtr  sdp;
4707   SeqIdPtr     sip;
4708 
4709   bsp = FindNucBioseq (sep);
4710   if (bsp == NULL) return FALSE;
4711 
4712   for (sdp = bsp->descr; sdp != NULL; sdp = sdp->next) {
4713     if (sdp->choice != Seq_descr_molinfo) continue;
4714     mip = (MolInfoPtr) sdp->data.ptrvalue;
4715     if (mip == NULL) continue;
4716     if (mip->tech == MI_TECH_wgs) {
4717       is_wgs = TRUE;
4718     }
4719   }
4720   if (! is_wgs) return FALSE;
4721 
4722   for (sip = bsp->id; sip != NULL; sip = sip->next) {
4723     if (sip->choice == SEQID_OTHER) {
4724       is_other = TRUE;
4725     } else if (sip->choice == SEQID_GI) {
4726       has_gi = TRUE;
4727     }
4728   }
4729   if (! is_other) return FALSE;
4730   if (has_gi) return FALSE;
4731 
4732   return TRUE;
4733 }
4734 
IsTsaIntermediate(SeqEntryPtr sep)4735 static Boolean IsTsaIntermediate (SeqEntryPtr sep)
4736 
4737 {
4738   BioseqPtr    bsp;
4739   Boolean      has_gi = FALSE, is_other = FALSE, is_tsa = FALSE;
4740   MolInfoPtr   mip;
4741   SeqDescrPtr  sdp;
4742   SeqIdPtr     sip;
4743 
4744   bsp = FindNucBioseq (sep);
4745   if (bsp == NULL) return FALSE;
4746 
4747   for (sdp = bsp->descr; sdp != NULL; sdp = sdp->next) {
4748     if (sdp->choice != Seq_descr_molinfo) continue;
4749     mip = (MolInfoPtr) sdp->data.ptrvalue;
4750     if (mip == NULL) continue;
4751     if (mip->tech == MI_TECH_tsa) {
4752       is_tsa = TRUE;
4753     }
4754   }
4755   if (! is_tsa) return FALSE;
4756 
4757   for (sip = bsp->id; sip != NULL; sip = sip->next) {
4758     if (sip->choice == SEQID_OTHER) {
4759       is_other = TRUE;
4760     } else if (sip->choice == SEQID_GI) {
4761       has_gi = TRUE;
4762     }
4763   }
4764   if (! is_other) return FALSE;
4765   if (has_gi) return FALSE;
4766 
4767   return TRUE;
4768 }
4769 
ValidateSeqEntry(SeqEntryPtr sep,ValidStructPtr vsp)4770 NLM_EXTERN Boolean ValidateSeqEntry (SeqEntryPtr sep, ValidStructPtr vsp)
4771 
4772 {
4773   AuthListPtr     alp;
4774   AuthorPtr       ap;
4775   DatePtr         cd, dp;
4776   ContactInfoPtr  cip;
4777   CitSubPtr       csp;
4778   Uint2           entityID = 0;
4779   GatherScope     gs;
4780   BioseqSetPtr    bssp;
4781   SeqSubmitPtr    ssp = NULL;
4782   Boolean         do_many = FALSE;
4783   Boolean         mult_subs = FALSE;
4784   Boolean         farFetchProd;
4785   Boolean         first = TRUE;
4786   Int4            errors[6];
4787   Int2            i;
4788   Boolean         inferenceAccnCheck;
4789   Boolean         suppress_no_pubs = TRUE;
4790   Boolean         suppress_no_cit_subs = TRUE;
4791   Boolean         suppress_no_biosrc = TRUE;
4792   FeatProb        featprob;
4793   GatherContextPtr gcp = NULL;
4794   GatherContext   gc;
4795   SeqEntryPtr     fsep;
4796   BioseqPtr       fbsp = NULL;
4797   Int2            limit;
4798   SeqEntryPtr     oldsep;
4799   ErrSev          oldsev;
4800   ObjMgrDataPtr   omdp;
4801   SeqEntryPtr     topsep = NULL;
4802   SeqEntryPtr     tmp;
4803   TextSeqIdPtr    tsip;
4804   ValNodePtr      bsplist;
4805   SubmitBlockPtr  sbp;
4806   ErrSev          sev;
4807   SeqIdPtr        sip;
4808   Boolean         has_multi_int_genes = FALSE;
4809   Boolean         has_seg_bioseqs = FALSE;
4810   Boolean         isGPS = FALSE;
4811   Boolean         isPatent = FALSE;
4812   Boolean         isPDB = FALSE;
4813   Boolean         isWP = FALSE;
4814   FindRepData     frd;
4815   Int4            numInferences;
4816   Int4            numAccessions;
4817 
4818   if (sep == NULL || vsp == NULL) return FALSE;
4819 
4820   genetic_code_name_list = SetUpValidateGeneticCodes ();
4821 
4822   vsp->useSeqMgrIndexes = TRUE; /* now always use indexing */
4823 
4824   for (i = 0; i < 6; i++)       /* keep errors between clears */
4825     errors[i] = 0;
4826 
4827   MemSet ((Pointer) &featprob, 0, sizeof (FeatProb));
4828 
4829   if (vsp->useSeqMgrIndexes) {
4830     entityID = ObjMgrGetEntityIDForChoice (sep);
4831 
4832     if (SeqMgrFeaturesAreIndexed (entityID) == 0) {
4833       oldsev = ErrSetMessageLevel (SEV_MAX);
4834       SeqMgrIndexFeatures (entityID, NULL);
4835       ErrSetMessageLevel (oldsev);
4836     }
4837     SeqMgrExploreBioseqs (entityID, NULL, (Pointer) &featprob, CountMisplacedFeatures, TRUE, TRUE, TRUE);
4838 
4839     topsep = GetTopSeqEntryForEntityID (entityID);
4840     VisitGraphsInSep (topsep, (Pointer) &featprob, CheckGraphPacking);
4841     VisitFeaturesInSep (topsep, (Pointer) &featprob, CountGeneXrefs);
4842     VisitFeaturesInSep (topsep, (Pointer) &featprob, CountFeatLocIdTypes);
4843     /*
4844     VisitFeaturesInSep (topsep, (Pointer) &featprob, CountPseudogenes);
4845     */
4846     VisitBioseqsInSep (topsep, (Pointer) &featprob, CheckTpaHist);
4847     VisitBioSourcesInSep (topsep, (Pointer) &featprob, CheckTaxIDs);
4848   } else {
4849 //LCOV_EXCL_START
4850     /* if not using indexing, still need feature->idx.subtype now */
4851 
4852     entityID = ObjMgrGetEntityIDForChoice (sep);
4853     AssignIDsInEntity (entityID, 0, NULL);
4854 //LCOV_EXCL_STOP
4855   }
4856 
4857   Heartbeat(vsp, "Processing");
4858 
4859   /* Seq-submit can have multiple entries with no Bioseq-set wrapper */
4860 
4861   omdp = ObjMgrGetData (entityID);
4862   if (omdp != NULL && omdp->datatype == OBJ_SEQSUB) {
4863     ssp = (SeqSubmitPtr) omdp->dataptr;
4864     if (ssp != NULL && ssp->data != NULL) {
4865       if (sep->next != NULL) {
4866         do_many = TRUE;
4867         mult_subs = TRUE;
4868       }
4869     }
4870     if (ssp != NULL && ssp->sub != NULL && StringNICmp (ssp->sub->tool, "Geneious", 8) == 0) {
4871       vsp->is_geneious = TRUE;
4872     }
4873   }
4874 
4875   if (IS_Bioseq_set (sep)) {
4876     bssp = (BioseqSetPtr) (sep->data.ptrvalue);
4877     switch (bssp->_class) {
4878     /* case BioseqseqSet_class_genbank: */
4879     case BioseqseqSet_class_pir:
4880     case BioseqseqSet_class_gibb:
4881     case BioseqseqSet_class_gi:
4882     case BioseqseqSet_class_swissprot:
4883       sep = bssp->seq_set;
4884       do_many = TRUE;
4885       break;
4886     case BioseqseqSet_class_wgs_set:
4887       if(ssp != NULL)                   /* Seq-submit on top */
4888         ValidErr(vsp, SEV_WARNING, ERR_SEQ_PKG_SeqSubmitWithWgsSet,
4889                  "File was created as a wgs-set, but should be a batch submission instead.");
4890       break;
4891     case BioseqseqSet_class_gen_prod_set:
4892       isGPS = TRUE;
4893     default:
4894       break;
4895     }
4896   }
4897 
4898   /* if no pubs or biosource, only one message, not one per bioseq */
4899 
4900   if (mult_subs) {
4901     for (tmp = sep; tmp != NULL; tmp = tmp->next) {
4902       LookForAnyPubAndOrg (tmp, &suppress_no_pubs, &suppress_no_cit_subs, &suppress_no_biosrc);
4903     }
4904   } else {
4905     LookForAnyPubAndOrg (sep, &suppress_no_pubs, &suppress_no_cit_subs, &suppress_no_biosrc);
4906   }
4907 
4908   if (GetAppProperty ("ValidateExons") != NULL) {
4909     vsp->validateExons = TRUE;
4910   }
4911 
4912   vsp->is_htg_in_sep = FALSE;
4913   vsp->is_barcode_sep = FALSE;
4914   vsp->is_refseq_in_sep = FALSE;
4915   vsp->is_wp_in_sep = FALSE;
4916   vsp->is_gpipe_in_sep = FALSE;
4917   vsp->is_gps_in_sep = FALSE;
4918   vsp->other_sets_in_sep = FALSE;
4919   vsp->is_embl_ddbj_in_sep = FALSE;
4920   vsp->is_embl_tpe_in_sep = FALSE;
4921   vsp->is_old_gb_in_sep = FALSE;
4922   vsp->is_insd_in_sep = FALSE;
4923   vsp->is_pdb_in_sep = FALSE;
4924   vsp->has_gi_or_accn_ver = FALSE;
4925   vsp->has_gnl_prot_sep = FALSE;
4926   vsp->bsp_genomic_in_sep = FALSE;
4927   vsp->is_smupd_in_sep = FALSE;
4928 
4929   vsp->only_lcl_gnl_in_sep = TRUE;
4930 
4931   VisitBioseqsInSep (sep, (Pointer) vsp, LookForBioseqFields);
4932   VisitSetsInSep (sep, (Pointer) vsp, LookForBioseqSetFields);
4933   VisitDescriptorsInSep (sep, (Pointer) vsp, LookForSeqDescrFields);
4934 
4935   VisitFeaturesInSep (sep, (Pointer) &has_multi_int_genes, FindMultiIntervalGenes);
4936   vsp->has_multi_int_genes = has_multi_int_genes;
4937   VisitBioseqsInSep (sep, (Pointer) &has_seg_bioseqs, FindSegmentedBioseqs);
4938   vsp->has_seg_bioseqs = has_seg_bioseqs;
4939 
4940   vsp->feat_loc_has_gi = featprob.loc_has_gi;
4941   vsp->feat_prod_has_gi = featprob.prod_has_gi;
4942 
4943   globalvsp = vsp;              /* for spell checker */
4944 
4945   inferenceAccnCheck = vsp->inferenceAccnCheck;
4946 
4947   while (sep != NULL) {
4948     vsp->far_fetch_failure = FALSE;
4949 
4950     /* calculate strings for LookForMultipleUnpubPubs test only once for genome product set efficiency */
4951     VisitDescriptorsInSep (sep, NULL, SetPubScratchData);
4952 
4953     MemSet (&gs, 0, sizeof (GatherScope));
4954     gs.scope = sep;             /* default is to scope to this set */
4955 
4956     ValidStructClear (vsp);
4957     vsp->sep = sep;
4958 
4959     MemSet ((Pointer) &gc, 0, sizeof (GatherContext));
4960     gcp = &gc;
4961     gc.entityID = ObjMgrGetEntityIDForChoice (sep);
4962     gc.itemID = 1;
4963     if (IS_Bioseq (sep)) {
4964       gc.thistype = OBJ_BIOSEQ;
4965     } else {
4966       gc.thistype = OBJ_BIOSEQSET;
4967     }
4968     vsp->gcp = gcp;             /* above needed for ValidErr */
4969     vsp->suppress_no_pubs = suppress_no_pubs;
4970     vsp->suppress_no_cit_subs = suppress_no_cit_subs;
4971     vsp->suppress_no_biosrc = suppress_no_biosrc;
4972 
4973     if (vsp->is_refseq_in_sep && vsp->is_insd_in_sep) {
4974       ValidErr (vsp, SEV_ERROR, ERR_SEQ_PKG_INSDRefSeqPackaging,
4975                 "INSD and RefSeq records should not be present in the same set");
4976     }
4977 
4978     if (vsp->is_gps_in_sep && vsp->other_sets_in_sep) {
4979       ValidErr (vsp, SEV_ERROR, ERR_SEQ_PKG_GPSnonGPSPackaging,
4980                 "Genomic product set and mut/pop/phy/eco set records should not be present in the same set");
4981     }
4982 
4983     /* build seqmgr feature indices if not already done */
4984 
4985     bsplist = NULL;
4986     if (vsp->useSeqMgrIndexes) {
4987       entityID = ObjMgrGetEntityIDForChoice (sep);
4988 
4989       if (SeqMgrFeaturesAreIndexed (entityID) == 0) {
4990         //LCOV_EXCL_START
4991         //specific to C Toolkit indexing
4992         oldsev = ErrSetMessageLevel (SEV_MAX);
4993         SeqMgrIndexFeatures (entityID, NULL);
4994         ErrSetMessageLevel (oldsev);
4995         //LCOV_EXCL_STOP
4996       }
4997 
4998       /* lock all remote genome components, locations, and products in advance */
4999 
5000       limit = vsp->validationLimit;
5001       if (! ValTooManyFarComponents (sep)) {
5002         if (limit == VALIDATE_ALL || limit == VALIDATE_INST || limit == VALIDATE_HIST) {
5003           farFetchProd = (Boolean) (vsp->farFetchCDSproducts || vsp->farFetchMRNAproducts);
5004           oldsev = ErrSetMessageLevel (SEV_WARNING);
5005           bsplist = LockFarComponentsEx (sep, TRUE, TRUE, farFetchProd, NULL);
5006           ErrSetMessageLevel (oldsev);
5007         }
5008       }
5009     }
5010 
5011     fsep = FindNthBioseq (sep, 1);
5012     fbsp = NULL;
5013     if (fsep != NULL && IS_Bioseq (fsep)) {
5014       fbsp = (BioseqPtr) fsep->data.ptrvalue;
5015       /* report context as first bioseq */
5016       vsp->bsp = fbsp;
5017     }
5018 
5019     if (fbsp == NULL) {
5020       ValidErr (vsp, SEV_ERROR, ERR_SEQ_PKG_NoBioseqFound, "No Bioseqs in this entire record.");
5021     } else {
5022 
5023       for (sip = fbsp->id; sip != NULL; sip = sip->next) {
5024         if (sip->choice == SEQID_PATENT) {
5025           isPatent = TRUE;
5026         } else if (sip->choice == SEQID_PDB) {
5027           isPDB = TRUE;
5028         } else if (sip->choice == SEQID_OTHER) {
5029           tsip = (TextSeqIdPtr) sip->data.ptrvalue;
5030           if (tsip != NULL && tsip->accession != NULL) {
5031             if (StringNICmp (tsip->accession, "WP_", 3) == 0) {
5032               isWP = TRUE;
5033             }
5034           }
5035         }
5036       }
5037 
5038       if (first) {
5039         TestDeletedOrReplacedECnumbers (vsp);
5040 
5041         if (! vsp->seqSubmitParent) {
5042           omdp = ObjMgrGetData (gc.entityID);
5043           if (omdp == NULL || omdp->datatype != OBJ_SEQSUB) {
5044             if (suppress_no_pubs) {
5045               sev = SEV_ERROR;
5046               if ((!isGPS) && (!IsNoncuratedRefSeq (fbsp, &sev)) && (! IsGpipe (fbsp)) && (! IsWgsContig (fbsp)) && (! IsTsaContig (fbsp))) {
5047                 ValidErr (vsp, sev, ERR_SEQ_DESCR_NoPubFound, "No publications anywhere on this entire record.");
5048               }
5049             }
5050             if (suppress_no_cit_subs) {
5051               sev = SEV_INFO;
5052               if (vsp->genomeSubmission) {
5053                 sev = SEV_ERROR;
5054               }
5055               if ((! IsNoncuratedRefSeq (fbsp, &sev)) && (! IsWgsContig (fbsp)) && (! IsTsaContig (fbsp)) && (! IsWgsIntermediate (vsp->sep)) && (! IsTsaIntermediate (vsp->sep))) {
5056                 ValidErr (vsp, sev, ERR_GENERIC_MissingPubInfo, "No submission citation anywhere on this entire record.");
5057               }
5058             }
5059           }
5060         }
5061 
5062         if (suppress_no_biosrc) {
5063           if ((!isPatent) && ((!isPDB))) {
5064             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_NoOrgFound, "No organism name anywhere on this entire record.");
5065           }
5066         }
5067 
5068         if (featprob.num_misplaced_features > 1) {
5069           ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_FeaturePackagingProblem, "There are %d mispackaged features in this record.", (int) featprob.num_misplaced_features);
5070         } else if (featprob.num_misplaced_features == 1) {
5071           ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_FeaturePackagingProblem, "There is %d mispackaged feature in this record.", (int) featprob.num_misplaced_features);
5072         }
5073         if (featprob.num_small_genome_set_misplaced > 1) {
5074           ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_FeaturePackagingProblem, "There are %d mispackaged features in this small genome set record.", (int) featprob.num_small_genome_set_misplaced);
5075         } else if (featprob.num_small_genome_set_misplaced == 1) {
5076           ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_FeaturePackagingProblem, "There is %d mispackaged feature in this small genome set record.", (int) featprob.num_small_genome_set_misplaced);
5077         }
5078 
5079         if (featprob.num_misplaced_graphs > 1) {
5080           ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_GraphPackagingProblem, "There are %d mispackaged graphs in this record.", (int) featprob.num_misplaced_graphs);
5081         } else if (featprob.num_misplaced_graphs == 1) {
5082           ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_GraphPackagingProblem, "There is %d mispackaged graph in this record.", (int) featprob.num_misplaced_graphs);
5083         }
5084 
5085         /*
5086         if (featprob.num_archaic_locations > 1) {
5087           ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_ArchaicFeatureLocation, "There are %d archaic feature locations in this record.", (int) featprob.num_archaic_locations);
5088         } else if (featprob.num_archaic_locations == 1) {
5089           ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_ArchaicFeatureLocation, "There is %d archaic feature location in this record.", (int) featprob.num_archaic_locations);
5090         }
5091 
5092         if (featprob.num_archaic_products > 1) {
5093           ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_ArchaicFeatureProduct, "There are %d archaic feature products in this record.", (int) featprob.num_archaic_products);
5094         } else if (featprob.num_archaic_products == 1) {
5095           ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_ArchaicFeatureProduct, "There is %d archaic feature product in this record.", (int) featprob.num_archaic_products);
5096         }
5097         */
5098 
5099         if (featprob.num_gene_feats == 0 && featprob.num_gene_xrefs > 0) {
5100           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_OnlyGeneXrefs, "There are %ld gene xrefs and no gene features in this record.", (long) featprob.num_gene_xrefs);
5101         }
5102 
5103         if (featprob.num_tpa_with_hist > 0 && featprob.num_tpa_without_hist > 0) {
5104           ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_TpaAssmeblyProblem, "There are %ld TPAs with history and %ld without history in this record.",
5105                     (long) featprob.num_tpa_with_hist, (long) featprob.num_tpa_without_hist);
5106         }
5107 
5108         if (featprob.has_gi && featprob.num_tpa_without_hist > 0) {
5109           ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_TpaAssmeblyProblem, "There are %ld TPAs without history in this record, but the record has a gi number assignment.",
5110                     (long) featprob.num_tpa_without_hist);
5111         }
5112 
5113         /*
5114         if (featprob.num_pseudo != featprob.num_pseudogene && featprob.num_pseudo > 0 && featprob.num_pseudogene > 0) {
5115            ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InconsistentPseudogeneCounts,
5116                      "There are %ld pseudo features with %ld pseudogene qualifiers in this record.",
5117                      (long) featprob.num_pseudo, (long) featprob.num_pseudogene);
5118         }
5119         */
5120 
5121         if (featprob.mult_taxids && vsp->is_refseq_in_sep) {
5122           if (featprob.num_super_kingdom > 1 && featprob.super_kingdoms_different && isWP) {
5123           } else {
5124             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_MultipleTaxonIDs, "There are multiple taxonIDs in this RefSeq record.");
5125           }
5126         }
5127 
5128         if (vsp->indexerVersion && vsp->has_gnl_prot_sep && (! vsp->is_refseq_in_sep)) {
5129           if (FindNucBioseq (sep) != NULL) {
5130             ValidErr (vsp, SEV_INFO, ERR_SEQ_INST_ProteinsHaveGeneralID, "INDEXER_ONLY - Protein bioseqs have general seq-id.");
5131           }
5132         }
5133 
5134         first = FALSE;
5135       }
5136 
5137       vsp->bsp = NULL;
5138 
5139       topsep = GetTopSeqEntryForEntityID (gc.entityID);
5140       oldsep = SeqEntrySetScope (topsep);
5141 
5142       /* disabled for now
5143       FindLongIdsThatCollideWhenTruncated (topsep, vsp, 30);
5144       */
5145 
5146       /* do validator tests using Discrepancy Report */
5147       ValidateGeneLocusTags (topsep, vsp);
5148 
5149       VisitFeaturesInSep (sep, NULL, AddScratchToFeatures);
5150       VisitBioseqsInSep (sep, NULL, SetupFeatureScratchData);
5151 
5152       /* AssignIDsInEntity (gc.entityID, 0, NULL); */
5153 
5154       if (inferenceAccnCheck) {
5155         numInferences = 0;
5156         numAccessions = 0;
5157         if (TooManyInferenceAccessions (sep, &numInferences, &numAccessions) && numAccessions > 1000) {
5158           ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_TooManyInferenceAccessions,
5159                     "Skipping validation of %ld /inference qualifiers with %ld accessions",
5160                     (long) numInferences, (long) numAccessions);
5161 
5162           /* suppress inference accession.version check for this record */
5163           vsp->inferenceAccnCheck = FALSE;
5164         }
5165       }
5166 
5167       vsp->sisfp = NULL;
5168       GatherSeqEntry (sep, (Pointer) vsp, Valid1GatherProc, &gs);
5169       ValidateShortIntrons (topsep, vsp);
5170       if(vsp->sisfp != NULL)
5171       {
5172           ValNodeFree(vsp->sisfp);
5173           vsp->sisfp = NULL;
5174       }
5175 
5176       /* restore inferenceAccnCheck flag for next record */
5177       vsp->inferenceAccnCheck = inferenceAccnCheck;
5178 
5179       if (ssp != NULL) {
5180         if (ssp->datatype == 1) {
5181           vsp->bsp = NULL;
5182           vsp->bssp = NULL;
5183           vsp->sfp = NULL;
5184           vsp->descr = NULL;
5185           vsp->gcp = NULL;
5186           sbp = ssp->sub;
5187           if (sbp != NULL) {
5188             csp = sbp->cit;
5189             if (csp != NULL) {
5190               alp = csp->authors;
5191               if (alp != NULL) {
5192                 ValidateAffil (vsp, alp->affil);
5193               }
5194               ValidateCitSub (vsp, csp);
5195             }
5196             cip = sbp->contact;
5197             if (cip != NULL) {
5198               ap = cip->contact;
5199               if (ap != NULL) {
5200                 ValidateAffil (vsp, ap->affil);
5201               }
5202             }
5203             if (sbp->hup) {
5204               dp = sbp->reldate;
5205               cd = DateCurr ();
5206               if (dp != NULL && cd != NULL) {
5207                 if (DateMatch (dp, cd, FALSE) == -1) {
5208                   ValidErr (vsp, SEV_WARNING, ERR_GENERIC_PastReleaseDate,
5209                             "Record release date has already passed");
5210                 }
5211               }
5212               DateFree (cd);
5213             }
5214           }
5215         }
5216       }
5217 
5218       vsp->gcp = NULL;
5219       ValidateFeatCits (sep, vsp);
5220       vsp->gcp = NULL;
5221 
5222       vsp->gcp = NULL;
5223       ValidateFeatIDs (sep, gc.entityID, vsp);
5224       vsp->gcp = NULL;
5225 
5226       vsp->gcp = NULL;
5227       ValidateSeqIdCase (sep, vsp);
5228       vsp->gcp = NULL;
5229 
5230       if (vsp->validateAlignments) {
5231         vsp->gcp = NULL;
5232         ValidateSeqAlignWithinValidator (vsp, sep, vsp->alignFindRemoteBsp, vsp->doSeqHistAssembly);
5233         vsp->gcp = NULL;
5234       }
5235 
5236       if (vsp->far_fetch_failure) {
5237         //LCOV_EXCL_START
5238         //not testable in regression
5239         vsp->gcp = NULL;
5240         ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_FarFetchFailure, "Far fetch failures caused some validator tests to be bypassed");
5241         //LCOV_EXCL_STOP
5242       }
5243 
5244       VisitFeaturesInSep (sep, NULL, ClearScratchOnFeatures);
5245 
5246       SeqEntrySetScope (oldsep);
5247 
5248       VisitDescriptorsInSep (sep, NULL, ClearPubScratchData);
5249     }
5250 
5251     if (vsp->useSeqMgrIndexes) {
5252 
5253       /* unlock all pre-locked remote genome components */
5254 
5255       bsplist = UnlockFarComponents (bsplist);
5256     }
5257 
5258     if (do_many) {
5259       for (i = 0; i < 6; i++)
5260         errors[i] += vsp->errors[i];
5261       sep = sep->next;
5262     } else
5263       sep = NULL;
5264   }
5265 
5266   MemSet ((Pointer) &gc, 0, sizeof (GatherContext));
5267   gcp = &gc;
5268   gc.entityID = ObjMgrGetEntityIDForChoice (sep);
5269   vsp->gcp = gcp;
5270   frd.vsp = vsp;
5271   frd.gcp = gcp;
5272 
5273   limit = vsp->validationLimit;
5274   if (limit == VALIDATE_ALL) {
5275     /*
5276     frd.string = "?";
5277     */
5278     FindStringsInEntity (entityID, findrepstrs, FALSE, FALSE, FALSE, UPDATE_NEVER,
5279                          NULL, NULL, NULL, TRUE, FindRepValidate, (Pointer) &frd);
5280   }
5281 
5282   if (do_many) {
5283     for (i = 0; i < 6; i++)
5284       vsp->errors[i] = errors[i];
5285   }
5286 
5287   genetic_code_name_list = ValNodeFreeData (genetic_code_name_list);
5288 
5289   return TRUE;
5290 }
5291 
5292 
/*
 * ValidateSetContents
 *
 * Per-entry callback that tallies Bioseqs into the ValidStruct passed in
 * data: protcnt for amino acid sequences, nuccnt for every other molecule
 * type, and additionally segcnt when the representation is Seq_repr_seg.
 * Entries that are not Bioseqs are ignored.
 *
 *   sep    - entry being visited
 *   data   - ValidStructPtr receiving the counts
 *   index  - unused
 *   indent - unused
 */
static void ValidateSetContents (SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
{
  BioseqPtr       bsp;
  ValidStructPtr  vsp;

  vsp = (ValidStructPtr) data;
  if (sep == NULL || vsp == NULL) return;

  if (IS_Bioseq (sep)) {
    bsp = (BioseqPtr) (sep->data.ptrvalue);
    /* an entry without a Bioseq payload has nothing to count */
    if (bsp == NULL) return;
    if (ISA_aa (bsp->mol))
      vsp->protcnt++;
    else
      vsp->nuccnt++;
    if (bsp->repr == Seq_repr_seg){
        //LCOV_EXCL_START
        // Only for SegSets
        vsp->segcnt++;
        //LCOV_EXCL_STOP
    }
  }
  return;
}
5315 
5316 
5317 //LCOV_EXCL_START
5318 // Only for SegSets
GetBioseqSetClass(Uint1 cl)5319 static CharPtr GetBioseqSetClass (Uint1 cl)
5320 {
5321   if (cl == BioseqseqSet_class_nuc_prot)
5322     return ("nuc-prot");
5323   if (cl == BioseqseqSet_class_segset)
5324     return ("segset");
5325   if (cl == BioseqseqSet_class_conset)
5326     return ("conset");
5327   if (cl == BioseqseqSet_class_parts)
5328     return ("parts");
5329   if (cl == BioseqseqSet_class_gibb)
5330     return ("gibb");
5331   if (cl == BioseqseqSet_class_gi)
5332     return ("gi");
5333   if (cl == BioseqseqSet_class_genbank)
5334     return ("genbank");
5335   if (cl == BioseqseqSet_class_pir)
5336     return ("pir");
5337   if (cl == BioseqseqSet_class_pub_set)
5338     return ("pub-set");
5339   if (cl == BioseqseqSet_class_equiv)
5340     return ("equiv");
5341   if (cl == BioseqseqSet_class_swissprot)
5342     return ("swissprot");
5343   if (cl == BioseqseqSet_class_pdb_entry)
5344     return ("pdb-entry");
5345   if (cl == BioseqseqSet_class_mut_set)
5346     return ("mut-set");
5347   if (cl == BioseqseqSet_class_pop_set)
5348     return ("pop-set");
5349   if (cl == BioseqseqSet_class_phy_set)
5350     return ("phy-set");
5351   if (cl == BioseqseqSet_class_eco_set)
5352     return ("eco-set");
5353   if (cl == BioseqseqSet_class_gen_prod_set)
5354     return ("gen-prod-set");
5355   if (cl == BioseqseqSet_class_wgs_set)
5356     return ("wgs-set");
5357   if (cl == BioseqseqSet_class_small_genome_set)
5358     return ("small-genome-set");
5359   if (cl == BioseqseqSet_class_other)
5360     return ("other");
5361   return ("not-set");
5362 }
5363 //LCOV_EXCL_STOP
5364 
/*
 * FindGenProdSetParentOfBioseqSet
 *
 * Climbs the idx.parentptr chain starting from bssp and returns the first
 * enclosing Bioseq-set whose class is gen-prod-set.  bssp itself is never
 * returned.  Returns NULL when bssp is NULL, when a parent is not a
 * Bioseq-set, or when the chain ends without finding such a set.
 */
static BioseqSetPtr FindGenProdSetParentOfBioseqSet (BioseqSetPtr bssp)
{
  BioseqSetPtr  ancestor = bssp;

  while (ancestor != NULL && ancestor->idx.parenttype == OBJ_BIOSEQSET) {
    /* step up one level, then test the set just reached */
    ancestor = (BioseqSetPtr) ancestor->idx.parentptr;
    if (ancestor != NULL && ancestor->_class == BioseqseqSet_class_gen_prod_set) {
      return ancestor;
    }
  }

  return NULL;
}
5379 
5380 
FindGenProdSetParentOfBioseq(BioseqPtr bsp)5381 static BioseqSetPtr FindGenProdSetParentOfBioseq (BioseqPtr bsp)
5382 {
5383   BioseqSetPtr bssp;
5384   if (bsp == NULL) {
5385     return NULL;
5386   } else if (bsp->idx.parenttype != OBJ_BIOSEQSET) {
5387     return NULL;
5388   } else if ((bssp = (BioseqSetPtr)bsp->idx.parentptr) == NULL) {
5389     return NULL;
5390   } else if (bssp->_class == BioseqseqSet_class_gen_prod_set) {
5391     return bssp;
5392   } else {
5393     return FindGenProdSetParentOfBioseqSet (bssp);
5394   }
5395 }
5396 
5397 
/*
 * IfInGPSmustBeMrnaProduct
 *
 * For a Bioseq that sits inside a gen-prod-set, warns when
 * SeqMgrGetRNAgivenProduct finds no RNA feature that has it as product.
 * Bioseqs outside any gen-prod-set are not checked.
 */
static void IfInGPSmustBeMrnaProduct (ValidStructPtr vsp, BioseqPtr bsp)

{
  /* only applies inside a genomic product set */
  if (FindGenProdSetParentOfBioseq(bsp) == NULL) return;

  if (SeqMgrGetRNAgivenProduct (bsp, NULL) != NULL) return;

  ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_GenomicProductPackagingProblem, "Nucleotide bioseq should be product of mRNA feature on contig, but is not");
}
5408 
/*
 * IfInGPSmustBeCDSProduct
 *
 * For a Bioseq that sits inside a gen-prod-set, warns unless at least one
 * feature in its product list (SeqMgrGetSfpProductList) is located on the
 * contig, i.e. the Bioseq that is the first member of that gen-prod-set.
 * Nothing is reported when the Bioseq is not in a gen-prod-set, or when
 * the set's first member is missing or is not a Bioseq.
 */
static void IfInGPSmustBeCDSProduct (ValidStructPtr vsp, BioseqPtr bsp)

{
  BioseqSetPtr  gps;
  BioseqPtr     contig;
  SeqEntryPtr   first_member;
  ValNodePtr    node;
  SeqFeatPtr    feat;

  /* only applies inside a genomic product set */
  gps = FindGenProdSetParentOfBioseq(bsp);
  if (gps == NULL) return;

  first_member = gps->seq_set;
  if (first_member == NULL || ! IS_Bioseq (first_member)) return;

  contig = (BioseqPtr) first_member->data.ptrvalue;
  if (contig == NULL) return;

  /* satisfied as soon as one product feature lies on the contig */
  for (node = SeqMgrGetSfpProductList (bsp); node != NULL; node = node->next) {
    feat = (SeqFeatPtr) node->data.ptrvalue;
    if (feat != NULL && BioseqFindFromSeqLoc (feat->location) == contig) {
      return;
    }
  }

  ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_GenomicProductPackagingProblem, "Protein bioseq should be product of CDS feature on contig, but is not");
}
5434 
/* Forward declaration only; the definition is not in this part of the file.
   ValidateNucProtSet below calls it with Seq_descr_source to test whether a
   protein Bioseq has a BioSource descriptor. */
NLM_EXTERN ValNodePtr BioseqGetSeqDescr(BioseqPtr bsp, Int2 type, ValNodePtr curr);
5436 
5437 
/*
 * ValidateNucProtSet
 *
 * Checks that apply to a Bioseq-set of class nuc-prot; sets of any other
 * class are ignored.  In order, it
 *   - runs the gen-prod-set product checks on every member Bioseq,
 *   - rejects a nested Bioseq-set that is not a segset (the member scan
 *     stops at the first offender),
 *   - warns when member proteins carry their own BioSource descriptor,
 *   - warns about a title descriptor on the set,
 *   - reports a RefGeneTracking user object on the set unless a member
 *     nucleotide has an "NM_" accession,
 *   - warns when the set has no BioSource descriptor with a taxname,
 *     unless the record being validated (vsp->sep) is a gen-prod-set,
 *     possibly wrapped in a genbank set.
 */
static void ValidateNucProtSet (BioseqSetPtr bssp, ValidStructPtr vsp)

{
  BioSourcePtr   source;
  BioseqPtr      member;
  BioseqSetPtr   nested;
  BioseqSetPtr   record_set;
  SeqDescrPtr    desc;
  SeqEntryPtr    entry;
  SeqIdPtr       id;
  TextSeqIdPtr   text_id;
  UserObjectPtr  user;
  Boolean        has_nm_accession = FALSE;
  Int4           proteins_with_source = 0;

  if (bssp->_class != BioseqseqSet_class_nuc_prot) {
    return;
  }

  for (entry = bssp->seq_set; entry != NULL; entry = entry->next) {
    if (IS_Bioseq (entry)) {
      member = (BioseqPtr) entry->data.ptrvalue;
      if (member == NULL) {
        continue;
      }
      if (ISA_na (member->mol)) {
        IfInGPSmustBeMrnaProduct (vsp, member);
        /* remember whether any nucleotide has an NM_ accession */
        for (id = member->id; id != NULL; id = id->next) {
          if (id->choice == SEQID_OTHER) {
            text_id = (TextSeqIdPtr) id->data.ptrvalue;
            if (text_id != NULL && StringNCmp (text_id->accession, "NM_", 3) == 0) {
              has_nm_accession = TRUE;
            }
          }
        }
      } else if (ISA_aa (member->mol)) {
        IfInGPSmustBeCDSProduct (vsp, member);
        if (BioseqGetSeqDescr (member, Seq_descr_source, NULL) != NULL) {
          proteins_with_source++;
        }
      }
    }

    if (IS_Bioseq_set (entry)) {
      nested = entry->data.ptrvalue;
      if (nested != NULL && nested->_class != BioseqseqSet_class_segset) {
        ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_NucProtNotSegSet,
                  "Nuc-prot Bioseq-set contains wrong Bioseq-set, its class is \"%s\".", GetBioseqSetClass (nested->_class));
        break;
      }
    }
  }

  /* singular and plural wording differ */
  if (proteins_with_source == 1) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceOnProtein,
              "Nuc-prot set has %ld protein with a BioSource descriptor", (long) proteins_with_source);
  } else if (proteins_with_source > 1) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceOnProtein,
              "Nuc-prot set has %ld proteins with a BioSource descriptor", (long) proteins_with_source);
  }

  /* one warning per title descriptor found on the set */
  for (desc = bssp->descr; desc != NULL; desc = desc->next) {
    if (desc->choice != Seq_descr_title) continue;
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_NucProtSetHasTitle,
              "Nuc-prot set should not have title descriptor");
  }

  if (! has_nm_accession) {
    for (desc = bssp->descr; desc != NULL; desc = desc->next) {
      if (desc->choice != Seq_descr_user) continue;
      user = (UserObjectPtr) desc->data.ptrvalue;
      if (user == NULL || user->type == NULL) continue;
      if (StringICmp (user->type->str, "RefGeneTracking") == 0) {
        ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_RefGeneTrackingOnNucProtSet,
                  "Nuc-prot set should not have RefGeneTracking user object");
      }
    }
  }

  /* a BioSource with a non-empty taxname on the set ends the checks */
  for (desc = bssp->descr; desc != NULL; desc = desc->next) {
    if (desc->choice != Seq_descr_source) continue;
    source = (BioSourcePtr) desc->data.ptrvalue;
    if (source == NULL || source->org == NULL) continue;
    if (StringDoesHaveText (source->org->taxname)) {
      return;
    }
  }

  /* no BioSource on the set: tolerated when the record is a gen-prod-set,
     either directly or as the first member of a genbank set */
  entry = vsp->sep;
  if (entry != NULL && IS_Bioseq_set (entry)) {
    record_set = (BioseqSetPtr) entry->data.ptrvalue;
    if (record_set != NULL && record_set->_class == BioseqseqSet_class_genbank) {
      entry = record_set->seq_set;
      if (entry != NULL && IS_Bioseq_set (entry)) {
        record_set = (BioseqSetPtr) entry->data.ptrvalue;
      }
    }
    if (record_set != NULL && record_set->_class == BioseqseqSet_class_gen_prod_set) {
      return;
    }
  }

  ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceMissing,
            "Nuc-prot set does not contain expected BioSource descriptor");
}
5551 
/* State shared with FindInconsistMolInfos while descriptors are visited:
   records whether the MolInfo descriptors seen so far agree on biomol. */
typedef struct incons {
  Boolean     diffs;  /* set TRUE when a visited MolInfo's biomol differs from mip's */
  MolInfoPtr  mip;    /* first MolInfo encountered; NULL until one has been seen */
} Incons, PNTR InconsPtr;
5556 
5557 //LCOV_EXCL_START
5558 // Only for SegSets
/*
 * FindInconsistMolInfos
 *
 * Descriptor callback.  userdata is an InconsPtr: the first MolInfo
 * descriptor visited is stored in its mip field, and every later MolInfo
 * whose biomol differs from that one sets its diffs field to TRUE.
 * Descriptors of other types, and NULL arguments or payloads, are ignored.
 */
static void FindInconsistMolInfos (SeqDescrPtr sdp, Pointer userdata)

{
  InconsPtr   state;
  MolInfoPtr  current;

  if (sdp == NULL) return;
  if (sdp->choice != Seq_descr_molinfo) return;

  state = (InconsPtr) userdata;
  current = (MolInfoPtr) sdp->data.ptrvalue;
  if (state == NULL || current == NULL) return;

  if (state->mip == NULL) {
    /* first MolInfo seen becomes the reference */
    state->mip = current;
    return;
  }

  if (state->mip->biomol != current->biomol) {
    state->diffs = TRUE;
  }
}
5577 
5578 // Only for SegSets
/* Checks a Bioseq-set of class segset; returns immediately for any
 * other class.  Reports:
 *   - a mixture of nucleotide and protein Bioseqs among the direct
 *     members (Bioseqs with mol "other" are not compared),
 *   - the first directly nested Bioseq-set whose class is not parts
 *     (scanning of members stops there),
 *   - differing biomol values among the MolInfo descriptors handed to
 *     FindInconsistMolInfos by VisitDescriptorsInSet. */
static void ValidateSegmentedSet (BioseqSetPtr bssp, ValidStructPtr vsp)

{
  SeqEntryPtr     entry;
  BioseqSetPtr    child_set;
  BioseqPtr       seq;
  Incons          scan;
  Uint1           ref_mol = 0;

  if (bssp->_class != BioseqseqSet_class_segset) return;

  for (entry = bssp->seq_set; entry != NULL; entry = entry->next) {
    if (IS_Bioseq (entry)) {
      seq = (BioseqPtr) entry->data.ptrvalue;
      if (seq != NULL) {
        if (ref_mol == 0 || ref_mol == Seq_mol_other) {
          /* no usable reference molecule type yet - take this one */
          ref_mol = seq->mol;
        } else if (seq->mol != Seq_mol_other && (ISA_na (seq->mol) != ISA_na (ref_mol))) {
          ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_SegSetMixedBioseqs, "Segmented set contains mixture of nucleotides and proteins");
        }
      }
    }

    if (IS_Bioseq_set (entry)) {
      child_set = entry->data.ptrvalue;
      if (child_set != NULL && child_set->_class != BioseqseqSet_class_parts) {
        ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_SegSetNotParts,
                  "Segmented set contains wrong Bioseq-set, its class is \"%s\".", GetBioseqSetClass (child_set->_class));
        break;
      }
    }
  }

  scan.mip = NULL;
  scan.diffs = FALSE;
  VisitDescriptorsInSet (bssp, (Pointer) &scan, FindInconsistMolInfos);
  if (scan.diffs) {
    ValidErr (vsp, SEV_ERROR, ERR_SEQ_PKG_InconsistentMolInfoBiomols, "Segmented set contains inconsistent MolInfo biomols");
  }
}
5626 
5627 // Only for SegSets
/* Checks a Bioseq-set of class parts; returns immediately for any
 * other class.  Reports (once each) a mixture of nucleotide and
 * protein Bioseqs among the direct members, and the presence of a
 * directly nested Bioseq-set. */
static void ValidatePartsSet (BioseqSetPtr bssp, ValidStructPtr vsp)

{
  SeqEntryPtr     entry;
  BioseqSetPtr    nested;
  BioseqPtr       seq;
  Uint1           ref_mol = 0;

  if (bssp->_class != BioseqseqSet_class_parts) return;

  /* pass 1: molecule types of member Bioseqs must agree (na vs. aa) */
  for (entry = bssp->seq_set; entry != NULL; entry = entry->next) {
    if (!IS_Bioseq (entry)) continue;
    seq = (BioseqPtr) entry->data.ptrvalue;
    if (seq == NULL) continue;

    if (ref_mol == 0 || ref_mol == Seq_mol_other) {
      ref_mol = seq->mol;
      continue;
    }
    if (seq->mol == Seq_mol_other) continue;

    if (ISA_na (seq->mol) != ISA_na (ref_mol)) {
      ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_PartsSetMixedBioseqs, "Parts set contains mixture of nucleotides and proteins");
      break;
    }
  }

  /* pass 2: a parts set must not contain Bioseq-sets; report the first */
  for (entry = bssp->seq_set; entry != NULL; entry = entry->next) {
    if (!IS_Bioseq_set (entry)) continue;
    nested = entry->data.ptrvalue;
    if (nested != NULL) {
      ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_PartsSetHasSets,
                "Parts set contains unwanted Bioseq-set, its class is \"%s\".", GetBioseqSetClass (nested->_class));
      break;
    }
  }
}
5667 //LCOV_EXCL_STOP
5668 
CheckForInconsistentBiosources(SeqEntryPtr sep,ValidStructPtr vsp,OrgRefPtr PNTR orpp,BioseqSetPtr top)5669 static Boolean CheckForInconsistentBiosources (SeqEntryPtr sep, ValidStructPtr vsp, OrgRefPtr PNTR orpp, BioseqSetPtr top)
5670 
5671 {
5672   BioseqPtr       bsp;
5673   BioseqSetPtr    bssp;
5674   SeqEntryPtr     tmp;
5675   ValNodePtr      sdp;
5676   SeqFeatPtr      sfp;
5677   SeqMgrDescContext dcontext;
5678   SeqMgrFeatContext fcontext;
5679   BioSourcePtr    biop;
5680   OrgRefPtr       orp;
5681   OrgRefPtr       firstorp;
5682   GatherContextPtr gcp;
5683   Uint2           entityID = 0, oldEntityID;
5684   Uint4           itemID = 0, oldItemID;
5685   Uint2           itemtype = 0, oldItemtype;
5686   size_t          len, len1, len2;
5687   ErrSev          sev;
5688   CharPtr         sp;
5689 
5690   if (sep == NULL || vsp == NULL || orpp == NULL)
5691     return FALSE;
5692   gcp = vsp->gcp;
5693 
5694   if (IS_Bioseq_set (sep)) {
5695     bssp = (BioseqSetPtr) sep->data.ptrvalue;
5696     if (bssp == NULL)
5697       return FALSE;
5698     for (tmp = bssp->seq_set; tmp != NULL; tmp = tmp->next) {
5699       if (CheckForInconsistentBiosources (tmp, vsp, orpp, top))
5700         return TRUE;
5701     }
5702     return FALSE;
5703   }
5704 
5705   if (!IS_Bioseq (sep))
5706     return FALSE;
5707   bsp = (BioseqPtr) sep->data.ptrvalue;
5708   if (bsp == NULL)
5709     return FALSE;
5710 
5711   biop = NULL;
5712   sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
5713   if (sdp != NULL) {
5714     biop = (BioSourcePtr) sdp->data.ptrvalue;
5715     entityID = dcontext.entityID;
5716     itemID = dcontext.itemID;
5717     itemtype = OBJ_SEQDESC;
5718   } else {
5719     sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_BIOSRC, 0, &fcontext);
5720     if (sfp != NULL) {
5721       biop = (BioSourcePtr) sfp->data.value.ptrvalue;
5722       entityID = fcontext.entityID;
5723       itemID = fcontext.itemID;
5724       itemtype = OBJ_SEQFEAT;
5725     }
5726   }
5727   if (biop == NULL)
5728     return FALSE;
5729   orp = biop->org;
5730   if (orp == NULL)
5731     return FALSE;
5732 
5733   firstorp = *orpp;
5734   if (firstorp == NULL) {
5735     *orpp = orp;
5736     return FALSE;
5737   }
5738 
5739   if (StringICmp (orp->taxname, firstorp->taxname) == 0)
5740     return FALSE;
5741 
5742   sev = SEV_ERROR;
5743   sp = StringStr (orp->taxname, " sp. ");
5744   if (sp != NULL) {
5745     len = sp - orp->taxname + 5;
5746     if (StringNCmp (orp->taxname, firstorp->taxname, len) == 0) {
5747       sev = SEV_WARNING;
5748     }
5749   }
5750 
5751   if (sev == SEV_ERROR) {
5752     len1 = StringLen (orp->taxname);
5753     len2 = StringLen (firstorp->taxname);
5754     len = MIN (len1, len2);
5755     if (len > 0 && StringNCmp (orp->taxname, firstorp->taxname, len) == 0) {
5756       sev = SEV_WARNING;
5757     }
5758   }
5759 
5760   oldEntityID = gcp->entityID;
5761   oldItemID = gcp->itemID;
5762   oldItemtype = gcp->thistype;
5763 
5764   gcp->entityID = entityID;
5765   gcp->itemID = itemID;
5766   gcp->thistype = itemtype;
5767 
5768   if (top != NULL) {
5769     gcp->entityID = top->idx.entityID;
5770     gcp->itemID = top->idx.itemID;
5771     gcp->thistype = OBJ_BIOSEQSET;
5772   }
5773 
5774   /* only report the first one that doesn't match - but might be lower severity if not all are sp. */
5775 
5776   ValidErr (vsp, sev, ERR_SEQ_DESCR_InconsistentBioSources, "Population set contains inconsistent organisms.");
5777 
5778   gcp->entityID = oldEntityID;
5779   gcp->itemID = oldItemID;
5780   gcp->thistype = oldItemtype;
5781 
5782   return TRUE;
5783 }
5784 
CheckForInconsistentMolInfos(SeqEntryPtr sep,ValidStructPtr vsp,MolInfoPtr PNTR mipp,BioseqSetPtr top)5785 static Boolean CheckForInconsistentMolInfos (SeqEntryPtr sep, ValidStructPtr vsp, MolInfoPtr PNTR mipp, BioseqSetPtr top)
5786 
5787 {
5788   BioseqPtr          bsp;
5789   BioseqSetPtr       bssp;
5790   SeqMgrDescContext  dcontext;
5791   Uint2              entityID = 0, oldEntityID;
5792   MolInfoPtr         firstmip;
5793   GatherContextPtr   gcp;
5794   Uint4              itemID = 0, oldItemID;
5795   Uint2              itemtype = 0, oldItemtype;
5796   MolInfoPtr         mip;
5797   ValNodePtr         sdp;
5798   SeqEntryPtr        tmp;
5799 
5800   if (sep == NULL || vsp == NULL || mipp == NULL)
5801     return FALSE;
5802   gcp = vsp->gcp;
5803 
5804   if (IS_Bioseq_set (sep)) {
5805     bssp = (BioseqSetPtr) sep->data.ptrvalue;
5806     if (bssp == NULL)
5807       return FALSE;
5808     for (tmp = bssp->seq_set; tmp != NULL; tmp = tmp->next) {
5809       if (CheckForInconsistentMolInfos (tmp, vsp, mipp, top))
5810         return TRUE;
5811     }
5812     return FALSE;
5813   }
5814 
5815   if (!IS_Bioseq (sep))
5816     return FALSE;
5817   bsp = (BioseqPtr) sep->data.ptrvalue;
5818   if (bsp == NULL)
5819     return FALSE;
5820 
5821   sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
5822   if (sdp == NULL) return FALSE;
5823   mip = (MolInfoPtr) sdp->data.ptrvalue;
5824   if (mip == NULL || mip->biomol == MOLECULE_TYPE_PEPTIDE) return FALSE;
5825 
5826   firstmip = *mipp;
5827   if (firstmip == NULL) {
5828     *mipp = mip;
5829     return FALSE;
5830   }
5831 
5832   if (mip->biomol == firstmip->biomol) return FALSE;
5833 
5834   oldEntityID = gcp->entityID;
5835   oldItemID = gcp->itemID;
5836   oldItemtype = gcp->thistype;
5837 
5838   gcp->entityID = entityID;
5839   gcp->itemID = itemID;
5840   gcp->thistype = itemtype;
5841 
5842   if (top != NULL) {
5843     gcp->entityID = top->idx.entityID;
5844     gcp->itemID = top->idx.itemID;
5845     gcp->thistype = OBJ_BIOSEQSET;
5846   }
5847 
5848   /* only report the first one that doesn't match */
5849 
5850   ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_InconsistentMolInfoBiomols, "Pop/phy/mut/eco set contains inconsistent MolInfo biomols");
5851 
5852   gcp->entityID = oldEntityID;
5853   gcp->itemID = oldItemID;
5854   gcp->thistype = oldItemtype;
5855 
5856   return TRUE;
5857 }
5858 
/* Runs CheckForInconsistentMolInfos over each direct member of bssp,
 * sharing one reference MolInfo across all of them and attributing
 * any message to bssp.  Stops at the first reported inconsistency. */
static void LookForMolInfoInconsistency (BioseqSetPtr bssp, ValidStructPtr vsp)

{
  MolInfoPtr    reference = NULL;
  SeqEntryPtr   member;

  if (bssp == NULL) return;

  member = bssp->seq_set;
  while (member != NULL && !CheckForInconsistentMolInfos (member, vsp, &reference, bssp)) {
    member = member->next;
  }
}
5872 
SetHasMolInfo(BioseqSetPtr bssp)5873 static Boolean SetHasMolInfo (BioseqSetPtr bssp)
5874 
5875 {
5876   SeqDescrPtr  sdp;
5877 
5878   if (bssp == NULL) return FALSE;
5879 
5880   for (sdp = bssp->descr; sdp != NULL; sdp = sdp->next) {
5881     if (sdp->choice == Seq_descr_molinfo) return TRUE;
5882   }
5883 
5884   return FALSE;
5885 }
5886 
/* Checks a Bioseq-set of class pop_set; returns immediately for any
 * other class.  In order, reports: a pop-set in a RefSeq record,
 * directly nested GenBank sets, a MolInfo descriptor on the set
 * itself, inconsistent MolInfo biomols among members, and the first
 * inconsistent organism among members. */
static void ValidatePopSet (BioseqSetPtr bssp, ValidStructPtr vsp)

{
  BioseqSetPtr  inner;
  OrgRefPtr     first_org = NULL;
  SeqEntryPtr   entry;

  if (bssp->_class != BioseqseqSet_class_pop_set) return;

  if (vsp->is_refseq_in_sep) {
    ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_RefSeqPopSet,
              "RefSeq record should not be a Pop-set");
  }

  for (entry = bssp->seq_set; entry != NULL; entry = entry->next) {
    if (IS_Bioseq_set (entry)) {
      inner = entry->data.ptrvalue;
      if (inner != NULL && inner->_class == BioseqseqSet_class_genbank) {
        ValidErr (vsp, SEV_INFO, ERR_SEQ_PKG_InternalGenBankSet,
                  "Bioseq-set contains internal GenBank Bioseq-set");
      }
    }
  }

  if (SetHasMolInfo (bssp)) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_MisplacedMolInfo, "Pop set has MolInfo on set");
  }

  LookForMolInfoInconsistency (bssp, vsp);

  /* organism check stops as soon as one mismatch has been reported */
  entry = bssp->seq_set;
  while (entry != NULL) {
    if (CheckForInconsistentBiosources (entry, vsp, &first_org, bssp)) break;
    entry = entry->next;
  }
}
5924 
5925 //LCOV_EXCL_START
/* Accumulator passed as userdata to CheckMutSetSources. */
typedef struct mutsetsrcdata {
  CharPtr  taxname;             /* first non-blank taxname seen; borrowed pointer, not a copy */
  Int2     num_not_mut_origin;  /* BioSources (with non-blank taxname) whose origin is not ORG_MUT */
  Boolean  failed;              /* set TRUE when a later taxname differs from taxname (StringCmp) */
} MutSetSrcData, PNTR MutSetSrcPtr;
5931 
/* BioSource callback: userdata is a MutSetSrcPtr.  BioSources without
 * an OrgRef or with a blank taxname are ignored.  Otherwise the first
 * taxname is remembered, any later taxname that differs (case
 * sensitive) marks the accumulator as failed, and BioSources whose
 * origin is not ORG_MUT are counted. */
static void CheckMutSetSources (BioSourcePtr biop, Pointer userdata)

{
  MutSetSrcPtr  acc = (MutSetSrcPtr) userdata;
  OrgRefPtr     org;

  if (biop == NULL || acc == NULL) return;

  org = biop->org;
  if (org == NULL) return;
  if (StringHasNoText (org->taxname)) return;

  if (acc->taxname == NULL) {
    acc->taxname = org->taxname;
  } else {
    if (StringCmp (acc->taxname, org->taxname) != 0) {
      acc->failed = TRUE;
    }
  }

  if (biop->origin != ORG_MUT) {
    acc->num_not_mut_origin += 1;
  }
}
5952 //LCOV_EXCL_STOP
5953 
/* Checks a Bioseq-set of class mut_set; returns immediately for any
 * other class.  Reports directly nested GenBank sets, a MolInfo
 * descriptor on the set itself, and inconsistent MolInfo biomols
 * among members.  The organism consistency check is suppressed. */
static void ValidateMutSet (BioseqSetPtr bssp, ValidStructPtr vsp)

{
  BioseqSetPtr   inner;
  SeqEntryPtr    entry;

  if (bssp->_class != BioseqseqSet_class_mut_set) return;

  for (entry = bssp->seq_set; entry != NULL; entry = entry->next) {
    if (IS_Bioseq_set (entry)) {
      inner = entry->data.ptrvalue;
      if (inner != NULL && inner->_class == BioseqseqSet_class_genbank) {
        ValidErr (vsp, SEV_INFO, ERR_SEQ_PKG_InternalGenBankSet,
                  "Bioseq-set contains internal GenBank Bioseq-set");
      }
    }
  }

  if (SetHasMolInfo (bssp)) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_MisplacedMolInfo, "Mut set has MolInfo on set");
  }

  LookForMolInfoInconsistency (bssp, vsp);

  /* error is currently suppressed (would need a local: MutSetSrcData mssd;)
  MemSet ((Pointer) &mssd, 0, sizeof (MutSetSrcData));
  VisitBioSourcesInSet (bssp, (Pointer) &mssd, CheckMutSetSources);
  if (mssd.failed) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_InconsistentBioSources, "Mutation set contains inconsistent organisms.");
  }
  if (mssd.num_not_mut_origin > 1) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_InconsistentBioSources, "Mutation set contains more than one non-mutant organism.");
  }
  */
}
5992 
/* Reports each directly nested Bioseq-set of class genbank, and a
 * MolInfo descriptor placed on the set itself.  The class of bssp is
 * not checked here. */
static void ValidateGenbankSet (BioseqSetPtr bssp, ValidStructPtr vsp)

{
  BioseqSetPtr    inner;
  SeqEntryPtr     entry;

  for (entry = bssp->seq_set; entry != NULL; entry = entry->next) {
    if (IS_Bioseq_set (entry)) {
      inner = entry->data.ptrvalue;
      if (inner != NULL && inner->_class == BioseqseqSet_class_genbank) {
        ValidErr (vsp, SEV_INFO, ERR_SEQ_PKG_InternalGenBankSet,
                  "Bioseq-set contains internal GenBank Bioseq-set");
      }
    }
  }

  if (SetHasMolInfo (bssp)) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_MisplacedMolInfo, "Genbank set has MolInfo on set");
  }
}
6014 
/* Reports each directly nested Bioseq-set of class genbank, a MolInfo
 * descriptor placed on the set itself, and inconsistent MolInfo
 * biomols among members.  The class of bssp is not checked here. */
static void ValidatePhyEcoWgsSet (BioseqSetPtr bssp, ValidStructPtr vsp)

{
  BioseqSetPtr  inner;
  SeqEntryPtr   entry;

  for (entry = bssp->seq_set; entry != NULL; entry = entry->next) {
    if (IS_Bioseq_set (entry)) {
      inner = entry->data.ptrvalue;
      if (inner != NULL && inner->_class == BioseqseqSet_class_genbank) {
        ValidErr (vsp, SEV_INFO, ERR_SEQ_PKG_InternalGenBankSet,
                  "Bioseq-set contains internal GenBank Bioseq-set");
      }
    }
  }

  if (SetHasMolInfo (bssp)) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_MisplacedMolInfo, "Phy/eco/wgs set has MolInfo on set");
  }

  LookForMolInfoInconsistency (bssp, vsp);
}
6038 
/* Checks a Bioseq-set of class gen_prod_set; returns immediately for
 * any other class.
 *
 * Reports a Seq-annot packaged directly on the set.  Then, provided
 * the first member is a Bioseq and a gather context is available:
 *   - when SeqMgr indexes are in use, every non-pseudo mRNA feature on
 *     that Bioseq (whose gene is not pseudo either) must have its
 *     product Bioseq findable; a missing product is reported against
 *     the feature unless the product id is of choice SEQID_OTHER,
 *   - a MolInfo descriptor on the set itself is reported against the
 *     set.
 *
 * gcp->itemID / gcp->thistype are changed while feature-level messages
 * are issued and restored before the set-level message and on exit. */
static void ValidateGenProdSet (BioseqSetPtr bssp, ValidStructPtr vsp)

{
  BioseqPtr       bsp;
  BioseqPtr       cdna;
  SeqMgrFeatContext fcontext;
  GatherContextPtr gcp = NULL;
  CharPtr         loc = NULL;
  SeqFeatPtr      mrna;
  Uint2           olditemtype = 0;
  Uint4           olditemid = 0;
  Boolean         pseudo;
  SeqEntryPtr     sep;
  SeqIdPtr        sip;
  Boolean         suppressed;

  if (bssp->_class != BioseqseqSet_class_gen_prod_set)
    return;

  if (bssp->annot != NULL) {
    ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_GenomicProductPackagingProblem, "Seq-annot packaged directly on genomic product set");
  }

  /* an empty set has no first member - IS_Bioseq must not see NULL */
  sep = bssp->seq_set;
  if (sep == NULL || !IS_Bioseq (sep))
    return;
  bsp = (BioseqPtr) sep->data.ptrvalue;
  if (bsp == NULL)
    return;

  gcp = vsp->gcp;
  if (gcp == NULL)
    return;
  olditemid = gcp->itemID;
  olditemtype = gcp->thistype;

  if (vsp->useSeqMgrIndexes) {
    for (mrna = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_mRNA, &fcontext);
         mrna != NULL;
         mrna = SeqMgrGetNextFeature (bsp, mrna, 0, FEATDEF_mRNA, &fcontext)) {
      if (mrna->pseudo) continue;

      /* start from a known state so a stale flag from the previous
         feature can never leak into this iteration */
      pseudo = FALSE;
      suppressed = FALSE;
      GetGeneByFeat (mrna, &pseudo, &suppressed);
      if (pseudo) continue;

      cdna = BioseqFindFromSeqLoc (mrna->product);
      if (cdna != NULL) continue;

      /* okay to have far RefSeq product */
      sip = SeqLocId (mrna->product);
      if (sip != NULL && sip->choice == SEQID_OTHER) continue;

      /* attribute the message to the mRNA feature */
      gcp->itemID = mrna->idx.itemID;
      gcp->thistype = OBJ_SEQFEAT;

      loc = SeqLocPrint (mrna->product);
      if (loc == NULL) {
        loc = StringSave ("?");
      }
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_GenomicProductPackagingProblem, "Product of mRNA feature (%s) not packaged in genomic product set", loc);
      MemFree (loc);
    }
  }

  /* back to the set's own context before the set-level message, so it is
     not attributed to the last mRNA feature reported above */
  gcp->itemID = olditemid;
  gcp->thistype = olditemtype;

  if (SetHasMolInfo (bssp)) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_MisplacedMolInfo, "GenProd set has MolInfo on set");
  }
}
6107 
/* Bioseq-set callback: userdata is a ValidStructPtr.  Warns about any
 * visited set whose class is not nuc_prot, segset or parts.  Nothing
 * is reported when the validator state or its gather context is
 * missing. */
static void NestedSetProc (BioseqSetPtr bssp, Pointer userdata)

{
  ValidStructPtr   vsp = (ValidStructPtr) userdata;

  if (bssp == NULL) return;

  /* pop/phy/mut/eco set can contain up to nuc-prot sets */
  if (bssp->_class == BioseqseqSet_class_nuc_prot ||
      bssp->_class == BioseqseqSet_class_segset ||
      bssp->_class == BioseqseqSet_class_parts) {
    return;
  }

  if (vsp == NULL) return;
  if (vsp->gcp == NULL) return;

  ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_ImproperlyNestedSets, "Nested sets within Pop/Phy/Mut/Eco/Wgs set");
}
6133 
/* Applies NestedSetProc (via VisitSetsInSep) to every direct member of
 * bssp that is itself a Bioseq-set; userdata is passed through
 * unchanged. */
static void CheckForNestedSets (BioseqSetPtr bssp, Pointer userdata)

{
  SeqEntryPtr  member;

  if (bssp == NULL) return;

  member = bssp->seq_set;
  while (member != NULL) {
    if (IS_Bioseq_set (member)) {
      VisitSetsInSep (member, userdata, NestedSetProc);
    }
    member = member->next;
  }
}
6146 
/* Descriptor callback: userdata is a ValidStructPtr.  Issues a
 * REJECT-level message for a user-object descriptor whose type string
 * is "DBLink" (compared case-insensitively).  Nothing is reported when
 * the validator state or its gather context is missing. */
static void FindDBlinkUserObjectOnBsp (SeqDescrPtr sdp, Pointer userdata)

{
  ValidStructPtr    vsp = (ValidStructPtr) userdata;
  UserObjectPtr     user_obj;
  ObjectIdPtr       type_id;

  if (sdp == NULL) return;
  if (sdp->choice != Seq_descr_user) return;

  user_obj = (UserObjectPtr) sdp->data.ptrvalue;
  if (user_obj == NULL) return;

  type_id = user_obj->type;
  if (type_id == NULL || StringICmp (type_id->str, "DBLink") != 0) return;

  if (vsp == NULL || vsp->gcp == NULL) return;

  ValidErr (vsp, SEV_REJECT, ERR_SEQ_DESCR_DBLinkProblem, "DBLink user object should not be on a Bioseq");
}
6170 
/* Descriptor callback: userdata is a ValidStructPtr.  Issues an
 * ERROR-level message for a user-object descriptor whose type string
 * is "DBLink" (compared case-insensitively).  Nothing is reported when
 * the validator state or its gather context is missing. */
static void FindDBlinkUserObjectInSet (SeqDescrPtr sdp, Pointer userdata)

{
  ValidStructPtr    vsp = (ValidStructPtr) userdata;
  UserObjectPtr     user_obj;
  ObjectIdPtr       type_id;

  if (sdp == NULL) return;
  if (sdp->choice != Seq_descr_user) return;

  user_obj = (UserObjectPtr) sdp->data.ptrvalue;
  if (user_obj == NULL) return;

  type_id = user_obj->type;
  if (type_id == NULL || StringICmp (type_id->str, "DBLink") != 0) return;

  if (vsp == NULL || vsp->gcp == NULL) return;

  ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_DBLinkProblem, "DBLink user object should not be on this set");
}
6194 
/* Runs FindDBlinkUserObjectInSet over the descriptors visited by
 * VisitDescriptorsOnSet for bssp; userdata is a ValidStructPtr.
 * Does nothing when either argument is NULL. */
static void ShouldHaveNoDblink (BioseqSetPtr bssp, Pointer userdata)

{
  ValidStructPtr  vsp = (ValidStructPtr) userdata;

  if (bssp != NULL && vsp != NULL) {
    VisitDescriptorsOnSet (bssp, vsp, FindDBlinkUserObjectInSet);
  }
}
6206 
/* Gather callback validating one Bioseq-set.
 *
 * gcp->userdata is the ValidStructPtr and gcp->thisitem the set.  The
 * validator's current-item fields are re-pointed at the set, the
 * nucleotide / segmented / protein counters are reset and refilled by
 * exploring gcp->sep with ValidateSetContents, and the class-specific
 * checks are dispatched.  Pop/mut/phy/eco sets are additionally
 * checked for a non-blank title and for having more than one
 * component. */
static void ValidateBioseqSet (GatherContextPtr gcp)

{
  BioseqSetPtr    bssp;
  ValidStructPtr  vsp;
  SeqEntryPtr     first_member;
  SeqDescrPtr     descr;
  Boolean         titled = FALSE;

  vsp = (ValidStructPtr) (gcp->userdata);
  bssp = (BioseqSetPtr) (gcp->thisitem);

  /* the set becomes the validator's current item */
  vsp->bssp = bssp;
  vsp->bsp = NULL;
  vsp->descr = NULL;
  vsp->sfp = NULL;

  if (vsp->non_ascii_chars) {   /* non_ascii chars in AsnRead step */
    //LCOV_EXCL_START
    //reader strips non-ascii characters, can't test in regression
    ValidErr (vsp, SEV_ERROR, ERR_GENERIC_NonAsciiAsn, "Non-ascii chars in input ASN.1 strings");
    vsp->non_ascii_chars = FALSE;       /* only do once */
    //LCOV_EXCL_STOP
  }

  vsp->nuccnt = 0;
  vsp->segcnt = 0;
  vsp->protcnt = 0;

  SeqEntryExplore (gcp->sep, (Pointer) vsp, ValidateSetContents);

  switch (bssp->_class) {
  case BioseqseqSet_class_not_set:
      //LCOV_EXCL_START
      //BasicCleanup fixes not-set to genbank
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_BioseqSetClassNotSet, "Bioseq_set class not set");
    break;
    //LCOV_EXCL_STOP

  case BioseqseqSet_class_nuc_prot:
    if (vsp->nuccnt == 0) {
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_PKG_NucProtProblem, "No nucleotides in nuc-prot set");
    }
    if (vsp->protcnt == 0) {
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_PKG_NucProtProblem, "No proteins in nuc-prot set");
    }
    if (vsp->nuccnt > 1 && vsp->segcnt == 0) {
      ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_NucProtProblem, "Multiple unsegmented nucleotides in nuc-prot set");
    }
    ValidateNucProtSet (bssp, vsp);
    break;

  case BioseqseqSet_class_segset:
      //LCOV_EXCL_START
      //segsets are obsolete
    if (vsp->segcnt == 0) {
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_PKG_SegSetProblem, "No segmented Bioseq in segset");
    }
    ValidateSegmentedSet (bssp, vsp);
    break;
    //LCOV_EXCL_STOP

  case BioseqseqSet_class_conset:
    if (vsp->indexerVersion && (! vsp->is_refseq_in_sep)) {
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_PKG_ConSetProblem, "Set class should not be conset");
    }
    break;

  case BioseqseqSet_class_parts:
      //LCOV_EXCL_START
      //segsets are obsolete
    ValidatePartsSet(bssp, vsp);
    break;
    //LCOV_EXCL_STOP

  case BioseqseqSet_class_genbank:
    ValidateGenbankSet (bssp, vsp);
    ShouldHaveNoDblink (bssp, vsp);
    break;

  case BioseqseqSet_class_pop_set:
    ValidatePopSet (bssp, vsp);
    CheckForNestedSets (bssp, vsp);
    ShouldHaveNoDblink (bssp, vsp);
    break;

  case BioseqseqSet_class_mut_set:
    ValidateMutSet (bssp, vsp);
    CheckForNestedSets (bssp, vsp);
    ShouldHaveNoDblink (bssp, vsp);
    break;

  /* these four classes share exactly the same checks */
  case BioseqseqSet_class_phy_set:
  case BioseqseqSet_class_eco_set:
  case BioseqseqSet_class_wgs_set:
  case BioseqseqSet_class_small_genome_set:
    ValidatePhyEcoWgsSet (bssp, vsp);
    CheckForNestedSets (bssp, vsp);
    ShouldHaveNoDblink (bssp, vsp);
    break;

  case BioseqseqSet_class_gen_prod_set:
    ValidateGenProdSet (bssp, vsp);
    break;

  /*
  case BioseqseqSet_class_other:
    ValidErr (vsp, SEV_REJECT, ERR_SEQ_PKG_GenomicProductPackagingProblem, "Genomic product set class incorrectly set to other");
    break;
  */

  default:
    if (vsp->nuccnt == 0 && vsp->protcnt == 0) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_EmptySet, "No Bioseqs in this set");
    }
    break;
  }

  /* the remaining checks apply to pop/mut/phy/eco sets only */
  switch (bssp->_class) {
  case BioseqseqSet_class_pop_set:
  case BioseqseqSet_class_mut_set:
  case BioseqseqSet_class_phy_set:
  case BioseqseqSet_class_eco_set:
    break;
  default:
    return;
  }

  /* a title descriptor only counts when it carries some text */
  for (descr = bssp->descr; descr != NULL && !titled; descr = descr->next) {
    if (descr->choice == Seq_descr_title && !StringHasNoText ((CharPtr) descr->data.ptrvalue)) {
      titled = TRUE;
    }
  }
  if (! titled && (vsp->is_insd_in_sep || vsp->is_refseq_in_sep)) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_MissingSetTitle, "Pop/Phy/Mut/Eco set does not have title");
  }

  first_member = bssp->seq_set;
  if (first_member == NULL) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_EmptySet, "Pop/Phy/Mut/Eco set has no components");
  } else if (first_member->next == NULL) {
    if (VisitAlignmentsInSep (gcp->sep, NULL, NULL) == 0) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_SingleItemSet, "Pop/Phy/Mut/Eco set has only one component and no alignments");
    }
  }
}
6347 
SuppressTrailingXMessage(BioseqPtr bsp)6348 static Boolean SuppressTrailingXMessage (BioseqPtr bsp)
6349 {
6350   ByteStorePtr    bs;
6351   SeqFeatPtr      cds;
6352   Boolean         hasstar;
6353   Int4            len;
6354   MolInfoPtr      mip;
6355   SeqDescrPtr     sdp;
6356   CharPtr         str;
6357 
6358   cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
6359   if (cds != NULL) {
6360     //LCOV_EXCL_START
6361     // bug in C code prevents this from being called
6362     bs = ProteinFromCdRegionEx (cds, TRUE, FALSE);
6363     if (bs != NULL) {
6364       str = BSMerge (bs, NULL);
6365       BSFree (bs);
6366       hasstar = FALSE;
6367       if (str != NULL) {
6368         len = StringLen (str);
6369         if (len > 1 && str[len - 1] == '*') {
6370           hasstar = TRUE;
6371         }
6372       }
6373       MemFree (str);
6374       return hasstar;
6375     }
6376     //LCOV_EXCL_STOP
6377   }
6378   sdp = BioseqGetSeqDescr (bsp, Seq_descr_molinfo, NULL);
6379   if (sdp != NULL) {
6380     mip = (MolInfoPtr) sdp->data.ptrvalue;
6381     if (mip != NULL) {
6382       if (mip->completeness == 4 || mip->completeness == 5)
6383         return TRUE;
6384     }
6385   }
6386   return FALSE;
6387 }
6388 
/* Reports every entry of extra_acc (a ValNode list of strings) that
 * equals accn, compared case-insensitively.  Blank entries are
 * skipped.  Does nothing when vsp or gcp is NULL or accn is blank. */
static void LookForSecondaryConflict (ValidStructPtr vsp, GatherContextPtr gcp, CharPtr accn, ValNodePtr extra_acc)
{
  CharPtr         secondary;
  ValNodePtr      node;

  if (vsp == NULL || gcp == NULL || StringHasNoText (accn)) return;

  node = extra_acc;
  while (node != NULL) {
    secondary = (CharPtr) node->data.ptrvalue;
    if (!StringHasNoText (secondary) && StringICmp (accn, secondary) == 0) {
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadSecondaryAccn, "%s used for both primary and secondary accession", accn);
    }
    node = node->next;
  }
}
6407 
6408 //LCOV_EXCL_START
6409 // Only for SegSets
/* Cross-checks a segmented Bioseq against the parts set that follows
 * it in its parent's member list.
 *
 * Applies only when SeqMgr indexes are in use, the Bioseq is of repr
 * seg with a seq_ext of type 1, and the next sibling entry is a
 * Bioseq-set of class parts.  Two passes over the seq_ext locations:
 *   1. odd positions (1st, 3rd, ...) must not be NULL locations; even
 *      positions, when not NULL and resolvable to a Bioseq, must
 *      resolve to a virtual Bioseq;
 *   2. the non-NULL locations must line up, in order, with the Bioseq
 *      members of the parts set - same count, matching ids.  Only the
 *      first ordering problem is reported. */
static void CheckSegBspAgainstParts (ValidStructPtr vsp, GatherContextPtr gcp, BioseqPtr bsp)
{
  BioseqSetPtr    parts;
  Boolean         at_even;
  BioseqPtr       part;
  SeqEntryPtr     entry;
  SeqIdPtr        loc_id;
  SeqLocPtr       seg;
  BioseqPtr       target;

  if (vsp == NULL || gcp == NULL || bsp == NULL) return;
  if (!vsp->useSeqMgrIndexes) return;

  if (bsp->repr != Seq_repr_seg) return;
  if (bsp->seq_ext_type != 1 || bsp->seq_ext == NULL) return;

  /* the parts set is expected right after the segmented Bioseq */
  entry = bsp->seqentry;
  if (entry == NULL) return;
  entry = entry->next;
  if (entry == NULL || !IS_Bioseq_set (entry)) return;

  parts = (BioseqSetPtr) entry->data.ptrvalue;
  if (parts == NULL || parts->_class != BioseqseqSet_class_parts) return;

  /* pass 1: alternation of real components and NULL/virtual gaps */
  at_even = FALSE;
  for (seg = (ValNodePtr) bsp->seq_ext; seg != NULL; seg = seg->next) {
    if (!at_even) {
      if (seg->choice == SEQLOC_NULL) {
        ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadSegmentedSeq, "Odd segmented component is not expected to be NULL");
      }
    } else if (seg->choice != SEQLOC_NULL) {
      target = BioseqFindFromSeqLoc (seg);
      if (target != NULL && target->repr != Seq_repr_virtual) {
        ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadSegmentedSeq, "Even segmented component is expected to be NULL or VIRTUAL");
      }
    }
    at_even = (! at_even);
  }

  /* pass 2: non-NULL components versus parts set members, in order */
  entry = parts->seq_set;
  for (seg = (ValNodePtr) bsp->seq_ext; seg != NULL; seg = seg->next) {
    if (seg->choice == SEQLOC_NULL) continue;

    if (entry == NULL) {
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_PartsOutOfOrder, "Parts set does not contain enough Bioseqs");
      return;
    }
    if (!IS_Bioseq (entry)) {
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_PartsOutOfOrder, "Parts set component is not Bioseq");
      return;
    }

    part = (BioseqPtr) entry->data.ptrvalue;
    loc_id = SeqLocId (seg);
    if (loc_id != NULL && part != NULL && !SeqIdIn (loc_id, part->id)) {
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_PartsOutOfOrder, "Segmented bioseq seq_ext does not correspond to parts packaging order");
      return;
    }

    entry = entry->next;
  }

  if (entry != NULL) {
    ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_PartsOutOfOrder, "Parts set contains too many Bioseqs");
  }
}
6488 //LCOV_EXCL_STOP
6489 
6490 /*****************************************************************************
6491 *
6492 *   ValidateBioseqHist(gcp)
6493 *      Validate one Bioseq Seq-hist
6494 *
6495 *****************************************************************************/
static void ValidateBioseqHist (GatherContextPtr gcp)

{
  BioseqPtr       bsp;
  BIG_ID          own_gi = 0;
  SeqHistPtr      history;
  SeqIdPtr        id;
  ValidStructPtr  vsp;

  if (gcp == NULL) return;

  vsp = (ValidStructPtr) (gcp->userdata);
  bsp = (BioseqPtr) (gcp->thisitem);

  /* the Bioseq becomes the validator's current item (done even if NULL) */
  vsp->bsp = bsp;
  vsp->descr = NULL;
  vsp->sfp = NULL;
  vsp->bssp = (BioseqSetPtr) (gcp->parentitem);
  vsp->bsp_partial_val = 0;

  if (bsp == NULL) return;
  history = bsp->hist;
  if (history == NULL) return;

  /* if several gi ids are present the last one wins */
  for (id = bsp->id; id != NULL; id = id->next) {
    if (id->choice == SEQID_GI) {
      own_gi = (BIG_ID) id->data.intvalue;
    }
  }
  if (own_gi == 0) return;

  /* a record must not claim to be replaced by itself ... */
  if (history->replaced_by_ids != NULL && history->replaced_by_date != NULL) {
    for (id = history->replaced_by_ids; id != NULL; id = id->next) {
      if (id->choice == SEQID_GI && own_gi == (BIG_ID) id->data.intvalue) {
        ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_HistoryGiCollision, "Replaced by gi (%ld) is same as current Bioseq", (long) own_gi);
      }
    }
  }

  /* ... nor to replace itself */
  if (history->replace_ids != NULL && history->replace_date != NULL) {
    for (id = history->replace_ids; id != NULL; id = id->next) {
      if (id->choice == SEQID_GI && own_gi == (BIG_ID) id->data.intvalue) {
        ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_HistoryGiCollision, "Replaces gi (%ld) is same as current Bioseq", (long) own_gi);
      }
    }
  }
}
6547 
/*****************************************************************************
*
*   ValidateBioseqInst(gcp)
*      Validate one Bioseq Seq-inst
*
*      The function itself is defined further down in this file, after a
*      group of static helper routines and tables.
*
*****************************************************************************/
/*
 * IsTpa
 *   Decides from the Seq-id types on bsp whether the record is treated as
 *   TPA.  has_tpa_assembly is supplied by the caller and only matters for
 *   BankIt, TMSMART and local-only ids.  When isRefSeqP is not NULL it is set
 *   to TRUE if any id is SEQID_OTHER; it is never cleared here.
 *   Returns FALSE for a NULL Bioseq or a Bioseq without ids.
 */
static Boolean IsTpa (
  BioseqPtr bsp,
  Boolean has_tpa_assembly,
  BoolPtr isRefSeqP
)

{
  Uint1     choice;
  DbtagPtr  tag;
  SeqIdPtr  id;
  Boolean   saw_bankit = FALSE;
  Boolean   saw_genbank = FALSE;
  Boolean   saw_gi = FALSE;
  Boolean   saw_local = FALSE;
  Boolean   saw_refseq = FALSE;
  Boolean   saw_smart = FALSE;
  Boolean   saw_tpa = FALSE;

  if (bsp == NULL || bsp->id == NULL) return FALSE;

  /* first pass: note which kinds of id are present */
  for (id = bsp->id; id != NULL; id = id->next) {
    choice = id->choice;
    if (choice == SEQID_LOCAL) {
      saw_local = TRUE;
    } else if (choice == SEQID_GENBANK || choice == SEQID_EMBL || choice == SEQID_DDBJ) {
      saw_genbank = TRUE;
    } else if (choice == SEQID_OTHER) {
      saw_refseq = TRUE;
      if (isRefSeqP != NULL) {
        *isRefSeqP = TRUE;
      }
    } else if (choice == SEQID_GI) {
      saw_gi = TRUE;
    } else if (choice == SEQID_TPG || choice == SEQID_TPE || choice == SEQID_TPD) {
      saw_tpa = TRUE;
    } else if (choice == SEQID_GENERAL) {
      tag = (DbtagPtr) id->data.ptrvalue;
      if (tag != NULL) {
        if (StringICmp (tag->db, "BankIt") == 0) {
          saw_bankit = TRUE;
        }
        if (StringICmp (tag->db, "TMSMART") == 0) {
          saw_smart = TRUE;
        }
      }
    }
    /* SEQID_GPIPE and every other id type do not influence the result */
  }

  /* second pass: the order of these tests is the precedence of the id types */
  if (saw_genbank) return FALSE;
  if (saw_tpa) return TRUE;
  if (saw_refseq) return FALSE;
  if (has_tpa_assembly && (saw_bankit || saw_smart)) return TRUE;
  if (saw_gi) return FALSE;
  if (saw_local && has_tpa_assembly) return TRUE;

  return FALSE;
}
6624 
/*
 * ValidateIDSetAgainstDb
 *   When vsp->validateIDSet is set, compares the gi, GenBank accession and
 *   general id on bsp with the id set returned by GetSeqIdSetForGI for the
 *   record's gi, and reports ERR_SEQ_INST_UnexpectedIdentifierChange for a
 *   changed gi (error) or for a changed, gained or lost accession or general
 *   id (warning).  If the record carries no gi, one is looked up from its
 *   GenBank id.  Nothing is reported when no gi can be determined or when the
 *   lookup returns no ids.
 */
static void ValidateIDSetAgainstDb (GatherContextPtr gcp, ValidStructPtr vsp, BioseqPtr bsp)

{
  SeqIdPtr  id;
  SeqIdPtr  db_ids;
  SeqIdPtr  rec_gb = NULL;
  SeqIdPtr  db_gb = NULL;
  DbtagPtr  rec_general = NULL;
  DbtagPtr  db_general = NULL;
  BIG_ID    rec_gi = 0;
  BIG_ID    db_gi = 0;
  Char      db_label [128], rec_label [128];

  if (gcp == NULL || vsp == NULL || bsp == NULL) return;
  if (! vsp->validateIDSet) return;

  /* identifiers carried by the record being validated */
  for (id = bsp->id; id != NULL; id = id->next) {
    if (id->choice == SEQID_GENBANK) {
      rec_gb = id;
    } else if (id->choice == SEQID_GI) {
      rec_gi = (BIG_ID) id->data.intvalue;
    } else if (id->choice == SEQID_GENERAL) {
      rec_general = (DbtagPtr) id->data.ptrvalue;
    }
  }
  if (rec_gi == 0 && rec_gb != NULL) {
    rec_gi = GetGIForSeqId (rec_gb);
  }
  if (rec_gi <= 0) return;

  /* identifiers returned by the lookup for the same gi */
  db_ids = GetSeqIdSetForGI (rec_gi);
  db_label [0] = '\0';
  rec_label [0] = '\0';
  for (id = db_ids; id != NULL; id = id->next) {
    if (id->choice == SEQID_GI) {
      db_gi = (BIG_ID) id->data.intvalue;
    } else if (id->choice == SEQID_GENBANK) {
      db_gb = id;
    } else if (id->choice == SEQID_GENERAL) {
      db_general = (DbtagPtr) id->data.ptrvalue;
    }
  }

  if (db_ids != NULL) {
    if (db_gi != rec_gi) {
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_UnexpectedIdentifierChange, "New gi number (%ld) does not match one in NCBI sequence repository (%ld)", (long) rec_gi, (long) db_gi);
    }

    /* GenBank accession: lost, gained, or changed */
    if (rec_gb == NULL) {
      if (db_gb != NULL) {
        SeqIdWrite (db_gb, db_label, PRINTID_FASTA_SHORT, sizeof (db_label));
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_UnexpectedIdentifierChange, "Loss of accession (%s) on gi (%ld) compared to the NCBI sequence repository", db_label, (long) rec_gi);
      }
    } else if (db_gb == NULL) {
      SeqIdWrite (rec_gb, rec_label, PRINTID_FASTA_SHORT, sizeof (rec_label));
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_UnexpectedIdentifierChange, "Gain of accession (%s) on gi (%ld) compared to the NCBI sequence repository", rec_label, (long) rec_gi);
    } else if (! SeqIdMatch (rec_gb, db_gb)) {
      SeqIdWrite (db_gb, db_label, PRINTID_FASTA_SHORT, sizeof (db_label));
      SeqIdWrite (rec_gb, rec_label, PRINTID_FASTA_SHORT, sizeof (rec_label));
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_UnexpectedIdentifierChange, "New accession (%s) does not match one in NCBI sequence repository (%s) on gi (%ld)", rec_label, db_label, (long) rec_gi);
    }

    /* general id: lost, gained, or changed */
    if (rec_general == NULL) {
      if (db_general != NULL) {
        DbtagLabel (db_general, db_label, sizeof (db_label));
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_UnexpectedIdentifierChange, "Loss of general ID (%s) on gi (%ld) compared to the NCBI sequence repository", db_label, (long) rec_gi);
      }
    } else if (db_general == NULL) {
      DbtagLabel (rec_general, rec_label, sizeof (rec_label));
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_UnexpectedIdentifierChange, "Gain of general ID (%s) on gi (%ld) compared to the NCBI sequence repository", rec_label, (long) rec_gi);
    } else if (! DbtagMatch (rec_general, db_general)) {
      DbtagLabel (db_general, db_label, sizeof (db_label));
      DbtagLabel (rec_general, rec_label, sizeof (rec_label));
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_UnexpectedIdentifierChange, "New general ID (%s) does not match one in NCBI sequence repository (%s) on gi (%ld)", rec_label, db_label, (long) rec_gi);
    }
  }

  /* db_gb and db_general point into db_ids, so release it last */
  SeqIdSetFree (db_ids);
}
6713 
6714 typedef struct enrun {
6715   GatherContextPtr  gcp;
6716   ValidStructPtr    vsp;
6717   Int4              ncount;
6718   Int4              maxrun;
6719   Int4              seqpos;
6720   Int4              gapcount;
6721   Boolean           showAll;
6722   Boolean           inNrun;
6723   Boolean           isWGS;
6724 } RunOfNs, PNTR RunOfNsPtr;
6725 
CountAdjacentProc(CharPtr sequence,Pointer userdata)6726 static void LIBCALLBACK CountAdjacentProc (CharPtr sequence, Pointer userdata)
6727 
6728 {
6729   Char              ch;
6730   GatherContextPtr  gcp;
6731   RunOfNsPtr        ronp;
6732   CharPtr           str;
6733   ValidStructPtr    vsp;
6734 
6735   ronp = (RunOfNsPtr) userdata;
6736   if (sequence == NULL || ronp == NULL) return;
6737 
6738   str = sequence;
6739   ch = *str;
6740   while (ch != '\0') {
6741     (ronp->seqpos)++;
6742     if (ch == 'N') {
6743       (ronp->ncount)++;
6744       if (ronp->ncount > ronp->maxrun) {
6745         ronp->maxrun = ronp->ncount;
6746       }
6747       ronp->inNrun = TRUE;
6748     } else {
6749       if (ch == '-') {
6750         (ronp->gapcount)++;
6751       }
6752       if (ronp->inNrun && ronp->showAll && ronp->isWGS && ronp->ncount >= 20 && ronp->seqpos > ronp->ncount + 1) {
6753         vsp = ronp->vsp;
6754         gcp = ronp->gcp;
6755         ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqRaw, "Run of %ld Ns in raw sequence starting at base %ld",
6756                   (long) ronp->ncount, (long) (ronp->seqpos - ronp->ncount));
6757       } else if (ronp->inNrun && ronp->showAll && ronp->ncount >= 100 && ronp->seqpos > ronp->ncount + 1) {
6758         vsp = ronp->vsp;
6759         gcp = ronp->gcp;
6760         ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqRaw, "Run of %ld Ns in raw sequence starting at base %ld",
6761                   (long) ronp->ncount, (long) (ronp->seqpos - ronp->ncount));
6762       }
6763       ronp->ncount = 0;
6764       ronp->inNrun = FALSE;
6765     }
6766     str++;
6767     ch = *str;
6768   }
6769 }
6770 
6771 
/*
 * CountAdjacentNsInInterval
 *   Returns the length of the longest run of 'N' within positions from..to
 *   (plus strand) of bsp, streaming with STREAM_EXPAND_GAPS.  Reporting is
 *   disabled (showAll is FALSE), so nothing is written to the validator.
 *   Returns 0 for a NULL or protein Bioseq, for an invalid interval, or when
 *   the temporary location cannot be created.
 *
 *   gcp may be NULL: it is only stored in the streaming state and is not
 *   needed because no messages are produced.
 */
static Int4 CountAdjacentNsInInterval (GatherContextPtr gcp, BioseqPtr bsp, Int4 from, Int4 to)
{
  SeqLocPtr slp;
  RunOfNs   ron;

  if (bsp == NULL || from < 0 || to < from || ISA_aa (bsp->mol)) {
    return 0;
  }

  slp = SeqLocIntNew (from, to, Seq_strand_plus, bsp->id);
  if (slp == NULL) {
    return 0;
  }

  ron.gcp = gcp;
  /* previously gcp was dereferenced unconditionally, crashing on a NULL context */
  ron.vsp = (gcp != NULL) ? (ValidStructPtr) (gcp->userdata) : NULL;
  ron.ncount = 0;
  ron.maxrun = 0;
  ron.seqpos = 0;
  ron.gapcount = 0;
  ron.showAll = FALSE;
  ron.inNrun = FALSE;
  ron.isWGS = FALSE;
  SeqPortStreamLoc (slp, STREAM_EXPAND_GAPS, (Pointer) &ron, CountAdjacentProc);
  slp = SeqLocFree (slp);
  return ron.maxrun;
}
6795 
6796 
HasUnparsedBrackets(CharPtr title)6797 static Boolean HasUnparsedBrackets (CharPtr title)
6798 
6799 {
6800   CharPtr  str;
6801 
6802   if (StringHasNoText (title)) return FALSE;
6803 
6804   str = StringChr (title, '[');
6805   if (str == NULL) return FALSE;
6806   str = StringChr (str, '=');
6807   if (str == NULL) return FALSE;
6808   str = StringChr (str, ']');
6809   if (str == NULL) return FALSE;
6810   return TRUE;
6811 }
6812 
GetSequencePlusGapByFeature(SeqFeatPtr sfp)6813 static CharPtr GetSequencePlusGapByFeature (SeqFeatPtr sfp)
6814 
6815 {
6816   Int4     len;
6817   CharPtr  str = NULL;
6818 
6819   if (sfp == NULL) return NULL;
6820   len = SeqLocLen (sfp->location);
6821   if (len > 0 && len < MAXALLOC) {
6822     str = MemNew (sizeof (Char) * (len + 2));
6823     if (str != NULL) {
6824       SeqPortStreamLoc (sfp->location, EXPAND_GAPS_TO_DASHES, (Pointer) str, NULL);
6825     }
6826   }
6827 
6828   return str;
6829 }
6830 
6831 typedef struct reusedata {
6832   CharPtr  seqidstr;
6833   Int4     from;
6834   Int4     to;
6835 } ReuseData, PNTR ReuseDataPtr;
6836 
SortVnpByDeltaLoc(VoidPtr ptr1,VoidPtr ptr2)6837 static int LIBCALLBACK SortVnpByDeltaLoc (VoidPtr ptr1, VoidPtr ptr2)
6838 
6839 {
6840   int           compare;
6841   ReuseDataPtr  rdp1, rdp2;
6842   ValNodePtr    vnp1, vnp2;
6843 
6844   if (ptr1 == NULL || ptr2 == NULL) return 0;
6845   vnp1 = *((ValNodePtr PNTR) ptr1);
6846   vnp2 = *((ValNodePtr PNTR) ptr2);
6847   if (vnp1 == NULL || vnp2 == NULL) return 0;
6848   rdp1 = (ReuseDataPtr) vnp1->data.ptrvalue;
6849   rdp2 = (ReuseDataPtr) vnp2->data.ptrvalue;
6850   if (rdp1 == NULL || rdp2 == NULL) return 0;
6851 
6852   compare = StringICmp (rdp1->seqidstr, rdp2->seqidstr);
6853   if (compare > 0) {
6854     return 1;
6855   } else if (compare < 0) {
6856     return -1;
6857   }
6858 
6859   if (rdp1->from > rdp2->from) {
6860     return 1;
6861   } else if (rdp1->from < rdp2->from) {
6862     return -1;
6863   }
6864 
6865   if (rdp1->to > rdp2->to) {
6866     return 1;
6867   } else if (rdp1->to < rdp2->to) {
6868     return -1;
6869   }
6870 
6871   return 0;
6872 }
6873 
/*
 * CheckDeltaForReuse
 *   Collects every Seq-int location among the delta components of bsp, sorts
 *   them by Seq-id, from and to, and warns with
 *   ERR_SEQ_INST_OverlappingDeltaRange when two neighbouring entries of the
 *   sorted list refer to the same Seq-id and their ranges intersect.
 *   Positions are printed one-based.  Only adjacent entries of the sorted
 *   list are compared.
 */
static void CheckDeltaForReuse (ValidStructPtr vsp, GatherContextPtr gcp, BioseqPtr bsp)

{
  ValNodePtr    component;
  ReuseDataPtr  entry;
  Char          label [128];
  ValNodePtr    list = NULL;
  ValNodePtr    node;
  ReuseDataPtr  prev = NULL;
  SeqIntPtr     sintp;
  SeqLocPtr     slp;
  ValNodePtr    tail = NULL;

  if (vsp == NULL || gcp == NULL || bsp == NULL) return;

  /* gather one ReuseData per usable Seq-int component */
  for (component = (ValNodePtr) bsp->seq_ext; component != NULL; component = component->next) {
    if (component->choice != 1) continue;  /* only choice 1 components carry a location here */
    slp = (SeqLocPtr) component->data.ptrvalue;
    if (slp == NULL || slp->choice != SEQLOC_INT) continue;
    sintp = (SeqIntPtr) slp->data.ptrvalue;
    if (sintp == NULL || sintp->id == NULL) continue;
    if (! SeqIdWrite (sintp->id, label, PRINTID_FASTA_SHORT, sizeof (label) - 1)) continue;

    entry = (ReuseDataPtr) MemNew (sizeof (ReuseData));
    if (entry == NULL) continue;
    entry->seqidstr = StringSave (label);
    entry->from = sintp->from;
    entry->to = sintp->to;

    /* append at the tail, remembering the first node as the list head */
    node = ValNodeAddPointer (&tail, 0, (Pointer) entry);
    if (list == NULL) {
      list = node;
    }
    tail = node;
  }

  if (list == NULL) return;

  list = ValNodeSort (list, SortVnpByDeltaLoc);

  /* after sorting, intersecting ranges on the same id sit next to each other */
  for (node = list; node != NULL; node = node->next) {
    entry = (ReuseDataPtr) node->data.ptrvalue;
    if (entry == NULL) continue;
    if (prev != NULL
        && StringICmp (prev->seqidstr, entry->seqidstr) == 0
        && prev->to >= entry->from
        && prev->from <= entry->to) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_OverlappingDeltaRange,
                "Overlapping delta range %ld-%ld and %ld-%ld on a Bioseq %s",
                (long) entry->from + 1, (long) entry->to + 1, (long) prev->from + 1,
                (long) prev->to + 1, entry->seqidstr);
    }
    prev = entry;
  }

  /* release the id strings first, then the nodes together with their payloads */
  for (node = list; node != NULL; node = node->next) {
    entry = (ReuseDataPtr) node->data.ptrvalue;
    if (entry != NULL) {
      entry->seqidstr = MemFree (entry->seqidstr);
    }
  }
  ValNodeFreeData (list);
}
6938 
6939 static CharPtr legal_refgene_status_strings [] = {
6940   "Inferred",
6941   "Provisional",
6942   "Predicted",
6943   "Validated",
6944   "Reviewed",
6945   "Model",
6946   "WGS",
6947   "Pipeline",
6948   NULL
6949 };
6950 
6951 
/*
 * ReportLongSeqId
 *   Warns with ERR_SEQ_INST_BadSeqIdFormat when the PRINTID_FASTA_SHORT label
 *   of sip is longer than max_len characters.  Does nothing for a NULL id,
 *   a NULL validator, or an id for which IsNCBIFileID returns TRUE.
 */
static void ReportLongSeqId (SeqIdPtr sip, ValidStructPtr vsp, Int4 max_len)
{
  Int4     id_len;
  CharPtr  id_txt;

  if (sip == NULL || vsp == NULL || IsNCBIFileID (sip)) {
    return;
  }

  id_len = SeqIdLabelLen (sip, PRINTID_FASTA_SHORT);
  if (id_len <= max_len) {
    return;
  }

  id_txt = SeqIdWholeLabel (sip, PRINTID_FASTA_SHORT);
  /*
   * "%d" requires an int argument, so the Int4 is cast explicitly rather
   * than relying on Int4 being int on every platform; a NULL label is
   * replaced by an empty string because "%s" must not be given NULL.
   */
  ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_BadSeqIdFormat, "Sequence ID is unusually long (%d): %s",
            (int) id_len, (id_txt != NULL) ? id_txt : "");
  id_txt = MemFree (id_txt);
}
6969 
6970 
SequenceHasGaps(BioseqPtr bsp)6971 static Boolean SequenceHasGaps (BioseqPtr bsp)
6972 {
6973   SeqMgrFeatContext context;
6974   SeqFeatPtr sfp;
6975 
6976   if (bsp == NULL) {
6977     return FALSE;
6978   }
6979   sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_gap, &context);
6980   if (sfp == NULL) {
6981     return FALSE;
6982   } else {
6983     return TRUE;
6984   }
6985 }
6986 
IsConWithGaps(BioseqPtr bsp)6987 static Boolean IsConWithGaps (BioseqPtr bsp)
6988 
6989 {
6990   DeltaSeqPtr  dsp;
6991   SeqLitPtr    litp;
6992 
6993   if (bsp->repr != Seq_repr_delta) return FALSE;
6994   if (bsp->seq_ext_type != 4) return FALSE;
6995   if (DeltaLitOnly (bsp)) return FALSE;
6996 
6997   for (dsp = (DeltaSeqPtr) bsp->seq_ext; dsp; dsp=dsp->next) {
6998     if (dsp->choice != 2) continue;
6999     litp = (SeqLitPtr) dsp->data.ptrvalue;
7000     if (litp == NULL) continue;
7001     if ((litp->seq_data == NULL || litp->seq_data_type == Seq_code_gap) && litp->length > 0) return TRUE;
7002   }
7003 
7004   return FALSE;
7005 }
7006 
7007 
CheckBioseqEndsForNAndGap(BioseqPtr bsp,Uint1Ptr begin_n,Uint1Ptr begin_gap,Uint1Ptr end_n,Uint1Ptr end_gap)7008 NLM_EXTERN void CheckBioseqEndsForNAndGap (BioseqPtr bsp, Uint1Ptr begin_n, Uint1Ptr begin_gap, Uint1Ptr end_n, Uint1Ptr end_gap)
7009 {
7010   SeqFeatPtr sfp;
7011   CharPtr    str;
7012   Int4       len;
7013 
7014   if (begin_n != NULL) {
7015     *begin_n = eEndIsChar_No;
7016   }
7017   if (begin_gap != NULL) {
7018     *begin_gap = eEndIsChar_No;
7019   }
7020   if (end_n != NULL) {
7021     *end_n = eEndIsChar_No;
7022   }
7023   if (end_gap != NULL) {
7024     *end_gap = eEndIsChar_No;
7025   }
7026   if (bsp == NULL) {
7027     return;
7028   }
7029   if (ISA_na (bsp->mol)
7030       && (bsp->repr == Seq_repr_raw || (bsp->repr == Seq_repr_delta && DeltaLitOnly (bsp)))
7031       && bsp->length > 10 && bsp->topology != 2) {
7032     /* check for N bases at start or stop of sequence */
7033     sfp = (SeqFeatPtr) MemNew (sizeof (SeqFeat));
7034     if (sfp == NULL) return;
7035     sfp->data.choice = SEQFEAT_COMMENT;
7036 
7037     sfp->location = AddIntervalToLocation (NULL, bsp->id, 0, 9, FALSE, FALSE);
7038     str = GetSequencePlusGapByFeature (sfp);
7039     if (str != NULL) {
7040       if (str [0] == 'n' || str [0] == 'N' && begin_n != NULL) {
7041         if (StringICmp (str, "NNNNNNNNNN") == 0) {
7042           *begin_n = eEndIsChar_All;
7043         } else {
7044           *begin_n = eEndIsChar_Last;
7045         }
7046       } else if (str [0] == '-' && begin_gap != NULL) {
7047         if (StringICmp (str, "----------") == 0) {
7048           *begin_gap = eEndIsChar_All;
7049         } else {
7050           *begin_gap = eEndIsChar_Last;
7051         }
7052       }
7053     }
7054     MemFree (str);
7055     sfp->location = SeqLocFree (sfp->location);
7056 
7057     sfp->location = AddIntervalToLocation (NULL, bsp->id, bsp->length - 10, bsp->length - 1, FALSE, FALSE);
7058     str = GetSequencePlusGapByFeature (sfp);
7059     len = StringLen (str);
7060     if (str != NULL && len > 0) {
7061       if (str [len - 1] == 'n' || str [len - 1] == 'N' && end_n != NULL) {
7062         if (StringICmp (str, "NNNNNNNNNN") == 0) {
7063           *end_n = eEndIsChar_All;
7064         } else {
7065           *end_n = eEndIsChar_Last;
7066         }
7067       } else if (str [len - 1] == '-' && end_gap != NULL) {
7068         if (StringICmp (str, "----------") == 0) {
7069           *end_gap = eEndIsChar_All;
7070         } else {
7071           *end_gap = eEndIsChar_Last;
7072         }
7073       }
7074     }
7075 
7076     MemFree (str);
7077     sfp->location = SeqLocFree (sfp->location);
7078 
7079     MemFree (sfp);
7080   }
7081 
7082 }
7083 
7084 
GetBioseqEndWarning(Boolean isNC,Boolean isPatent,Boolean only_local,BioseqPtr bsp,Uint1 end_is_char)7085 static ErrSev GetBioseqEndWarning (Boolean isNC, Boolean isPatent, Boolean only_local, BioseqPtr bsp, Uint1 end_is_char)
7086 {
7087   ErrSev sev;
7088 
7089   if (isNC || isPatent) {
7090     sev = SEV_WARNING;
7091   } else if (bsp->topology == TOPOLOGY_CIRCULAR) {
7092     sev = SEV_WARNING;
7093   } else if (only_local) {
7094     sev = SEV_WARNING;
7095   } else if (end_is_char == eEndIsChar_All) {
7096     sev = SEV_ERROR;
7097   } else {
7098     sev = SEV_WARNING;
7099   }
7100   return sev;
7101 }
7102 
7103 
IsAllNsProc(CharPtr sequence,Pointer userdata)7104 static void LIBCALLBACK IsAllNsProc (CharPtr sequence, Pointer userdata)
7105 
7106 {
7107   Int4    n_len;
7108   BoolPtr pIsAllNs;
7109 
7110   pIsAllNs = (BoolPtr) userdata;
7111   if (sequence == NULL || pIsAllNs == NULL) return;
7112 
7113   n_len = StringSpn (sequence, "N");
7114   if (StringLen (sequence) != n_len) {
7115     *pIsAllNs = FALSE;
7116   }
7117 }
7118 
7119 
IsSequenceAllNs(BioseqPtr bsp)7120 static Boolean IsSequenceAllNs (BioseqPtr bsp)
7121 {
7122   Boolean rval = TRUE;
7123   ErrSev            logsev;
7124   ErrSev            msgsev;
7125 
7126   if (bsp == NULL || bsp->repr == Seq_repr_virtual || bsp->repr == Seq_repr_map) {
7127     return FALSE;
7128   }
7129   msgsev = ErrSetMessageLevel (SEV_MAX);
7130   logsev = ErrSetLogLevel (SEV_MAX);
7131   SeqPortStream (bsp, STREAM_EXPAND_GAPS, (Pointer) &rval, IsAllNsProc);
7132   ErrSetLogLevel (logsev);
7133   ErrSetMessageLevel (msgsev);
7134   return rval;
7135 }
7136 
7137 
/*
 * ValidateBioseqEnds
 *   Reports problems at the ends of a nucleotide Bioseq:
 *     - ERR_SEQ_INST_AllNs when IsSequenceAllNs is TRUE (nothing else is
 *       checked in that case);
 *     - ERR_SEQ_INST_TerminalNs / ERR_SEQ_INST_TerminalGap for N or gap at
 *       the beginning and at the end, N taking precedence over gap.
 *   The severity of the terminal messages comes from GetBioseqEndWarning and
 *   depends on whether the record is an NC_ RefSeq, a patent, circular, or
 *   carries only local ids.  Protein and NULL Bioseqs are ignored.
 */
static void ValidateBioseqEnds (BioseqPtr bsp, ValidStructPtr vsp, Boolean isPatent)
{
  Uint1   begin_n, begin_gap, end_n, end_gap;
  ErrSev  sev;
  Boolean only_local = TRUE;
  Boolean isNC = FALSE;
  SeqIdPtr sip1;
  TextSeqIdPtr tsip;

  if (bsp == NULL || ISA_aa(bsp->mol)) {
    return;
  }

  if (IsSequenceAllNs(bsp)) {
    ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_AllNs, "Sequence is all Ns");
    return;
  }

  CheckBioseqEndsForNAndGap (bsp, &begin_n, &begin_gap, &end_n, &end_gap);

  for (sip1 = bsp->id; sip1 != NULL; sip1 = sip1->next) {
    if (sip1->choice != SEQID_LOCAL) {
      only_local = FALSE;
    }
    /*
     * This test used to hang off the else of the check above, where the
     * choice is always SEQID_LOCAL, so it could never succeed and isNC
     * stayed FALSE for every record.
     */
    if (sip1->choice == SEQID_OTHER) {
      tsip = (TextSeqIdPtr) sip1->data.ptrvalue;
      if (tsip != NULL && tsip->accession != NULL && StringNICmp (tsip->accession, "NC_", 3) == 0) {
        isNC = TRUE;
      }
    }
  }

  if (begin_n != eEndIsChar_No) {
    sev = GetBioseqEndWarning(isNC, isPatent, only_local, bsp, begin_n);
    ValidErr (vsp, sev, ERR_SEQ_INST_TerminalNs, "N at beginning of sequence");
  } else if (begin_gap != eEndIsChar_No) {
    sev = GetBioseqEndWarning(isNC, isPatent, only_local, bsp, begin_gap);
    ValidErr (vsp, sev, ERR_SEQ_INST_TerminalGap, "Gap at beginning of sequence");
  }

  if (end_n != eEndIsChar_No) {
    sev = GetBioseqEndWarning(isNC, isPatent, only_local, bsp, end_n);
    ValidErr (vsp, sev, ERR_SEQ_INST_TerminalNs, "N at end of sequence");
  } else if (end_gap != eEndIsChar_No) {
    sev = GetBioseqEndWarning(isNC, isPatent, only_local, bsp, end_gap);
    ValidErr (vsp, sev, ERR_SEQ_INST_TerminalGap, "Gap at end of sequence");
  }

}
7186 
7187 
s_IsInNucProtSet(BioseqPtr bsp)7188 static Boolean s_IsInNucProtSet (BioseqPtr bsp)
7189 {
7190   BioseqSetPtr bssp;
7191 
7192   if (bsp == NULL
7193       || bsp->idx.parenttype != OBJ_BIOSEQSET
7194       || (bssp = (BioseqSetPtr)bsp->idx.parentptr) == NULL
7195       || bssp->_class != BioseqseqSet_class_nuc_prot) {
7196     return FALSE;
7197   } else {
7198     return TRUE;
7199   }
7200 }
7201 
7202 
7203 static CharPtr linkEvStrings [] = {
7204   "paired-ends",
7205   "align genus",
7206   "align xgenus",
7207   "align trnscpt",
7208   "within clone",
7209   "clone contig",
7210   "map",
7211   "strobe",
7212   "unspecified",
7213   "pcr",
7214   "other",
7215   "UNKNOWN VALUE",
7216   NULL
7217 };
7218 
/*
 * CheckForBadSeqIdChars
 *   Emits one ERR_SEQ_INST_BadSeqIdFormat warning for every '|' or ','
 *   found in id.  Does nothing when vsp or id is NULL.
 */
static void CheckForBadSeqIdChars (ValidStructPtr vsp, CharPtr id)

{
  CharPtr  cp;

  if (vsp == NULL || id == NULL) return;

  for (cp = id; *cp != '\0'; cp++) {
    if (*cp != '|' && *cp != ',') continue;
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_BadSeqIdFormat, "Bad character '%c' in accession '%s'", *cp , id);
  }
}
7237 
ValidateBioseqInst(GatherContextPtr gcp)7238 static void ValidateBioseqInst (GatherContextPtr gcp)
7239 {
7240   Boolean         retval = TRUE;
7241   Int2            i, start_at, num;
7242   Boolean         errors[4], check_alphabet;
7243   static char    *repr[8] = {
7244     "virtual", "raw", "segmented", "constructed",
7245     "reference", "consensus", "map", "delta"
7246   };
7247   /*
7248   SeqPortPtr      spp;
7249   */
7250   Int2            residue, x, termination, gapchar;
7251   Boolean         gapatstart;
7252   Int4            len, divisor = 1, len2, len3;
7253   ValNode         head, vn;
7254   ValNodePtr      vnp, idlist;
7255   BioseqContextPtr bcp;
7256   Boolean         got_partial, is_invalid;
7257   int             seqtype, terminations, dashes;
7258   ValidStructPtr  vsp;
7259   BioseqPtr       bsp, bsp2;
7260   SeqIdPtr        sip1, sip2, sip3;
7261   SeqLocPtr       slp;
7262   SeqIntPtr       sintp;
7263   Char            buf1[128], buf2[128];
7264   SeqLitPtr       slitp;
7265   SeqGapPtr       sgp;
7266   SeqCodeTablePtr sctp;
7267   MolInfoPtr      mip = NULL;
7268   BioSourcePtr    biop = NULL;
7269   OrgRefPtr       orp;
7270   SeqMgrDescContext context;
7271   SeqFeatPtr      cds;
7272   CdRegionPtr     crp;
7273   GBBlockPtr      gbp;
7274   GeneRefPtr      grp;
7275   SeqFeatPtr      gene;
7276   SeqMgrFeatContext genectxt;
7277   CharPtr         genelbl = NULL;
7278   SeqFeatPtr      prot;
7279   SeqMgrFeatContext protctxt;
7280   CharPtr         protlbl = NULL;
7281   TextSeqIdPtr    tsip;
7282   CharPtr         ptr, last, str, title, buf, bufplus;
7283   Uint1           lastchoice;
7284   Char            ch;
7285   Boolean         multitoken;
7286   Boolean         hasGi = FALSE;
7287   SeqHistPtr      hist;
7288   Boolean         hist_asm_missing = FALSE;
7289   IntFuzzPtr      ifp;
7290   Boolean         in_gap;
7291   Boolean         in_N;
7292   Boolean         in_nps;
7293   Boolean         isActiveFin = FALSE;
7294   Boolean         isDDBJ = FALSE;
7295   Boolean         isDraft = FALSE;
7296   Boolean         isEMBL = FALSE;
7297   Boolean         isFullTop = FALSE;
7298   Boolean         isGB = FALSE;
7299   Boolean         isGIBBMT = FALSE;
7300   Boolean         isGIBBSQ = FALSE;
7301   Boolean         is_grc_title = FALSE;
7302   Boolean         isPatent = FALSE;
7303   Boolean         isPDB = FALSE;
7304   Boolean         isPreFin = FALSE;
7305   Boolean         isNG = FALSE;
7306   Boolean         isNTorNC = FALSE;
7307   Boolean         isNZ;
7308   Boolean         is_gps = FALSE;
7309   Boolean         isRefSeq = FALSE;
7310   Boolean         isSwissProt = FALSE;
7311   Boolean         isWP = FALSE;
7312   Boolean         isYP = FALSE;
7313   Boolean         is_assembly = FALSE;
7314   Boolean         is_genome_assembly = FALSE;
7315   Boolean         is_finished_status = FALSE;
7316   Boolean         is_unspec;
7317   Boolean         this_is_gen_asm;
7318   Boolean         isLRG = FALSE;
7319   ValNodePtr      keywords;
7320   Boolean         last_is_gap;
7321   Boolean         non_interspersed_gaps;
7322   Int2            num_adjacent_gaps;
7323   Int2            num_gaps;
7324   Int2            num_gap_known_or_spec;
7325   Int2            num_gap_unknown_unspec;
7326   Boolean         reportFastaBracket;
7327   SeqEntryPtr     sep;
7328   ErrSev          sev;
7329   DbtagPtr        dbt;
7330   SeqIdPtr        sip;
7331   Int2            trailingX = 0;
7332   Int2            numletters, numdigits, numunderscores;
7333   Boolean         letterAfterDigit, badIDchars, internalS;
7334   EMBLBlockPtr    ebp;
7335   SeqDescrPtr     sdp;
7336   SeqMgrDescContext dcontext;
7337   Uint2           oldEntityID, oldItemtype;
7338   Uint4           oldItemID;
7339   size_t          buflen = 1001;
7340   ItemInfo        ii;
7341   Uint1           tech;
7342   Uint2           olditemtype = 0;
7343   Uint4           olditemid = 0;
7344   ObjValNodePtr   ovp;
7345   BioseqSetPtr    bssp;
7346   UserObjectPtr   uop;
7347   UserFieldPtr    ufp;
7348   ObjectIdPtr     oip;
7349   Boolean         hasRefGeneTracking = FALSE;
7350   Boolean         hasRefTrackStatus;
7351   Boolean         hasLegalStatus;
7352   Int2            accn_count = 0;
7353   Int2            gi_count = 0;
7354   Int4            runsofn;
7355   Int4            segnum;
7356   StreamCache     sc;
7357   ValNodePtr      sc_head = NULL;
7358   ValNodePtr      sc_tail = NULL;
7359   RunOfNs         ron;
7360   Boolean         leadingX;
7361   Boolean         isLower;
7362   Boolean         isFirst;
7363   CharPtr         bases;
7364   Int4            dnalen;
7365   Int4            total;
7366   Boolean         has_barcode_keyword = FALSE;
7367   CharPtr         keyword;
7368   Int4            count;
7369   Boolean         doNotSkip;
7370   SeqMgrPtr       smp;
7371   Int4            dblink_count = 0;
7372   Int4            taa_count = 0;
7373   Int4            bs_count = 0;
7374   Int4            as_count = 0;
7375   Int4            pdb_count = 0;
7376   Int4            sra_count = 0;
7377   Int4            bp_count = 0;
7378   Int4            unknown_count = 0;
7379   Boolean         is_master = FALSE;
7380   Boolean         tsa_master = FALSE;
7381   Boolean         wgs_master = FALSE;
7382   int             linktype;
7383   Int4            linkcount;
7384   Int2            linkevarray [12];
7385   ValNodePtr      linkvnp;
7386   LinkageEvidencePtr  lep;
7387   CharPtr         curr_str, prev_str;
7388   Int2            num_seen;
7389 
7390   /* set up data structures */
7391 
7392   vsp = (ValidStructPtr) (gcp->userdata);
7393   bsp = (BioseqPtr) (gcp->thisitem);
7394   vsp->bsp = bsp;
7395   vsp->descr = NULL;
7396   vsp->sfp = NULL;
7397   vsp->bssp = (BioseqSetPtr) (gcp->parentitem);
7398   vsp->bsp_partial_val = 0;
7399 
7400   Heartbeat (vsp, NULL);
7401   sep = vsp->sep;
7402 
7403   if (vsp->non_ascii_chars) {   /* non_ascii chars in AsnRead step */
7404     //LCOV_EXCL_START
7405     //reader strips non-ascii characters, can't test in regression
7406     ValidErr (vsp, SEV_REJECT, ERR_GENERIC_NonAsciiAsn, "Non-ascii chars in input ASN.1 strings");
7407     vsp->non_ascii_chars = FALSE;       /* only do once */
7408     //LCOV_EXCL_STOP
7409   }
7410 
7411   if (bsp->id == NULL) {
7412     //LCOV_EXCL_START
7413     //C Toolkit can't get here from reading file
7414     ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_NoIdOnBioseq, "No ids on a Bioseq");
7415     return;
7416     //LCOV_EXCL_STOP
7417   }
7418 
7419   for (sip1 = bsp->id; sip1 != NULL; sip1 = sip1->next) {
7420     if (sip1->choice == SEQID_OTHER) {
7421       isRefSeq = TRUE;
7422       tsip = (TextSeqIdPtr) sip1->data.ptrvalue;
7423       if (tsip != NULL && tsip->accession != NULL) {
7424         if (StringNICmp (tsip->accession, "NT_", 3) == 0) {
7425           isNTorNC = TRUE;
7426         } else if (StringNICmp (tsip->accession, "NC_", 3) == 0) {
7427           isNTorNC = TRUE;
7428         } else if (StringNICmp (tsip->accession, "NG_", 3) == 0) {
7429           isNG = TRUE;
7430         } else if (StringNICmp (tsip->accession, "WP_", 3) == 0) {
7431           isWP = TRUE;
7432         } else if (StringNICmp (tsip->accession, "YP_", 3) == 0) {
7433           isYP = TRUE;
7434         }
7435       }
7436     } else if (sip1->choice == SEQID_GI) {
7437       hasGi = TRUE;
7438     } else if (sip1->choice == SEQID_GENBANK) {
7439       isGB = TRUE;
7440     } else if (sip1->choice == SEQID_EMBL) {
7441       isEMBL = TRUE;
7442     } else if (sip1->choice == SEQID_DDBJ) {
7443       isDDBJ = TRUE;
7444     } else if (sip1->choice == SEQID_SWISSPROT) {
7445       isSwissProt = TRUE;
7446     } else if (sip1->choice == SEQID_GIBBSQ) {
7447       isGIBBSQ = TRUE;
7448     } else if (sip1->choice == SEQID_GIBBMT) {
7449       isGIBBMT = TRUE;
7450     } else if (sip1->choice == SEQID_PATENT) {
7451       isPatent = TRUE;
7452     }
7453 
7454     for (sip2 = sip1->next; sip2 != NULL; sip2 = sip2->next) {
7455       if (SeqIdComp (sip1, sip2) != SIC_DIFF) {
7456         SeqIdWrite (sip1, buf1, PRINTID_FASTA_SHORT, 40);
7457         SeqIdWrite (sip2, buf2, PRINTID_FASTA_SHORT, 40);
7458         ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_ConflictingIdsOnBioseq, "Conflicting ids on a Bioseq: (%s - %s)", buf1, buf2);
7459       }
7460     }
7461   }
7462 
7463   for (sip1 = bsp->id; sip1 != NULL; sip1 = sip1->next) {
7464     switch (sip1->choice) {
7465         case SEQID_OTHER :
7466           tsip = (TextSeqIdPtr) sip1->data.ptrvalue;
7467           if (tsip != NULL && tsip->accession != NULL) {
7468             len = StringLen (tsip->accession);
7469             if (len == 15) {
7470               if (StringCmp (tsip->accession + 9, "000000") == 0) {
7471                 is_master = TRUE;
7472               }
7473             } else if (len == 16) {
7474               if (StringCmp (tsip->accession + 9, "0000000") == 0) {
7475                 is_master = TRUE;
7476               }
7477             } else if (len == 17) {
7478               if (StringCmp (tsip->accession + 10, "0000000") == 0) {
7479                 is_master = TRUE;
7480               }
7481             }
7482           }
7483           break;
7484         case SEQID_GENBANK :
7485         case SEQID_EMBL :
7486         case SEQID_DDBJ :
7487           tsip = (TextSeqIdPtr) sip1->data.ptrvalue;
7488           if (tsip != NULL && tsip->accession != NULL) {
7489             len = StringLen (tsip->accession);
7490             if (len == 12) {
7491               if (StringCmp (tsip->accession + 6, "000000") == 0) {
7492                 is_master = TRUE;
7493               }
7494             } else if (len == 13) {
7495               if (StringCmp (tsip->accession + 6, "0000000") == 0) {
7496                 is_master = TRUE;
7497               }
7498             } else if (len == 14) {
7499               if (StringCmp (tsip->accession + 6, "00000000") == 0) {
7500                 is_master = TRUE;
7501               }
7502             }
7503           }
7504           break;
7505        default :
7506           break;
7507     }
7508   }
7509 
7510 
7511   for (sip1 = bsp->id; sip1 != NULL; sip1 = sip1->next) {
7512     /* disabled for now
7513     ReportLongSeqId (sip1, vsp, 40);
7514     */
7515     switch (sip1->choice) {
7516     case SEQID_TPG:
7517     case SEQID_TPE:
7518     case SEQID_TPD:
7519       hist = bsp->hist;
7520       if (hist == NULL || hist->assembly == NULL) {
7521         if (ISA_na (bsp->mol) && bsp->repr != Seq_repr_seg) {
7522           hist_asm_missing = TRUE;
7523           keywords = NULL;
7524           vnp = GetNextDescriptorUnindexed (bsp, Seq_descr_genbank, NULL);
7525           if (vnp != NULL && vnp->choice == Seq_descr_genbank) {
7526             gbp = (GBBlockPtr) vnp->data.ptrvalue;
7527             if (gbp != NULL) {
7528               keywords = gbp->keywords;
7529             }
7530           }
7531           if (keywords == NULL) {
7532             vnp = GetNextDescriptorUnindexed (bsp, Seq_descr_embl, NULL);
7533             if (vnp != NULL && vnp->choice == Seq_descr_embl) {
7534               ebp = (EMBLBlockPtr) vnp->data.ptrvalue;
7535               if (ebp != NULL) {
7536                 keywords = ebp->keywords;
7537               }
7538             }
7539           }
7540           if (keywords != NULL) {
7541             for (vnp = keywords; vnp != NULL; vnp = vnp->next) {
7542               str = (CharPtr) vnp->data.ptrvalue;
7543               if (StringHasNoText (str)) continue;
7544               if (StringICmp (str, "TPA:reassembly") == 0) {
7545                 hist_asm_missing = FALSE;
7546               }
7547             }
7548           }
7549           if (hist_asm_missing) {
7550             SeqIdWrite (bsp->id, buf1, PRINTID_FASTA_SHORT, 40);
7551             ValidErr (vsp, SEV_INFO, ERR_SEQ_INST_HistAssemblyMissing, "TPA record %s should have Seq-hist.assembly for PRIMARY block", buf1);
7552           }
7553         }
7554       }
7555       /* continue falling through */
7556     case SEQID_GENBANK:
7557     case SEQID_EMBL:
7558     case SEQID_DDBJ:
7559       tsip = (TextSeqIdPtr) sip1->data.ptrvalue;
7560       if (tsip != NULL && tsip->accession != NULL) {
7561         CheckForBadSeqIdChars (vsp, tsip->accession);
7562         numletters = 0;
7563         numdigits = 0;
7564         numunderscores = 0;
7565         internalS = FALSE;
7566         letterAfterDigit = FALSE;
7567         badIDchars = FALSE;
7568         for (ptr = tsip->accession, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
7569           if (IS_UPPER (ch)) {
7570             numletters++;
7571             if (numdigits > 0 || numunderscores > 0) {
7572               if (ch == 'S' && numletters == 5 && numdigits == 2 && (! internalS)) {
7573                 numletters--;
7574                 internalS = TRUE;
7575               } else {
7576                 letterAfterDigit = TRUE;
7577               }
7578             }
7579           } else if (IS_DIGIT (ch)) {
7580             numdigits++;
7581           } else if (ch == '_') {
7582             numunderscores++;
7583             if (numdigits > 0 || numunderscores > 1) {
7584               letterAfterDigit = TRUE;
7585             }
7586           } else {
7587             badIDchars = TRUE;
7588           }
7589         }
7590         if (letterAfterDigit || badIDchars) {
7591           ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadSeqIdFormat, "Bad accession %s", tsip->accession);
7592         } else if (numunderscores > 0) {
7593           if (StringNCmp (tsip->accession, "MAP_", 4) != 0 || numdigits != 6) {
7594             ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadSeqIdFormat, "Bad accession %s", tsip->accession);
7595           }
7596         } else if (numletters == 1 && numdigits == 5 && ISA_na (bsp->mol)) {
7597         } else if (numletters == 2 && numdigits == 6 && ISA_na (bsp->mol)) {
7598         } else if (numletters == 3 && numdigits == 5 && ISA_aa (bsp->mol)) {
7599         } else if (numletters == 2 && numdigits == 6 && ISA_aa (bsp->mol) && bsp->repr == Seq_repr_seg) {
7600         } else if (numletters == 4 && internalS && (numdigits == 8 || numdigits == 9 || numdigits == 10) && ISA_na (bsp->mol) &&
7601                    (sip1->choice == SEQID_GENBANK || sip1->choice == SEQID_EMBL || sip1->choice == SEQID_DDBJ ||
7602                     sip1->choice == SEQID_TPG || sip1->choice == SEQID_TPE || sip1->choice == SEQID_TPD)) {
7603         } else if (numletters == 4 && numdigits == 8 && ISA_na (bsp->mol) &&
7604                    (sip1->choice == SEQID_GENBANK || sip1->choice == SEQID_EMBL || sip1->choice == SEQID_DDBJ ||
7605                     sip1->choice == SEQID_TPG || sip1->choice == SEQID_TPE || sip1->choice == SEQID_TPD)) {
7606         } else if (numletters == 4 && numdigits == 9 && ISA_na (bsp->mol) &&
7607                    (sip1->choice == SEQID_GENBANK || sip1->choice == SEQID_EMBL || sip1->choice == SEQID_DDBJ ||
7608                     sip1->choice == SEQID_TPG || sip1->choice == SEQID_TPE || sip1->choice == SEQID_TPD)) {
7609         } else if (numletters == 4 && numdigits == 10 && ISA_na (bsp->mol) &&
7610                    (sip1->choice == SEQID_GENBANK || sip1->choice == SEQID_EMBL || sip1->choice == SEQID_DDBJ ||
7611                     sip1->choice == SEQID_TPG || sip1->choice == SEQID_TPE || sip1->choice == SEQID_TPD)) {
7612         } else if (numletters == 5 && numdigits == 7 && ISA_na (bsp->mol) &&
7613                    (sip1->choice == SEQID_GENBANK || sip1->choice == SEQID_EMBL || sip1->choice == SEQID_DDBJ)) {
7614         } else {
7615           ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadSeqIdFormat, "Bad accession %s", tsip->accession);
7616         }
7617         if (vsp->useSeqMgrIndexes) {
7618           vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_genbank, &context);
7619           if (vnp != NULL) {
7620             gbp = (GBBlockPtr) vnp->data.ptrvalue;
7621             if (gbp != NULL) {
7622               LookForSecondaryConflict (vsp, gcp, tsip->accession, gbp->extra_accessions);
7623             }
7624           }
7625           vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_embl, &context);
7626           if (vnp != NULL) {
7627             ebp = (EMBLBlockPtr) vnp->data.ptrvalue;
7628             if (ebp != NULL) {
7629               LookForSecondaryConflict (vsp, gcp, tsip->accession, ebp->extra_acc);
7630             }
7631           }
7632         }
7633         if (hasGi) {
7634           if (tsip->version == 0) {
7635             ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_BadSeqIdFormat, "Accession %s has 0 version", tsip->accession);
7636           }
7637         }
7638       }
7639       /* and keep going with further test */
7640     case SEQID_OTHER:
7641       tsip = (TextSeqIdPtr) sip1->data.ptrvalue;
7642       if (tsip != NULL && tsip->name != NULL) {
7643         multitoken = FALSE;
7644         for (ptr = tsip->name, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
7645           if (IS_WHITESP (ch)) {
7646             multitoken = TRUE;
7647           }
7648         }
7649         if (multitoken) {
7650           ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqIdNameHasSpace, "Seq-id.name '%s' should be a single word without any spaces", tsip->name);
7651         }
7652       }
7653       if (tsip != NULL && tsip->accession != NULL && sip1->choice == SEQID_OTHER) {
7654         CheckForBadSeqIdChars (vsp, tsip->accession);
7655         numletters = 0;
7656         numdigits = 0;
7657         numunderscores = 0;
7658         letterAfterDigit = FALSE;
7659         badIDchars = FALSE;
7660         ptr = tsip->accession;
7661         isNZ = (Boolean) (StringNCmp (ptr, "NZ_", 3) == 0);
7662         if (isNZ) {
7663           ptr += 3;
7664         }
7665         for (ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
7666           if (IS_UPPER (ch)) {
7667             numletters++;
7668             if (numdigits > 0 || numunderscores > 0) {
7669               letterAfterDigit = TRUE;
7670             }
7671           } else if (IS_DIGIT (ch)) {
7672             numdigits++;
7673           } else if (ch == '_') {
7674             numunderscores++;
7675             if (numdigits > 0 || numunderscores > 1) {
7676               letterAfterDigit = TRUE;
7677             }
7678           } else {
7679             badIDchars = TRUE;
7680           }
7681         }
7682         if (letterAfterDigit || badIDchars) {
7683           ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_BadSeqIdFormat, "Bad accession %s", tsip->accession);
7684         } else if (isNZ && numletters == 4 && (numdigits == 8 || numdigits == 9) && numunderscores == 0) {
7685         } else if (isNZ && ValidateAccn (tsip->accession) == 0) {
7686         } else if (numletters == 2 && numdigits == 6 && numunderscores == 1) {
7687         } else if (numletters == 2 && numdigits == 8 && numunderscores == 1) {
7688         } else if (numletters == 2 && numdigits == 9 && numunderscores == 1) {
7689         } else {
7690           ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadSeqIdFormat, "Bad accession %s", tsip->accession);
7691         }
7692       }
7693       if (hasGi && tsip != NULL && tsip->accession == NULL && (! StringHasNoText (tsip->name))) {
7694         if (sip1->choice == SEQID_DDBJ && bsp->repr == Seq_repr_seg) {
7695           //LCOV_EXCL_START
7696           // Only for SegSets
7697           sev = SEV_WARNING;
7698           //LCOV_EXCL_STOP
7699         } else {
7700           sev = SEV_REJECT;
7701           ValidErr (vsp, sev, ERR_SEQ_INST_BadSeqIdFormat, "Missing accession for %s", tsip->name);
7702         }
7703       }
7704       /* and keep going with additional test */
7705     case SEQID_PIR:
7706     case SEQID_SWISSPROT:
7707     case SEQID_PRF:
7708       tsip = (TextSeqIdPtr) sip1->data.ptrvalue;
7709       if (tsip != NULL && StringHasNoText (tsip->accession) && ISA_na (bsp->mol)) {
7710         if (bsp->repr != Seq_repr_seg || hasGi) {
7711           if (sip1->choice != SEQID_DDBJ || bsp->repr != Seq_repr_seg) {
7712             SeqIdWrite (bsp->id, buf1, PRINTID_FASTA_LONG, sizeof (buf1) - 1);
7713             ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadSeqIdFormat, "Missing accession for %s", buf1);
7714           }
7715         }
7716       }
7717       if (tsip != NULL && StringHasNoText (tsip->accession) &&
7718           StringHasNoText (tsip->name) && ISA_aa (bsp->mol)) {
7719         if (sip1->choice == SEQID_PIR || sip1->choice == SEQID_SWISSPROT || sip1->choice == SEQID_PRF) {
7720           SeqIdWrite (bsp->id, buf1, PRINTID_FASTA_LONG, sizeof (buf1) - 1);
7721           ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_BadSeqIdFormat, "Missing identifier for %s", buf1);
7722         }
7723       }
7724       accn_count++;
7725       break;
7726     case SEQID_GPIPE:
7727       break;
7728     case SEQID_PATENT:
7729       isPatent = TRUE;
7730       break;
7731     case SEQID_PDB:
7732       isPDB = TRUE;
7733       break;
7734     case SEQID_GI:
7735       if (sip1->data.intvalue <= 0) {
7736         ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_ZeroGiNumber, "Invalid GI number");
7737       }
7738       gi_count++;
7739       break;
7740     case SEQID_GENERAL:
7741       dbt = (DbtagPtr) sip1->data.ptrvalue;
7742       if (dbt != NULL) {
7743         if (StringHasNoText (dbt->db)) {
7744           ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadSeqIdFormat, "General identifier missing database field");
7745         }
7746         if (StringICmp (dbt->db, "LRG") == 0) {
7747           isLRG = TRUE;
7748         }
7749         sev = SEV_ERROR;
7750         if (vsp->only_lcl_gnl_in_sep) {
7751           sev = SEV_REJECT;
7752         } else if (vsp->is_refseq_in_sep) {
7753           sev = SEV_ERROR;
7754         } else if (vsp->is_insd_in_sep) {
7755           sev = SEV_ERROR;
7756         } else if (vsp->indexerVersion) {
7757           sev = SEV_ERROR;
7758         }
7759         if (StringLen (dbt->db) > 20) {
7760           ValidErr (vsp, sev, ERR_SEQ_INST_BadSeqIdFormat, "General database longer than 20 characters");
7761         }
7762         if (StringICmp (dbt->db, "BankIt") != 0 && StringICmp (dbt->db, "TMSMART") != 0 && StringICmp (dbt->db, "NCBIFILE") != 0) {
7763           oip = dbt->tag;
7764           if (oip != NULL && StringLen (oip->str) > 64) {
7765             ValidErr (vsp, sev, ERR_SEQ_INST_BadSeqIdFormat, "General identifier longer than 64 characters");
7766           }
7767         }
7768         oip = dbt->tag;
7769         if (oip != NULL && oip->str != NULL) {
7770           CheckForBadSeqIdChars (vsp, oip->str);
7771         }
7772       }
7773       break;
7774     case SEQID_LOCAL:
7775       oip = (ObjectIdPtr) sip1->data.ptrvalue;
7776       if (oip != NULL && oip->str != NULL) {
7777         CheckForBadSeqIdChars (vsp, oip->str);
7778       }
7779       if (oip != NULL && StringLen (oip->str) > 50) {
7780         sev = SEV_ERROR;
7781         if (! vsp->is_insd_in_sep) {
7782           sev = SEV_REJECT;
7783         } else if (! vsp->indexerVersion) {
7784           sev = SEV_ERROR;
7785         }
7786         ValidErr (vsp, sev, ERR_SEQ_INST_BadSeqIdFormat, "Local identifier longer than 50 characters");
7787       }
7788       break;
7789     default:
7790       break;
7791     }
7792   }
7793 
7794   if (isLRG) {
7795     if (! isNG) {
7796       ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_ConflictingIdsOnBioseq, "LRG sequence needs NG_ accession");
7797     }
7798   }
7799 
7800   if (gi_count > 0 && accn_count == 0 && (! isPDB) && bsp->repr != Seq_repr_virtual) {
7801     if (vsp->seqSubmitParent) {
7802       sev = SEV_WARNING;
7803     } else {
7804       sev = SEV_ERROR;
7805     }
7806     ValidErr (vsp, sev, ERR_SEQ_INST_GiWithoutAccession, "No accession on sequence with gi number");
7807   }
7808   if (gi_count > 0 && accn_count > 1) {
7809     ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_MultipleAccessions, "Multiple accessions on sequence with gi number");
7810   }
7811 
7812   /* optionally check IDs against older version in database */
7813 
7814   if (vsp->validateIDSet) {
7815     ValidateIDSetAgainstDb (gcp, vsp, bsp);
7816   }
7817 
7818   vnp = NULL;
7819   if (vsp->useSeqMgrIndexes) {
7820     vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &context);
7821     if (vnp != NULL) {
7822       mip = (MolInfoPtr) vnp->data.ptrvalue;
7823       if (mip != NULL && bsp->mol == MOLECULE_CLASS_DNA) {
7824         switch (mip->biomol) {
7825           case MOLECULE_TYPE_PRE_MRNA:
7826           case MOLECULE_TYPE_MRNA:
7827           case MOLECULE_TYPE_RRNA:
7828           case MOLECULE_TYPE_TRNA:
7829           case MOLECULE_TYPE_SNRNA:
7830           case MOLECULE_TYPE_SCRNA:
7831           case MOLECULE_TYPE_CRNA:
7832           case MOLECULE_TYPE_SNORNA:
7833           case MOLECULE_TYPE_TRANSCRIBED_RNA:
7834           case MOLECULE_TYPE_NCRNA:
7835           case MOLECULE_TYPE_TMRNA:
7836             olditemid = gcp->itemID;
7837             olditemtype = gcp->thistype;
7838             gcp->itemID = context.itemID;
7839             gcp->thistype = OBJ_SEQDESC;
7840             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InconsistentMolTypeBiomol, "Molecule type (DNA) does not match biomol (RNA)");
7841             gcp->itemID = olditemid;
7842             gcp->thistype = olditemtype;
7843             break;
7844           default:
7845             break;
7846         }
7847       }
7848     }
7849   } else {
7850 //LCOV_EXCL_START
7851     bcp = BioseqContextNew (bsp);
7852     vnp = BioseqContextGetSeqDescr (bcp, Seq_descr_molinfo, NULL, NULL);
7853     BioseqContextFree (bcp);
7854     if (vnp != NULL) {
7855       mip = (MolInfoPtr) vnp->data.ptrvalue;
7856     }
7857 //LCOV_EXCL_STOP
7858   }
7859 
7860   if (vsp->useSeqMgrIndexes) {
7861     vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &context);
7862     while (vnp != NULL) {
7863       uop = (UserObjectPtr) vnp->data.ptrvalue;
7864       if (uop != NULL) {
7865         oip = uop->type;
7866         if (oip != NULL && StringICmp (oip->str, "TpaAssembly") == 0) {
7867           if (! IsTpa (bsp, TRUE, &isRefSeq)) {
7868             olditemid = gcp->itemID;
7869             olditemtype = gcp->thistype;
7870             gcp->itemID = context.itemID;
7871             gcp->thistype = OBJ_SEQDESC;
7872             SeqIdWrite (bsp->id, buf1, PRINTID_FASTA_SHORT, 40);
7873             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Non-TPA record %s should not have TpaAssembly object", buf1);
7874             gcp->itemID = olditemid;
7875             gcp->thistype = olditemtype;
7876           }
7877         } else if (oip != NULL && StringICmp (oip->str, "RefGeneTracking") == 0) {
7878           hasRefGeneTracking = TRUE;
7879           hasRefTrackStatus = FALSE;
7880           hasLegalStatus = FALSE;
7881           for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
7882             oip = ufp->label;
7883             if (oip != NULL && StringCmp (oip->str, "Status") == 0) {
7884               hasRefTrackStatus = TRUE;
7885               str = (CharPtr) ufp->data.ptrvalue;
7886               if (StringHasNoText (str)) {
7887                 str = "?";
7888               }
7889               for (i = 0; legal_refgene_status_strings [i] != NULL; i++) {
7890                 if (StringICmp (str, legal_refgene_status_strings [i]) == 0) {
7891                   hasLegalStatus = TRUE;
7892                   break;
7893                 }
7894               }
7895               if (! hasLegalStatus) {
7896                 olditemid = gcp->itemID;
7897                 olditemtype = gcp->thistype;
7898                 gcp->itemID = context.itemID;
7899                 gcp->thistype = OBJ_SEQDESC;
7900                 vsp->descr = vnp;
7901                 ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_RefGeneTrackingIllegalStatus, "RefGeneTracking object has illegal Status '%s'", str);
7902                 vsp->descr = NULL;
7903                 gcp->itemID = olditemid;
7904                 gcp->thistype = olditemtype;
7905               }
7906             }
7907           }
7908           if (! hasRefTrackStatus) {
7909             olditemid = gcp->itemID;
7910             olditemtype = gcp->thistype;
7911             gcp->itemID = context.itemID;
7912             gcp->thistype = OBJ_SEQDESC;
7913             vsp->descr = vnp;
7914             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_RefGeneTrackingWithoutStatus, "RefGeneTracking object needs to have Status set");
7915             vsp->descr = NULL;
7916             gcp->itemID = olditemid;
7917             gcp->thistype = olditemtype;
7918           }
7919           if (! isRefSeq && ! vsp->is_refseq_in_sep) {
7920             olditemid = gcp->itemID;
7921             olditemtype = gcp->thistype;
7922             gcp->itemID = context.itemID;
7923             gcp->thistype = OBJ_SEQDESC;
7924             vsp->descr = vnp;
7925             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_RefGeneTrackingOnNonRefSeq, "RefGeneTracking object should only be in RefSeq record");
7926             vsp->descr = NULL;
7927             gcp->itemID = olditemid;
7928             gcp->thistype = olditemtype;
7929           }
7930         } else if (oip != NULL && StringICmp (oip->str, "DBLink") == 0) {
7931           dblink_count++;
7932           for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
7933             oip = ufp->label;
7934             if (oip == NULL || oip->str == NULL) continue;
7935             if (StringICmp (oip->str, "Trace Assembly Archive") == 0 && (ufp->choice == 2 || ufp->choice == 8)) {
7936               taa_count++;
7937             } else if (StringICmp (oip->str, "BioSample") == 0 && (ufp->choice == 1 || ufp->choice == 7)) {
7938               bs_count++;
7939             } else if (StringICmp (oip->str, "Assembly") == 0 && (ufp->choice == 1 || ufp->choice == 7)) {
7940               as_count++;
7941             } else if (StringICmp (oip->str, "ProbeDB") == 0 && (ufp->choice == 1 || ufp->choice == 7)) {
7942               pdb_count++;
7943             } else if (StringICmp (oip->str, "Sequence Read Archive") == 0 && (ufp->choice == 1 || ufp->choice == 7)) {
7944               sra_count++;
7945             } else if (StringICmp (oip->str, "BioProject") == 0 && (ufp->choice == 1 || ufp->choice == 7)) {
7946               bp_count++;
7947             } else {
7948               unknown_count++;
7949             }
7950           }
7951         } else if (oip != NULL && StringICmp (oip->str, "StructuredComment") == 0) {
7952           this_is_gen_asm = FALSE;
7953           for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
7954             oip = ufp->label;
7955             if (oip == NULL || oip->str == NULL) continue;
7956             if (StringICmp (oip->str, "StructuredCommentPrefix") == 0) {
7957               ValNodeCopyStrEx (&sc_head, &sc_tail, 0, (CharPtr) ufp->data.ptrvalue);
7958               if (StringCmp ((CharPtr) ufp->data.ptrvalue, "##Genome-Assembly-Data-START##") == 0) {
7959                 is_genome_assembly = TRUE;
7960                 this_is_gen_asm = TRUE;
7961               } else if (StringCmp ((CharPtr) ufp->data.ptrvalue, "##Assembly-Data-START##") == 0) {
7962                 is_assembly = TRUE;
7963               }
7964             }
7965           }
7966           if (this_is_gen_asm) {
7967             for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
7968               oip = ufp->label;
7969               if (oip == NULL || oip->str == NULL) continue;
7970               if (StringICmp (oip->str, "Current Finishing Status") == 0) {
7971                 if (StringCmp ((CharPtr) ufp->data.ptrvalue, "Finished") == 0) {
7972                   is_finished_status = TRUE;
7973                 }
7974               }
7975             }
7976           }
7977           if (is_genome_assembly && is_finished_status && mip != NULL && mip->tech == MI_TECH_wgs) {
7978             olditemid = gcp->itemID;
7979             olditemtype = gcp->thistype;
7980             gcp->itemID = context.itemID;
7981             gcp->thistype = OBJ_SEQDESC;
7982             SeqIdWrite (bsp->id, buf1, PRINTID_FASTA_SHORT, 40);
7983             ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_FinishedStatusForWGS, "WGS record %s should not have Finished status", buf1);
7984             gcp->itemID = olditemid;
7985             gcp->thistype = olditemtype;
7986           }
7987         }
7988         if ((keyword = KeywordForStructuredCommentName (uop)) != NULL) {
7989           if (IsStructuredCommentValid(uop, NULL, NULL) == eFieldValid_Valid) {
7990             if (! HasAllKeywordsForStructuredComment (bsp, keyword)) {
7991               /*
7992               ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_MissingKeyword, "Structured Comment compliant, keyword should be added");
7993               */
7994             }
7995           } else {
7996             if (HasAnyKeywordForStructuredComment (bsp, keyword)) {
7997               ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_BadKeyword, "Structured Comment is non-compliant, keyword should be removed");
7998             }
7999           }
8000         }
8001         keyword = MemFree (keyword);
8002       }
8003       vnp = SeqMgrGetNextDescriptor (bsp, vnp, Seq_descr_user, &context);
8004     }
8005   }
8006 
8007   if (sc_head != NULL) {
8008     sc_head = ValNodeSort (sc_head, SortVnpByString);
8009 
8010     prev_str = NULL;
8011     num_seen = 0;
8012 
8013     for (vnp = sc_head; vnp != NULL; vnp = vnp->next) {
8014       curr_str = (CharPtr) vnp->data.ptrvalue;
8015       if (StringHasNoText (curr_str)) continue;
8016 
8017       if (StringICmp (curr_str, prev_str) == 0) {
8018         num_seen++;
8019       } else {
8020         if (num_seen > 1) {
8021           ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_MultipleComments, "Multiple structured comments with prefix %s", prev_str);
8022         }
8023         prev_str = curr_str;
8024         num_seen = 1;
8025       }
8026     }
8027     if (num_seen > 1) {
8028       ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_MultipleComments, "Multiple structured comments with prefix %s", prev_str);
8029     }
8030 
8031     sc_head = ValNodeFreeData (sc_head);
8032   }
8033 
8034   vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_title, &context);
8035   while (vnp != NULL) {
8036     title = (CharPtr) vnp->data.ptrvalue;
8037     if (StringNCmp (title, "GRC", 3) == 0 && StringLen (title) > 3) {
8038       is_grc_title = TRUE;
8039     }
8040     vnp = SeqMgrGetNextDescriptor (bsp, vnp, Seq_descr_title, &context);
8041   }
8042 
8043   if (s_IsInNucProtSet (bsp)) {
8044     VisitDescriptorsOnBsp (bsp, vsp, FindDBlinkUserObjectOnBsp);
8045   }
8046 
8047   if (dblink_count > 1) {
8048     ValidErr (vsp, SEV_REJECT, ERR_SEQ_DESCR_DBLinkProblem, "%ld DBLink user objects apply to a Bioseq", (long) dblink_count);
8049   }
8050   if (taa_count > 1) {
8051     ValidErr (vsp, SEV_REJECT, ERR_SEQ_DESCR_DBLinkProblem, "Trace Assembly Archive entries appear in %ld DBLink user objects", (long) taa_count);
8052   }
8053   if (bs_count > 1) {
8054     ValidErr (vsp, SEV_REJECT, ERR_SEQ_DESCR_DBLinkProblem, "BioSample entries appear in %ld DBLink user objects", (long) bs_count);
8055   }
8056   if (as_count > 1) {
8057     ValidErr (vsp, SEV_REJECT, ERR_SEQ_DESCR_DBLinkProblem, "Assembly entries appear in %ld DBLink user objects", (long) as_count);
8058   }
8059   if (pdb_count > 1) {
8060     ValidErr (vsp, SEV_REJECT, ERR_SEQ_DESCR_DBLinkProblem, "ProbeDB entries appear in %ld DBLink user objects", (long) pdb_count);
8061   }
8062   if (sra_count > 1) {
8063     ValidErr (vsp, SEV_REJECT, ERR_SEQ_DESCR_DBLinkProblem, "Sequence Read Archive entries appear in %ld DBLink user objects", (long) sra_count);
8064   }
8065   if (bp_count > 1) {
8066     ValidErr (vsp, SEV_REJECT, ERR_SEQ_DESCR_DBLinkProblem, "BioProject entries appear in %ld DBLink user objects", (long) bp_count);
8067   }
8068   if (unknown_count > 1) {
8069     ValidErr (vsp, SEV_REJECT, ERR_SEQ_DESCR_DBLinkProblem, "Unrecognized entries appear in %ld DBLink user objects", (long) unknown_count);
8070   } else if (unknown_count > 0) {
8071     ValidErr (vsp, SEV_REJECT, ERR_SEQ_DESCR_DBLinkProblem, "Unrecognized entries appear in %ld DBLink user object", (long) unknown_count);
8072   }
8073 
8074   if (bp_count == 0 && isRefSeq && (! isNG)) {
8075       if ((bsp->repr == Seq_repr_seg && (! SegHasParts (bsp))) ||
8076           (bsp->repr == Seq_repr_delta && (! DeltaLitOnly (bsp))) ||
8077           (bsp->repr == Seq_repr_ref)) {
8078       ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_ScaffoldLacksBioProject, "BioProject entries not present on CON record");
8079       }
8080   }
8081   if (bp_count == 0 && (isGB || isEMBL || isDDBJ) && ((mip != NULL && mip->tech == MI_TECH_wgs) || is_grc_title)) {
8082       if ((bsp->repr == Seq_repr_seg && (! SegHasParts (bsp))) ||
8083           (bsp->repr == Seq_repr_delta && (! DeltaLitOnly (bsp))) ||
8084           (bsp->repr == Seq_repr_ref)) {
8085       ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_ScaffoldLacksBioProject, "BioProject entries not present on CON record");
8086       }
8087   }
8088 
8089   for (sip1 = bsp->id; sip1 != NULL; sip1 = sip1->next) {
8090     bsp2 = BioseqFindSpecial (sip1);
8091     if (bsp2 == NULL) {
8092       if (!isPatent) {
8093         SeqIdWrite (sip1, buf1, PRINTID_FASTA_SHORT, sizeof (buf1) -1);
8094         ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_IdOnMultipleBioseqs, "BioseqFind (%s) unable to find itself - possible internal error", buf1);
8095       }
8096     } else if (bsp2 != bsp) {
8097       if (sip1->choice == SEQID_GENERAL) {
8098         dbt = (DbtagPtr) sip1->data.ptrvalue;
8099         if (dbt != NULL && StringICmp (dbt->db, "NCBIFILE") == 0) continue;
8100       }
8101       SeqIdWrite (sip1, buf1, PRINTID_FASTA_SHORT, sizeof (buf2) -1);
8102       ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_IdOnMultipleBioseqs, "SeqID %s is present on multiple Bioseqs in record", buf1);
8103     }
8104   }
8105 
8106   for (i = 0; i < 4; i++)
8107     errors[i] = FALSE;
8108 
8109   switch (bsp->repr) {
8110   case Seq_repr_virtual:
8111     if ((bsp->seq_ext_type) || (bsp->seq_ext != NULL))
8112       errors[0] = TRUE;
8113     if ((bsp->seq_data_type) || (bsp->seq_data != NULL))
8114       errors[3] = TRUE;
8115     break;
8116   case Seq_repr_map:
8117     //LCOV_EXCL_START
8118     //C Toolkit gather assumes correct ext type, crashes if other
8119     if ((bsp->seq_ext_type != 3) || (bsp->seq_ext == NULL))
8120       errors[1] = TRUE;
8121     //LCOV_EXCL_STOP
8122     if ((bsp->seq_data_type) || (bsp->seq_data != NULL))
8123       errors[3] = TRUE;
8124     break;
8125   case Seq_repr_ref:
8126     if ((bsp->seq_ext_type != 2) || (bsp->seq_ext == NULL))
8127       errors[1] = TRUE;
8128     if ((bsp->seq_data_type) || (bsp->seq_data != NULL))
8129       errors[3] = TRUE;
8130     break;
8131   case Seq_repr_seg:
8132     //LCOV_EXCL_START
8133     //segsets are obsolete
8134     if ((bsp->seq_ext_type != 1) || (bsp->seq_ext == NULL))
8135       errors[1] = TRUE;
8136     if ((bsp->seq_data_type) || (bsp->seq_data != NULL))
8137       errors[3] = TRUE;
8138     break;
8139     //LCOV_EXCL_STOP
8140   case Seq_repr_raw:
8141   case Seq_repr_const:
8142     if ((bsp->seq_ext_type) || (bsp->seq_ext != NULL))
8143       errors[0] = TRUE;
8144     if ((bsp->seq_data_type < 1) || (bsp->seq_data_type > 11)
8145         || (bsp->seq_data == NULL))
8146       errors[2] = TRUE;
8147     break;
8148   case Seq_repr_delta:
8149     if ((bsp->seq_ext_type != 4) || (bsp->seq_ext == NULL))
8150       errors[1] = TRUE;
8151     if ((bsp->seq_data_type) || (bsp->seq_data != NULL))
8152       errors[3] = TRUE;
8153     break;
8154   default:
8155     ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_ReprInvalid, "Invalid Bioseq->repr = %d", (int) (bsp->repr));
8156     return;
8157   }
8158 
8159   if (errors[0] == TRUE) {
8160     ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_ExtNotAllowed, "Bioseq-ext not allowed on %s Bioseq", repr[bsp->repr - 1]);
8161     retval = FALSE;
8162   }
8163 
8164   if (errors[1] == TRUE) {
8165     ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_ExtBadOrMissing, "Missing or incorrect Bioseq-ext on %s Bioseq", repr[bsp->repr - 1]);
8166     retval = FALSE;
8167   }
8168 
8169   if (errors[2] == TRUE) {
8170     ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqDataNotFound, "Missing Seq-data on %s Bioseq", repr[bsp->repr - 1]);
8171     retval = FALSE;
8172   }
8173 
8174   if (errors[3] == TRUE) {
8175     ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_SeqDataNotAllowed, "Seq-data not allowed on %s Bioseq", repr[bsp->repr - 1]);
8176     retval = FALSE;
8177   }
8178 
8179   if (!retval)
8180     return;
8181 
8182   oldEntityID = gcp->entityID;
8183   oldItemID = gcp->itemID;
8184   oldItemtype = gcp->thistype;
8185 
8186   if (ISA_aa (bsp->mol)) {
8187     if (bsp->topology > 1) {    /* not linear */
8188       ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_CircularProtein, "Non-linear topology set on protein");
8189     }
8190     if (bsp->strand > 1) {
8191       ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_DSProtein, "Protein not single stranded");
8192     }
8193 
8194   } else {
8195     if (!bsp->mol)
8196       ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_MolNotSet, "Bioseq.mol is 0");
8197     else if (bsp->mol == Seq_mol_other)
8198       ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_MolOther, "Bioseq.mol is type other");
8199     else if (bsp->mol == Seq_mol_na)
8200       ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_MolNuclAcid, "Bioseq.mol is type na");
8201   }
8202 
8203   if (ISA_na (bsp->mol)) {
8204     if (bsp->strand > 1 && mip != NULL) {
8205       if (mip->biomol == MOLECULE_TYPE_MRNA) {
8206         ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_DSmRNA, "mRNA not single stranded");
8207       }
8208     }
8209   }
8210 
8211   gcp->entityID = oldEntityID;
8212   gcp->itemID = oldItemID;
8213   gcp->thistype = oldItemtype;
8214 
8215   /* check sequence alphabet */
8216   if (((bsp->repr == Seq_repr_raw) || (bsp->repr == Seq_repr_const)) && bsp->seq_data_type != Seq_code_gap) {
8217     if (bsp->fuzz != NULL) {
8218       ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_FuzzyLen, "Fuzzy length on %s Bioseq", repr[bsp->repr - 1]);
8219     }
8220 
8221     if (bsp->length < 1) {
8222         //LCOV_EXCL_START
8223         //can't test in regression, C toolkit can't read it
8224       ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_InvalidLen, "Invalid Bioseq length [%ld]", (long) bsp->length);
8225       //LCOV_EXCL_STOP
8226     }
8227 
8228     seqtype = (int) (bsp->seq_data_type);
8229     switch (seqtype) {
8230     case Seq_code_iupacna:
8231     case Seq_code_ncbi2na:
8232     case Seq_code_ncbi4na:
8233     case Seq_code_ncbi8na:
8234     case Seq_code_ncbipna:
8235       if (ISA_aa (bsp->mol)) {
8236         ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidAlphabet, "Using a nucleic acid alphabet on a protein sequence");
8237         return;
8238       }
8239       break;
8240     case Seq_code_iupacaa:
8241     case Seq_code_ncbi8aa:
8242     case Seq_code_ncbieaa:
8243     case Seq_code_ncbipaa:
8244     case Seq_code_ncbistdaa:
8245       if (ISA_na (bsp->mol)) {
8246         ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidAlphabet, "Using a protein alphabet on a nucleic acid");
8247         return;
8248       }
8249       break;
8250     case Seq_code_gap:
8251       break;
8252     default:
8253         //LCOV_EXCL_START
8254         //not readable by C Toolkit
8255       ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidAlphabet, "Using illegal sequence alphabet [%d]", (int) bsp->seq_data_type);
8256       return;
8257       //LCOV_EXCL_STOP
8258     }
8259 
8260     check_alphabet = FALSE;
8261     switch (seqtype) {
8262     case Seq_code_iupacaa:
8263     case Seq_code_iupacna:
8264     case Seq_code_ncbieaa:
8265     case Seq_code_ncbistdaa:
8266       check_alphabet = TRUE;
8267 
8268     case Seq_code_ncbi8na:
8269     case Seq_code_ncbi8aa:
8270       divisor = 1;
8271       break;
8272 
8273     case Seq_code_ncbi4na:
8274       divisor = 2;
8275       break;
8276 
8277     case Seq_code_ncbi2na:
8278       divisor = 4;
8279       break;
8280 
8281     case Seq_code_ncbipna:
8282       divisor = 5;
8283       break;
8284 
8285     case Seq_code_ncbipaa:
8286       divisor = 21;
8287       break;
8288     }
8289 
8290     len = bsp->length;
8291     if (len % divisor)
8292       len += divisor;
8293     len /= divisor;
8294     len2 = BSLen ((ByteStorePtr) bsp->seq_data);
8295     if (len > len2) {
8296       ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data too short [%ld] for given length [%ld]", (long) (len2 * divisor),
8297                 (long) bsp->length);
8298       return;
8299     } else if (len < len2) {
8300       ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data is larger [%ld] than given length [%ld]", (long) (len2 * divisor),
8301                 (long) bsp->length);
8302     }
8303 
8304     if (check_alphabet) {       /* check 1 letter alphabets */
8305       switch (seqtype) {
8306       case Seq_code_iupacaa:
8307       case Seq_code_ncbieaa:
8308         termination = '*';
8309         gapchar = '-';
8310         break;
8311       case Seq_code_ncbistdaa:
8312         termination = 25;
8313         gapchar = 0;
8314         break;
8315       default:
8316         termination = '\0';
8317         gapchar = '\0';
8318         break;
8319       }
8320       if (! StreamCacheSetup (bsp, NULL, STREAM_EXPAND_GAPS, &sc)) {
8321         //LCOV_EXCL_START
8322         //C toolkit specific
8323         ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqPortFail, "Can't open StreamCache");
8324         return;
8325         //LCOV_EXCL_STOP
8326       }
8327       /*
8328       spp = SeqPortNew (bsp, 0, -1, 0, 0);
8329       if (spp == NULL) {
8330         ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqPortFail, "Can't open SeqPort");
8331         return;
8332       }
8333       */
8334       i = 0;
8335       terminations = 0;
8336       dashes = 0;
8337       gapatstart = FALSE;
8338       trailingX = 0;
8339       leadingX = FALSE;
8340       isLower = FALSE;
8341       isFirst = TRUE;
8342       for (len = 0; len < bsp->length; len++) {
8343         residue = StreamCacheGetResidue (&sc);
8344         /*
8345         residue = SeqPortGetResidue (spp);
8346         */
8347         if (!IS_residue (residue)) {
8348           //LCOV_EXCL_START
8349           //code never reached, StreamCache ignores invalid residues silently
8350           i++;
8351           if (i > 10) {
8352             ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidResidue, "More than 10 invalid residues. Checking stopped");
8353             /*
8354             SeqPortFree (spp);
8355             */
8356             // patch_seq is never set
8357             if (vsp->patch_seq)
8358               PatchBadSequence (bsp);
8359             return;
8360           } else {
8361             BSSeek ((ByteStorePtr) bsp->seq_data, len, SEEK_SET);
8362             x = BSGetByte ((ByteStorePtr) bsp->seq_data);
8363             if (bsp->seq_data_type == Seq_code_ncbistdaa) {
8364               ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidResidue, "Invalid residue [%d] at position [%ld]", (int) x, (long) (len + 1));
8365             } else if (IS_ALPHA (x)) {
8366               ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidResidue, "Invalid residue '%c' at position [%ld]", (char) x, (long) (len + 1));
8367             } else {
8368               ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidResidue, "Invalid residue [%d] at position [%ld]", (int) x, (long) (len + 1));
8369             }
8370           }
8371           //LCOV_EXCL_STOP
8372         } else if (residue == termination) {
8373           terminations++;
8374           trailingX = 0;        /* suppress if followed by terminator */
8375         } else if (residue == gapchar) {
8376           dashes++;
8377           if (len == 0) {
8378             gapatstart = TRUE;
8379           }
8380         } else if (residue == 'X') {
8381           if (ISA_na (bsp->mol)) {
8382             ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidResidue, "Invalid nucleotide residue '%c' at position [%ld]", (char) residue, (long) (len + 1));
8383           } else {
8384             trailingX++;
8385             if (isFirst) {
8386               leadingX = TRUE;
8387             }
8388           }
8389         } else if (ISA_na (bsp->mol) && StringChr ("EFIJLOPQZ", (Char) residue) != NULL) {
8390           ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidResidue, "Invalid nucleotide residue '%c' at position [%ld]", (char) residue, (long) (len + 1));
8391         } else if (! IS_ALPHA ((Char) residue)) {
8392           ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidResidue, "Invalid residue '%c' at position [%ld]", (char) residue, (long) (len + 1));
8393         } else {
8394           trailingX = 0;
8395           if (IS_LOWER ((Char) residue)) {
8396             isLower = TRUE;
8397           }
8398         }
8399         isFirst = FALSE;
8400       }
8401       /*
8402       SeqPortFree (spp);
8403       */
8404       if (ISA_aa (bsp->mol) && (leadingX || trailingX > 0)) {
8405         /* only show leading or trailing X if product of NNN in nucleotide */
8406         cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
8407         if (cds != NULL) {
8408           crp = (CdRegionPtr) cds->data.value.ptrvalue;
8409           if (crp != NULL) {
8410             dnalen = SeqLocLen (cds->location);
8411             if (dnalen > 5) {
8412               bases = ReadCodingRegionBases (cds->location, dnalen, crp->frame, &total);
8413               len = StringLen (bases);
8414               if (len > 5) {
8415                 if (StringNICmp (bases, "NNN", 3) != 0) {
8416                   leadingX = FALSE;
8417                 }
8418                 if (StringNICmp (bases + len - 3, "NNN", 3) != 0) {
8419                   trailingX = 0;
8420                 }
8421               }
8422               MemFree (bases);
8423             }
8424           }
8425         }
8426       }
8427       if (leadingX) {
8428         ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_LeadingX, "Sequence starts with leading X", (int) leadingX);
8429       }
8430       if (trailingX > 0 && SuppressTrailingXMessage (bsp)) {
8431         /* suppress if cds translation ends in '*' or 3' partial */
8432       } else if (trailingX > 1) {
8433         ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_TrailingX, "Sequence ends in %d trailing Xs", (int) trailingX);
8434       } else if (trailingX > 0) {
8435         ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_TrailingX, "Sequence ends in %d trailing X", (int) trailingX);
8436       }
8437       if (isLower) {
8438         ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidResidue, "Sequence contains lower-case characters");
8439       }
8440       if (terminations > 0 || dashes > 0) {
8441         cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
8442         grp = SeqMgrGetGeneXref (cds);
8443         genelbl = NULL;
8444         if (grp == NULL && cds != NULL) {
8445           gene = SeqMgrGetOverlappingGene (cds->location, &genectxt);
8446           if (gene != NULL) {
8447             grp = (GeneRefPtr) gene->data.value.ptrvalue;
8448           }
8449         }
8450         if (grp != NULL && (!SeqMgrGeneIsSuppressed (grp))) {
8451           if (grp->locus != NULL)
8452             genelbl = (grp->locus);
8453           else if (grp->locus_tag != NULL)
8454             genelbl = (grp->locus_tag);
8455           else if (grp->desc != NULL)
8456             genelbl = (grp->desc);
8457           else if (grp->syn != NULL)
8458             genelbl = (CharPtr) (grp->syn->data.ptrvalue);
8459         }
8460         prot = SeqMgrGetBestProteinFeature (bsp, &protctxt);
8461         protlbl = protctxt.label;
8462       }
8463       if (StringHasNoText (genelbl)) {
8464         genelbl = "gene?";
8465       }
8466       if (StringHasNoText (protlbl)) {
8467         protlbl = "prot?";
8468       }
8469       if (dashes > 0) {
8470         if (gapatstart && dashes == 1) {
8471           ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadProteinStart, "gap symbol at start of protein sequence (%s - %s)", genelbl, protlbl);
8472         } else if (gapatstart) {
8473           ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadProteinStart, "gap symbol at start of protein sequence (%s - %s)", genelbl, protlbl);
8474           ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_GapInProtein, "[%d] internal gap symbols in protein sequence (%s - %s)", (dashes - 1), genelbl, protlbl);
8475         } else {
8476           ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_GapInProtein, "[%d] internal gap symbols in protein sequence (%s - %s)", dashes, genelbl, protlbl);
8477         }
8478       }
8479       if (terminations) {
8480         ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_StopInProtein, "[%d] termination symbols in protein sequence (%s - %s)", terminations, genelbl, protlbl);
8481         if (!i)
8482           return;
8483       }
8484       if (i) {
8485         //LCOV_EXCL_START
8486         // patch_seq is never set
8487         if (vsp->patch_seq)
8488           PatchBadSequence (bsp);
8489         return;
8490         //LCOV_EXCL_STOP
8491       }
8492 
8493     }
8494   }
8495 
8496   if (ISA_na (bsp->mol) && bsp->repr == Seq_repr_delta && DeltaLitOnly (bsp)) {
8497     if (! StreamCacheSetup (bsp, NULL, EXPAND_GAPS_TO_DASHES, &sc)) {
8498       //LCOV_EXCL_START
8499       //C toolkit specific
8500       ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqPortFail, "Can't open StreamCache");
8501       return;
8502       //LCOV_EXCL_STOP
8503     }
8504     in_gap = FALSE;
8505     in_N = FALSE;
8506     for (len = 0; len < bsp->length; len++) {
8507       residue = StreamCacheGetResidue (&sc);
8508       if (residue == '-') {
8509         if (in_N) {
8510           ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_InternalNsAdjacentToGap,
8511                     "Ambiguous residue N is adjacent to a gap around position %ld",
8512                     (long) len + 1);
8513         }
8514         in_N = FALSE;
8515         in_gap = TRUE;
8516       } else if (residue == 'N') {
8517         if (in_gap) {
8518           ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_InternalNsAdjacentToGap,
8519                     "Ambiguous residue N is adjacent to a gap around position %ld",
8520                     (long) len + 1);
8521         }
8522         in_gap = FALSE;
8523         in_N = TRUE;
8524       } else {
8525         in_gap = FALSE;
8526         in_N = FALSE;
8527       }
8528     }
8529   }
8530 
8531   if ((bsp->repr == Seq_repr_seg) || (bsp->repr == Seq_repr_ref)) {     /* check segmented sequence */
8532     //LCOV_EXCL_START
8533     //segmented sequences are obsolete
8534     head.choice = SEQLOC_MIX;
8535     head.data.ptrvalue = bsp->seq_ext;
8536     head.next = NULL;
8537     ValidateSeqLoc (vsp, (SeqLocPtr) & head, TRUE, "Segmented Bioseq");
8538     /* check the length */
8539     len = 0;
8540     vnp = NULL;
8541     while ((vnp = SeqLocFindNext (&head, vnp)) != NULL) {
8542       len2 = SeqLocLen (vnp);
8543       if (len2 > 0)
8544         len += len2;
8545     }
8546     if (bsp->length > len) {
8547       ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data too short [%ld] for given length [%ld]", (long) (len), (long) bsp->length);
8548     } else if (bsp->length < len) {
8549       ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data is larger [%ld] than given length [%ld]", (long) (len), (long) bsp->length);
8550     }
8551 
8552     vnp = NULL;
8553     idlist = NULL;
8554     while ((vnp = SeqLocFindNext (&head, vnp)) != NULL) {
8555       sip1 = SeqLocId (vnp);
8556       if (sip1 != NULL) {
8557         SeqIdWrite (sip1, buf1, PRINTID_FASTA_SHORT, 40);
8558         ValNodeCopyStr (&idlist, vnp->choice, buf1);
8559       }
8560     }
8561     if (idlist != NULL) {
8562       idlist = ValNodeSort (idlist, SortVnpByString);
8563       last = (CharPtr) idlist->data.ptrvalue;
8564       lastchoice = (Uint1) idlist->choice;
8565       vnp = idlist->next;
8566       while (vnp != NULL) {
8567         str = (CharPtr) vnp->data.ptrvalue;
8568         if (StringICmp (last, str) == 0) {
8569           if (vnp->choice == lastchoice && lastchoice == SEQLOC_WHOLE) {
8570             ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_DuplicateSegmentReferences, "Segmented sequence has multiple references to %s", str);
8571           } else {
8572             ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_DuplicateSegmentReferences,
8573                       "Segmented sequence has multiple references to %s that are not SEQLOC_WHOLE", str);
8574           }
8575         } else {
8576           last = (CharPtr) vnp->data.ptrvalue;
8577           lastchoice = (Uint1) vnp->choice;
8578         }
8579         vnp = vnp->next;
8580       }
8581       ValNodeFreeData (idlist);
8582     }
8583 
8584     vsp->bsp_partial_val = SeqLocPartialCheck ((SeqLocPtr) (&head));
8585     if (ISA_aa (bsp->mol)) {
8586       got_partial = FALSE;
8587       if (mip != NULL) {
8588           switch (mip->completeness) {
8589           case 2:             /* partial */
8590             got_partial = TRUE;
8591             if (!vsp->bsp_partial_val) {
8592               ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_PartialInconsistent, "Complete segmented sequence with MolInfo partial");
8593             }
8594             break;
8595           case 3:             /* no-left */
8596             if (!(vsp->bsp_partial_val & SLP_START) || (vsp->bsp_partial_val && SLP_STOP))
8597               ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_PartialInconsistent, "No-left inconsistent with segmented SeqLoc");
8598             got_partial = TRUE;
8599             break;
8600           case 4:             /* no-right */
8601             if (!(vsp->bsp_partial_val & SLP_STOP) || (vsp->bsp_partial_val && SLP_START))
8602               ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_PartialInconsistent, "No-right inconsistent with segmented SeqLoc");
8603             got_partial = TRUE;
8604             break;
8605           case 5:             /* no-ends */
8606             if ((!(vsp->bsp_partial_val & SLP_STOP)) || (!(vsp->bsp_partial_val & SLP_START)))
8607               ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_PartialInconsistent, "No-ends inconsistent with segmented SeqLoc");
8608             got_partial = TRUE;
8609             break;
8610           default:
8611             break;
8612           }
8613       }
8614       if (!got_partial && vsp->bsp_partial_val) {
8615         ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_PartialInconsistent, "Partial segmented sequence without MolInfo partial");
8616       }
8617     }
8618     //LCOV_EXCL_STOP
8619   }
8620 
8621   if (bsp->repr == Seq_repr_delta || bsp->repr == Seq_repr_raw) {
8622 
8623     vnp = NULL;
8624     if (vsp->useSeqMgrIndexes) {
8625       sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_genbank, &context);
8626     } else {
8627 //LCOV_EXCL_START
8628       bcp = BioseqContextNew (bsp);
8629       sdp = BioseqContextGetSeqDescr (bcp, Seq_descr_genbank, NULL, NULL);
8630       BioseqContextFree (bcp);
8631 //LCOV_EXCL_STOP
8632     }
8633     if (sdp != NULL) {
8634       gbp = (GBBlockPtr) sdp->data.ptrvalue;
8635       if (gbp != NULL) {
8636         for (vnp = gbp->keywords; vnp != NULL; vnp = vnp->next) {
8637           str = (CharPtr) vnp->data.ptrvalue;
8638           if (StringICmp (str, "HTGS_ACTIVEFIN") == 0) {
8639             isActiveFin = TRUE;
8640           } else if (StringICmp (str, "HTGS_DRAFT") == 0) {
8641             isDraft = TRUE;
8642           } else if (StringICmp (str, "HTGS_FULLTOP") == 0) {
8643             isFullTop = TRUE;
8644           } else if (StringICmp (str, "HTGS_PREFIN") == 0) {
8645             isPreFin = TRUE;
8646           } else if (StringICmp (str, "BARCODE") == 0) {
8647             has_barcode_keyword = TRUE;
8648             if (/* ! vsp->is_barcode_sep */ mip == NULL || mip->tech != MI_TECH_barcode) {
8649               olditemid = gcp->itemID;
8650               olditemtype = gcp->thistype;
8651               if (sdp->extended != 0) {
8652                 ovp = (ObjValNodePtr) sdp;
8653                 gcp->itemID = ovp->idx.itemID;
8654                 gcp->thistype = OBJ_SEQDESC;
8655               }
8656               ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadKeyword, "BARCODE keyword without Molinfo.tech barcode");
8657               gcp->itemID = olditemid;
8658               gcp->thistype = olditemtype;
8659             }
8660           }
8661         }
8662       }
8663     }
8664     if (mip != NULL && mip->tech == MI_TECH_barcode && !has_barcode_keyword) {
8665       ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_BadKeyword, "Molinfo.tech barcode without BARCODE keyword");
8666     }
8667   }
8668 
8669 
8670   if (bsp->repr == Seq_repr_delta) {
8671     len = 0;
8672     count = 0;
8673     num_gap_known_or_spec = 0;
8674     num_gap_unknown_unspec = 0;
8675     doNotSkip = TRUE;
8676     for (vnp = (ValNodePtr) (bsp->seq_ext); vnp != NULL; vnp = vnp->next) {
8677       if (vnp->choice == 1) {
8678         count++;
8679       }
8680     }
8681     if (count > 10000) {
8682       smp = SeqMgrGet ();
8683       if (smp != NULL) {
8684         if (smp->seq_len_lookup_func == NULL) {
8685           doNotSkip = FALSE;
8686         }
8687       }
8688     }
8689     if (doNotSkip) {
8690       for (vnp = (ValNodePtr) (bsp->seq_ext), segnum = 1; vnp != NULL; vnp = vnp->next, segnum++) {
8691           if (vnp->data.ptrvalue == NULL) {
8692               //LCOV_EXCL_START
8693               //not possible reading from file
8694               ValidErr(vsp, SEV_ERROR, ERR_SEQ_INST_SeqDataLenWrong, "NULL pointer in delta seq_ext valnode");
8695               //LCOV_EXCL_STOP
8696           }  else {
8697           switch (vnp->choice) {
8698           case 1:                /* SeqLocPtr */
8699             slp = (SeqLocPtr) (vnp->data.ptrvalue);
8700             if (slp != NULL && slp->choice == SEQLOC_WHOLE) {
8701               ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_WholeComponent, "Delta seq component should not be of type whole");
8702             }
8703             sip3 = SeqLocId (slp);
8704             if (sip3 != NULL) {
8705               if (sip3->choice == SEQID_GI && sip3->data.intvalue <= 0) {
8706                 ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_DeltaComponentIsGi0, "Delta component is gi|0");
8707               }
8708               for (sip1 = bsp->id; sip1 != NULL; sip1 = sip1->next) {
8709                 if (SeqIdComp (sip1, sip3) == SIC_YES) {
8710                   ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SelfReferentialSequence,
8711                             "Self-referential delta sequence");
8712                 }
8713               }
8714             }
8715             len2 = SeqLocLen (slp);
8716             if (len2 < 0)
8717               ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_SeqDataLenWrong, "-1 length on seq-loc of delta seq_ext");
8718             else
8719               len += len2;
8720             sip3 = SeqLocId (slp);
8721             if (sip3 != NULL && slp != NULL && slp->choice == SEQLOC_INT) {
8722               sintp = (SeqIntPtr) slp->data.ptrvalue;
8723               if (sintp != NULL && (sip3->choice == SEQID_GI ||
8724                                    sip3->choice == SEQID_GENBANK ||
8725                                    sip3->choice == SEQID_EMBL ||
8726                                    sip3->choice == SEQID_DDBJ ||
8727                                    sip3->choice == SEQID_TPG ||
8728                                    sip3->choice == SEQID_TPE ||
8729                                    sip3->choice == SEQID_TPD ||
8730                                    sip3->choice == SEQID_OTHER)) {
8731                 vn.choice = SEQLOC_WHOLE;
8732                 vn.data.ptrvalue = sip3;
8733                 vn.next = NULL;
8734                 len3 = SeqLocLen (&vn);
8735                 /* -1 signifies failure to lookup or not connected to lookup function */
8736                 if (len3 != -1) {
8737                   if (sintp->to >= len3) {
8738                     SeqIdWrite (sip3, buf1, PRINTID_FASTA_SHORT, 40);
8739                     ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqDataLenWrong,
8740                               "Seq-loc extent (%ld) greater than length of %s (%ld)",
8741                               (long) (sintp->to + 1), buf1, (long) len3);
8742                   }
8743                 }
8744               }
8745             }
8746             if (len2 <= 10 && DeltaLitOnly (bsp)) {
8747                 //LCOV_EXCL_START
8748                 //by definition this code cannot be reached, because it is examining a segment
8749                 //that is a Seq-loc, and DeltaLitOnly only returns true if there are no Seq-loc segments
8750                 // - NOW IT CAN BECAUSE DeltaLitOnly ALLOWS null NULL
8751               slp = (SeqLocPtr) (vnp->data.ptrvalue);
8752               if (slp != NULL && slp->choice != SEQLOC_NULL) {
8753                 str = SeqLocPrint ((SeqLocPtr) (vnp->data.ptrvalue));
8754                 if (str == NULL) {
8755                   str = StringSave ("?");
8756                 }
8757                 ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SeqLocLength, "Short length (%ld) on seq-loc (%s) of delta seq_ext", (long) len2, str);
8758                 MemFree (str);
8759               }
8760               //LCOV_EXCL_STOP
8761             }
8762             break;
8763           case 2:                /* SeqLitPtr */
8764             slitp = (SeqLitPtr) (vnp->data.ptrvalue);
8765             if (slitp->seq_data != NULL && slitp->seq_data_type != Seq_code_gap) {
8766               if (slitp->length == 0) {
8767                 ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_SeqLitDataLength0, "Seq-lit of length 0 in delta chain");
8768               }
8769               sctp = SeqCodeTableFind (slitp->seq_data_type);
8770               if (sctp == NULL) {
8771                 //LCOV_EXCL_START
8772                 //not readable by C Toolkit
8773                 ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidAlphabet, "Using illegal sequence alphabet [%d] in SeqLitPtr", (int) slitp->seq_data_type);
8774                 len += slitp->length;
8775                 break;
8776                 //LCOV_EXCL_STOP
8777               }
8778 
8779               start_at = (Int2) (sctp->start_at);
8780               num = (Int2) (sctp->num);
8781 
8782               switch (slitp->seq_data_type) {
8783               case Seq_code_iupacaa:
8784               case Seq_code_iupacna:
8785               case Seq_code_ncbieaa:
8786               case Seq_code_ncbistdaa:
8787                 BSSeek ((ByteStorePtr) slitp->seq_data, 0, SEEK_SET);
8788                 for (len2 = 1; len2 <= (slitp->length); len2++) {
8789                   is_invalid = FALSE;
8790                   residue = BSGetByte ((ByteStorePtr) slitp->seq_data);
8791                   i = residue - start_at;
8792                   if ((i < 0) || (i >= num))
8793                     is_invalid = TRUE;
8794                   else if (*(sctp->names[i]) == '\0')
8795                     is_invalid = TRUE;
8796                   if (is_invalid) {
8797                     if (slitp->seq_data_type == Seq_code_ncbistdaa)
8798                       ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidResidue, "Invalid residue [%d] at position [%ld]", (int) residue, (long) (len + len2));
8799                     else
8800                       ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_InvalidResidue, "Invalid residue [%c] at position [%ld]", (char) residue, (long) (len + len2));
8801                   }
8802                 }
8803                 break;
8804               default:
8805                 break;
8806               }
8807               if (mip != NULL) {
8808                 if (mip->tech == MI_TECH_htgs_1 || mip->tech == MI_TECH_htgs_2) {
8809                   runsofn = CountAdjacentNsInInterval (gcp, bsp, len, len + slitp->length - 1);
8810                   if (runsofn > 80) {
8811                     ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqLit, "Run of %ld Ns in delta component %d that starts at base %ld", (long) runsofn, segnum, (long) (len + 1));
8812                   }
8813                 } else if (mip->tech == MI_TECH_wgs) {
8814                   runsofn = CountAdjacentNsInInterval (gcp, bsp, len, len + slitp->length - 1);
8815                   if (runsofn >= 20) {
8816                     ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqLit, "Run of %ld Ns in delta component %d that starts at base %ld", (long) runsofn, segnum, (long) (len + 1));
8817                   }
8818                 } else if (mip->tech == MI_TECH_composite_wgs_htgs) {
8819                   runsofn = CountAdjacentNsInInterval (gcp, bsp, len, len + slitp->length - 1);
8820                   if (runsofn > 80) {
8821                     ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqLit, "Run of %ld Ns in delta component %d that starts at base %ld", (long) runsofn, segnum, (long) (len + 1));
8822                   }
8823                 } else {
8824                   runsofn = CountAdjacentNsInInterval (gcp, bsp, len, len + slitp->length - 1);
8825                   if (runsofn >= 100) {
8826                     ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqLit, "Run of %ld Ns in delta component %d that starts at base %ld", (long) runsofn, segnum, (long) (len + 1));
8827                   }
8828                 }
8829               }
8830             } else if (slitp->seq_data != NULL && slitp->seq_data_type == Seq_code_gap) {
8831               sgp = (SeqGapPtr) slitp->seq_data;
8832               is_unspec = FALSE;
8833               if (sgp->linkage_evidence != NULL) {
8834                 MemSet ((Pointer) &linkevarray, 0, sizeof (linkevarray));
8835                 linkcount = 0;
8836                 for (linkvnp = sgp->linkage_evidence; linkvnp != NULL; linkvnp = linkvnp->next) {
8837                   lep = (LinkageEvidencePtr) linkvnp->data.ptrvalue;
8838                   if (lep == NULL) continue;
8839                   linktype = (int) lep->type;
8840                   if (linktype == 8) {
8841                     is_unspec = TRUE;
8842                   }
8843                   linkcount++;
8844                   if (linktype == 255) {
8845                     (linkevarray [10])++;
8846                   } else if (linktype < 0 || linktype > 9) {
8847                     (linkevarray [11])++;
8848                   } else {
8849                     (linkevarray [linktype])++;
8850                   }
8851                 }
8852                 if (sgp->linkage != 1) {
8853                   ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqGapProblem, "Seq-gap with linkage evidence must have linkage field set to linked");
8854                 }
8855                 if (sgp->type != 1 && sgp->type != 2 && sgp->type != 7 && sgp->type != 9) {
8856                   sev = SEV_REJECT;
8857                   if (sgp->type == 0 && is_unspec) {
8858                     /* suppress for legacy records */
8859                   } else {
8860                     ValidErr (vsp, sev, ERR_SEQ_INST_SeqGapProblem, "Seq-gap of type %d should not have linkage evidence", (int) sgp->type);
8861                   }
8862                 }
8863                 if (linkevarray [8] > 0 && linkcount > linkevarray [8]) {
8864                   ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_SeqGapProblem, "Seq-gap type has unspecified and additional linkage evidence");
8865                 }
8866                 for (i = 0; i < 12; i++) {
8867                   if (linkevarray [i] > 1) {
8868                     ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_SeqGapProblem, "Linkage evidence '%s' appears %d times", linkEvStrings [i], (long) linkevarray [i]);
8869                   }
8870                 }
8871               } else {
8872                 if (sgp->type == 9) {
8873                   ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqGapProblem, "Seq-gap type == scaffold is missing required linkage evidence");
8874                 }
8875                 if (sgp->type == 7 && sgp->linkage == 1) {
8876                   ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqGapProblem, "Seq-gap type == repeat and linkage == linked is missing required linkage evidence");
8877                 }
8878               }
8879               if (sgp->type == 0 && is_unspec) {
8880                 num_gap_unknown_unspec++;
8881               } else {
8882                 num_gap_known_or_spec++;
8883               }
8884             } else if (slitp->length == 0) {
8885               if (isSwissProt) {
8886                 sev = SEV_WARNING;
8887               } else {
8888                 sev = SEV_ERROR;
8889               }
8890               ifp = slitp->fuzz;
8891               if (ifp == NULL || ifp->choice != 4 || ifp->a != 0) {
8892                 ValidErr (vsp, sev, ERR_SEQ_INST_SeqLitGapLength0, "Gap of length 0 in delta chain");
8893               } else {
8894                 ValidErr (vsp, sev, ERR_SEQ_INST_SeqLitGapLength0, "Gap of length 0 with unknown fuzz in delta chain");
8895               }
8896             } else if (slitp->seq_data == NULL || slitp->seq_data_type == Seq_code_gap) {
8897               if (slitp->length != 100 && slitp->fuzz != NULL) {
8898                 ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SeqLitGapFuzzNot100, "Gap of unknown length should have length 100");
8899               }
8900             }
8901             len += slitp->length;
8902             break;
8903           default:
8904             ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_ExtNotAllowed, "Illegal choice [%d] in delta chain", (int) (vnp->choice));
8905             break;
8906           }
8907         }
8908       }
8909       if (bsp->length > len) {
8910         ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data too short [%ld] for given length [%ld]", (long) (len), (long) bsp->length);
8911       } else if (bsp->length < len) {
8912         ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data is larger [%ld] than given length [%ld]", (long) (len), (long) bsp->length);
8913       }
8914 
8915       if (num_gap_unknown_unspec > 0 && num_gap_known_or_spec == 0) {
8916         if (num_gap_unknown_unspec > 1) {
8917           ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SeqGapProblem, "All %ld Seq-gaps have unknown type and unspecified linkage", (long) num_gap_unknown_unspec);
8918         } else {
8919           ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SeqGapProblem, "Single Seq-gap has unknown type and unspecified linkage");
8920         }
8921       }
8922 
8923     } else {
8924 
8925       for (vnp = (ValNodePtr) (bsp->seq_ext); vnp != NULL; vnp = vnp->next) {
8926         if (vnp->data.ptrvalue == NULL) {
8927           ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_SeqDataLenWrong, "NULL pointer in delta seq_ext valnode");
8928           continue;
8929         }
8930         switch (vnp->choice) {
8931           case 1 :
8932             slp = (SeqLocPtr) vnp->data.ptrvalue;
8933             if (slp->choice == SEQLOC_WHOLE) {
8934               ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_WholeComponent, "Delta seq component should not be of type whole");
8935               break;
8936             }
8937             if (slp->choice == SEQLOC_INT) {
8938               sintp = (SeqIntPtr) slp->data.ptrvalue;
8939               if (sintp != NULL) {
8940                 len2 = sintp->to - sintp->from + 1;
8941                 if (len2 < 0) {
8942                   ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_SeqDataLenWrong, "-1 length on seq-loc of delta seq_ext");
8943                 } else {
8944                   len += len2;
8945                 }
8946               }
8947             }
8948             break;
8949           case 2 :
8950             slitp = (SeqLitPtr) vnp->data.ptrvalue;
8951             len += slitp->length;
8952             break;
8953           default :
8954             ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_ExtNotAllowed, "Illegal choice [%d] in delta chain", (int) (vnp->choice));
8955             break;
8956         }
8957       }
8958       if (bsp->length > len) {
8959         ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data too short [%ld] for given length [%ld]", (long) (len), (long) bsp->length);
8960       } else if (bsp->length < len) {
8961         ValidErr (vsp, SEV_REJECT, ERR_SEQ_INST_SeqDataLenWrong, "Bioseq.seq_data is larger [%ld] than given length [%ld]", (long) (len), (long) bsp->length);
8962       }
8963     }
8964     if (mip != NULL) {
8965       is_gps = FALSE;
8966       sep = vsp->sep;
8967       if (sep != NULL && IS_Bioseq_set (sep)) {
8968         bssp = (BioseqSetPtr) sep->data.ptrvalue;
8969         if (bssp != NULL && bssp->_class == BioseqseqSet_class_gen_prod_set) {
8970           is_gps = TRUE;
8971         }
8972       }
8973       if ((!isNTorNC) && (! is_gps) && ISA_na (bsp->mol) && mip->tech != MI_TECH_htgs_0 && mip->tech != MI_TECH_htgs_1 &&
8974           mip->tech != MI_TECH_htgs_2 && mip->tech != MI_TECH_htgs_3 && mip->tech != MI_TECH_wgs &&
8975           mip->tech != MI_TECH_composite_wgs_htgs && mip->tech != MI_TECH_unknown && mip->tech != MI_TECH_standard
8976           && mip->tech != MI_TECH_htc && mip->tech != MI_TECH_barcode && mip->tech != MI_TECH_tsa) {
8977         ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadDeltaSeq, "Delta seq technique should not be [%d]", (int) (mip->tech));
8978       }
8979     }
8980   } else if (bsp->repr == Seq_repr_raw) {
8981     ron.gcp = gcp;
8982     ron.vsp = vsp;
8983     ron.ncount = 0;
8984     ron.maxrun = 0;
8985     ron.seqpos = 0;
8986     ron.gapcount = 0;
8987     ron.showAll = TRUE;
8988     ron.inNrun = FALSE;
8989     ron.isWGS = FALSE;
8990     if (mip == NULL) {
8991       vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &context);
8992       if (vnp != NULL) {
8993         mip = (MolInfoPtr) vnp->data.ptrvalue;
8994       }
8995     }
8996     if (mip != NULL && mip->tech == MI_TECH_wgs) {
8997       ron.isWGS = TRUE;
8998     }
8999 
9000     if (ISA_na (bsp->mol)) {
9001       SeqPortStream (bsp, EXPAND_GAPS_TO_DASHES, (Pointer) &ron, CountAdjacentProc);
9002     }
9003 
9004     /*
9005     if (ron.inNrun && ron.showAll && ron.ncount >= 100) {
9006       ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqRaw, "Run of %ld Ns in raw sequence starting at base %ld",
9007                 (long) ron.ncount, (long) (ron.seqpos - ron.ncount + 1));
9008     }
9009     */
9010 
9011     if (ron.gapcount > 0 && ISA_na (bsp->mol)) {
9012       ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalGapsInSeqRaw, "Raw nucleotide should not contain gap characters");
9013     }
9014 
9015     /*
9016     if (ron.maxrun >= 100) {
9017       ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_InternalNsInSeqRaw, "Run of %ld Ns in raw sequence", (long) ron.maxrun);
9018     }
9019     */
9020   }
9021 
9022   if (bsp->repr == Seq_repr_delta) {
9023     CheckDeltaForReuse (vsp, gcp, bsp);
9024   }
9025 
9026   sev = SEV_ERROR;
9027   if (mip != NULL) {
9028     if (mip->tech != MI_TECH_htgs_0 && mip->tech != MI_TECH_htgs_1 &&
9029         mip->tech != MI_TECH_htgs_2 && mip->tech != MI_TECH_htgs_3) {
9030       sev = SEV_WARNING;
9031     }
9032   }
9033 
9034   if (bsp->repr == Seq_repr_delta && bsp->seq_ext_type == 4 && bsp->seq_ext != NULL) {
9035     vnp = (DeltaSeqPtr) bsp->seq_ext;
9036     if (vnp != NULL && vnp->choice == 2) {
9037       slitp = (SeqLitPtr) vnp->data.ptrvalue;
9038       if (slitp != NULL && (slitp->seq_data == NULL || slitp->seq_data_type == Seq_code_gap) && bsp->topology != TOPOLOGY_CIRCULAR) {
9039         ValidErr (vsp, sev, ERR_SEQ_INST_BadDeltaSeq, "First delta seq component is a gap");
9040       }
9041     }
9042     last_is_gap = FALSE;
9043     num_adjacent_gaps = 0;
9044     num_gaps = 0;
9045     non_interspersed_gaps = FALSE;
9046     while (vnp->next != NULL) {
9047       vnp = vnp->next;
9048       if (vnp != NULL && vnp->choice == 2) {
9049         slitp = (SeqLitPtr) vnp->data.ptrvalue;
9050         if (slitp != NULL && (slitp->seq_data == NULL || slitp->seq_data_type == Seq_code_gap)) {
9051           if (last_is_gap) {
9052             num_adjacent_gaps++;
9053           }
9054           last_is_gap = TRUE;
9055           num_gaps++;
9056         } else {
9057           if (! last_is_gap) {
9058             non_interspersed_gaps = TRUE;
9059           }
9060           last_is_gap = FALSE;
9061         }
9062       } else {
9063         if (! last_is_gap) {
9064           non_interspersed_gaps = TRUE;
9065         }
9066         last_is_gap = FALSE;
9067       }
9068     }
9069     if (non_interspersed_gaps && (! hasGi) && mip != NULL &&
9070         (mip->tech == MI_TECH_htgs_0 || mip->tech == MI_TECH_htgs_1 || mip->tech == MI_TECH_htgs_2 /* || mip->tech == MI_TECH_htgs_3 */)) {
9071       if (hasRefGeneTracking) {
9072         ValidErr (vsp, SEV_INFO, ERR_SEQ_INST_MissingGaps, "HTGS delta seq should have gaps between all sequence runs");
9073       } else {
9074         ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_MissingGaps, "HTGS delta seq should have gaps between all sequence runs");
9075       }
9076     }
9077     if (vnp != NULL && vnp->choice == 2) {
9078       slitp = (SeqLitPtr) vnp->data.ptrvalue;
9079       if (slitp != NULL && (slitp->seq_data == NULL || slitp->seq_data_type == Seq_code_gap) && bsp->topology != TOPOLOGY_CIRCULAR) {
9080         ValidErr (vsp, sev, ERR_SEQ_INST_BadDeltaSeq, "Last delta seq component is a gap");
9081       }
9082     }
9083     if (num_gaps == 0 && mip != NULL) {
9084       if (/* mip->tech == MI_TECH_htgs_1 || */ mip->tech == MI_TECH_htgs_2) {
9085         if (VisitGraphsInSep (sep, NULL, NULL) == 0) {
9086           if (! isActiveFin) {
9087             ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_BadHTGSeq, "HTGS 2 delta seq has no gaps and no graphs");
9088           }
9089         }
9090       }
9091     }
9092     sev = SEV_ERROR;
9093     if (isRefSeq) {
9094       sev = SEV_WARNING;
9095     }
9096     if (num_adjacent_gaps > 1) {
9097       ValidErr (vsp, sev, ERR_SEQ_INST_BadDeltaSeq, "There are %d adjacent gaps in delta seq", (int) num_adjacent_gaps);
9098     } else if (num_adjacent_gaps > 0) {
9099       ValidErr (vsp, sev, ERR_SEQ_INST_BadDeltaSeq, "There is %d adjacent gap in delta seq", (int) num_adjacent_gaps);
9100     }
9101   }
9102 
9103   if (bsp->repr == Seq_repr_raw) {
9104     if (mip != NULL) {
9105       if (/* mip->tech == MI_TECH_htgs_1 || */ mip->tech == MI_TECH_htgs_2) {
9106         if (VisitGraphsInSep (sep, NULL, NULL) == 0) {
9107           if (! isActiveFin) {
9108             ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_BadHTGSeq, "HTGS 2 raw seq has no gaps and no graphs");
9109           }
9110         }
9111       }
9112     }
9113   }
9114 
9115   if (mip != NULL && mip->tech == MI_TECH_htgs_3) {
9116     if (isDraft) {
9117       ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadHTGSeq, "HTGS 3 sequence should not have HTGS_DRAFT keyword");
9118     }
9119     if (isPreFin) {
9120       ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadHTGSeq, "HTGS 3 sequence should not have HTGS_PREFIN keyword");
9121     }
9122     if (isActiveFin) {
9123       ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadHTGSeq, "HTGS 3 sequence should not have HTGS_ACTIVEFIN keyword");
9124     }
9125     if (isFullTop) {
9126       ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_BadHTGSeq, "HTGS 3 sequence should not have HTGS_FULLTOP keyword");
9127     }
9128   }
9129 
9130   if (is_master && bsp->repr == Seq_repr_virtual && mip != NULL) {
9131     if (mip->tech == MI_TECH_wgs) {
9132       wgs_master = TRUE;
9133     }
9134     if (mip->tech == MI_TECH_tsa) {
9135       tsa_master = TRUE;
9136     }
9137   }
9138 
9139   if (wgs_master && ! is_genome_assembly) {
9140     sev = SEV_ERROR;
9141     if (isEMBL || isDDBJ) {
9142       sev = SEV_WARNING;
9143     }
9144     ValidErr (vsp, sev, ERR_SEQ_INST_WGSMasterLacksStrucComm, "WGS master without Genome Assembly Data user object");
9145   }
9146 
9147   if (tsa_master && ! is_assembly) {
9148     sev = SEV_ERROR;
9149     if (isEMBL || isDDBJ) {
9150       sev = SEV_WARNING;
9151     }
9152     ValidErr (vsp, sev, ERR_SEQ_INST_TSAMasterLacksStrucComm, "TSA master without Assembly Data user object");
9153   }
9154 
9155   if (wgs_master) {
9156     /* ignore ShortSeq for WGS master - length is actually the number of contigs in the project */
9157   } else if (ISA_aa (bsp->mol)) {
9158     if ((bsp->length <= 3) && (bsp->length >= 0) && (!isPDB)) {
9159       if (mip == NULL || mip->completeness < 2 || mip->completeness > 5) {
9160         ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_ShortSeq, "Sequence only %ld residues", (long) (bsp->length));
9161       }
9162     }
9163 
9164   } else {
9165     if ((bsp->length <= 10) && (bsp->length >= 0) && (!isPDB)) {
9166       ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_ShortSeq, "Sequence only %ld residues", (long) (bsp->length));
9167     }
9168   }
9169 
9170 #if 0
9171   if (bsp->length > 350000 && (! isNTorNC)) {
9172     Boolean         isGenBankEMBLorDDBJ;
9173     Boolean         litHasData;
9174     if (bsp->repr == Seq_repr_delta) {
9175       isGenBankEMBLorDDBJ = FALSE;
9176       /* suppress this for data from genome annotation project */
9177       VisitBioseqsInSep (vsp->sep, (Pointer) &isGenBankEMBLorDDBJ, LookForGEDseqID);
9178       if (mip != NULL && isGenBankEMBLorDDBJ) {
9179         if (mip->tech == MI_TECH_htgs_0 || mip->tech == MI_TECH_htgs_1 || mip->tech == MI_TECH_htgs_2) {
9180           ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_LongHtgsSequence, "Phase 0, 1 or 2 HTGS sequence exceeds 350kbp limit");
9181         } else if (mip->tech == MI_TECH_htgs_3) {
9182           ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SequenceExceeds350kbp, "Phase 3 HTGS sequence exceeds 350kbp limit");
9183         } else if (mip->tech == MI_TECH_wgs) {
9184           /*
9185           ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SequenceExceeds350kbp, "WGS sequence exceeds 350kbp limit");
9186           */
9187         } else {
9188           len = 0;
9189           litHasData = FALSE;
9190           for (vnp = (ValNodePtr) (bsp->seq_ext); vnp != NULL; vnp = vnp->next) {
9191             if (vnp->choice == 2) {
9192               slitp = (SeqLitPtr) (vnp->data.ptrvalue);
9193               if (slitp != NULL) {
9194                 if (slitp->seq_data != NULL) {
9195                   litHasData = TRUE;
9196                 }
9197                 len += slitp->length;
9198               }
9199             }
9200           }
9201           if (len > 500000 && litHasData) {
9202             ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_LongLiteralSequence, "Length of sequence literals exceeds 500kbp limit");
9203           }
9204         }
9205       }
9206     } else if (bsp->repr == Seq_repr_raw) {
9207       vnp = NULL;
9208       if (vsp->useSeqMgrIndexes) {
9209         vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &context);
9210       } else {
9211         bcp = BioseqContextNew (bsp);
9212         vnp = BioseqContextGetSeqDescr (bcp, Seq_descr_molinfo, NULL, NULL);
9213         BioseqContextFree (bcp);
9214       }
9215       if (vnp != NULL) {
9216         mip = (MolInfoPtr) vnp->data.ptrvalue;
9217       }
9218       if (mip != NULL) {
9219         if (mip->tech == MI_TECH_htgs_0 || mip->tech == MI_TECH_htgs_1 || mip->tech == MI_TECH_htgs_2) {
9220           ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_LongHtgsSequence, "Phase 0, 1 or 2 HTGS sequence exceeds 350kbp limit");
9221         } else if (mip->tech == MI_TECH_htgs_3) {
9222           ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SequenceExceeds350kbp, "Phase 3 HTGS sequence exceeds 350kbp limit");
9223         } else if (mip->tech == MI_TECH_wgs) {
9224           /*
9225           ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SequenceExceeds350kbp, "WGS sequence exceeds 350kbp limit");
9226           */
9227         } else {
9228           ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SequenceExceeds350kbp, "Length of sequence exceeds 350kbp limit");
9229         }
9230       } else {
9231         ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SequenceExceeds350kbp, "Length of sequence exceeds 350kbp limit");
9232       }
9233     } else {
9234       /* Could be a segset header bioseq that is > 350kbp */
9235       /* No-op for now? Or generate a warning? */
9236     }
9237   }
9238 #endif
9239 
9240   if (bsp->repr == Seq_repr_seg) {
9241 //LCOV_EXCL_START
9242 // Only for SegSets
9243     CheckSegBspAgainstParts (vsp, gcp, bsp);
9244 //LCOV_EXCL_STOP
9245   }
9246 
9247   if (ISA_na (bsp->mol) || ISA_aa (bsp->mol)) {
9248     vnp = BioseqGetSeqDescr (bsp, Seq_descr_title, NULL);
9249     if (vnp != NULL) {
9250       title = (CharPtr) vnp->data.ptrvalue;
9251       if (StringDoesHaveText (title)) {
9252         if (HasUnparsedBrackets (title)) {
9253           reportFastaBracket = TRUE;
9254           for (sip = bsp->id; sip != NULL; sip = sip->next) {
9255             if (sip->choice != SEQID_GENERAL) continue;
9256             dbt = (DbtagPtr) sip->data.ptrvalue;
9257             if (dbt == NULL) continue;
9258             if (StringICmp (dbt->db, "TMSMART") == 0) {
9259               reportFastaBracket = FALSE;
9260             }
9261             if (StringICmp (dbt->db, "BankIt") == 0) {
9262               reportFastaBracket = FALSE;
9263             }
9264           }
9265           if (reportFastaBracket) {
9266             sdp = NULL;
9267             if (vsp->useSeqMgrIndexes) {
9268               sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
9269             } else {
9270 //LCOV_EXCL_START
9271               bcp = BioseqContextNew (bsp);
9272               sdp = BioseqContextGetSeqDescr (bcp, Seq_descr_source, NULL, NULL);
9273               BioseqContextFree (bcp);
9274 //LCOV_EXCL_STOP
9275             }
9276             if (sdp != NULL) {
9277               biop = (BioSourcePtr) sdp->data.ptrvalue;
9278               if (biop != NULL) {
9279                 orp = biop->org;
9280                 if (orp != NULL) {
9281                   if (StringDoesHaveText (orp->taxname)) {
9282                     if (StringChr (orp->taxname, '=') != NULL) {
9283                       if (StringISearch (title, orp->taxname) != NULL) {
9284                         reportFastaBracket = FALSE;
9285                       }
9286                     }
9287                   }
9288                 }
9289               }
9290             }
9291           }
9292           if (reportFastaBracket) {
9293             olditemid = gcp->itemID;
9294             olditemtype = gcp->thistype;
9295             if (vnp->extended != 0) {
9296               ovp = (ObjValNodePtr) vnp;
9297               gcp->itemID = ovp->idx.itemID;
9298               gcp->thistype = OBJ_SEQDESC;
9299             }
9300             ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_FastaBracketTitle, "Title may have unparsed [...=...] construct");
9301             gcp->itemID = olditemid;
9302             gcp->thistype = olditemtype;
9303           }
9304         }
9305 
9306         if (StringISearch (title, "complete genome") != NULL && SequenceHasGaps (bsp)) {
9307           //LCOV_EXCL_START
9308           //bug in C Toolkit - only works if gaps are instantiated
9309           /* warning if title contains complete genome but sequence contains gap features */
9310           olditemid = gcp->itemID;
9311           olditemtype = gcp->thistype;
9312           gcp->itemID = bsp->idx.itemID;
9313           gcp->thistype = OBJ_BIOSEQ;
9314           ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_CompleteTitleProblem, "Title contains 'complete genome' but sequence has gaps");
9315           gcp->itemID = olditemid;
9316           gcp->thistype = olditemtype;
9317           //LCOV_EXCL_STOP
9318         }
9319       }
9320     } else {
9321       if (ISA_na (bsp->mol) && vsp->other_sets_in_sep && (vsp->is_insd_in_sep || vsp->is_refseq_in_sep) && vsp->indexerVersion) {
9322         ValidErr (vsp, SEV_WARNING, ERR_SEQ_PKG_ComponentMissingTitle,
9323                   "Nucleotide component of pop/phy/mut/eco/wgs set is missing its title");
9324       }
9325     }
9326   }
9327 
9328   if (ISA_aa (bsp->mol) && vsp->useSeqMgrIndexes) {
9329     vnp = BioseqGetSeqDescr (bsp, Seq_descr_title, NULL);
9330     if (vnp != NULL) {
9331       if (bsp->idx.parenttype == OBJ_BIOSEQSET) {
9332         bssp = (BioseqSetPtr) bsp->idx.parentptr;
9333         while (bssp != NULL && bssp->_class != BioseqseqSet_class_nuc_prot) {
9334           if (bssp->idx.parenttype == OBJ_BIOSEQSET) {
9335             bssp = (BioseqSetPtr) bssp->idx.parentptr;
9336           } else {
9337             bssp = NULL;
9338           }
9339         }
9340         if (bssp != NULL && bssp->_class == BioseqseqSet_class_nuc_prot) {
9341           title = (CharPtr) vnp->data.ptrvalue;
9342           tech = 0;
9343           sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
9344           if (sdp != NULL) {
9345             mip = (MolInfoPtr) sdp->data.ptrvalue;
9346             if (mip != NULL) {
9347               tech = mip->tech;
9348             }
9349           }
9350           buf = MemNew (sizeof (Char) * (buflen + 1));
9351           MemSet ((Pointer) (&ii), 0, sizeof (ItemInfo));
9352           /* check generated protein defline with first prp->name - new convention */
9353           if (buf != NULL && NewCreateDefLineBuf (&ii, bsp, buf, buflen, TRUE, FALSE)) {
9354             if (StringICmp (buf, title) != 0) {
9355               /* okay if instantiated title has single trailing period */
9356               len2 = StringLen (buf);
9357               len3 = StringLen (title);
9358               if (len3 == len2 + 1 && title [len3 - 1] == '.' && len3 > 3 && title [len3 - 2] != '.') {
9359                 StringCat (buf, ".");
9360               }
9361             }
9362             if (StringICmp (buf, title) != 0) {
9363               /* also check generated protein defline with all prp->names - old convention */
9364               if (NewCreateDefLineBuf (&ii, bsp, buf, buflen, TRUE, TRUE)) {
9365                 bufplus = buf;
9366                 if (StringNCmp (bufplus, "PREDICTED: ", 11) == 0) {
9367                   bufplus += 11;
9368                 } else if (StringNCmp (bufplus, "UNVERIFIED: ", 12) == 0) {
9369                   bufplus += 12;
9370                 } else if (StringNCmp (bufplus, "PUTATIVE PSEUDOGENE: ", 12) == 0) {
9371                   bufplus += 21;
9372                 }
9373                 if (StringNCmp (title, "PREDICTED: ", 11) == 0) {
9374                   title += 11;
9375                 } else if (StringNCmp (title, "UNVERIFIED: ", 12) == 0) {
9376                   title += 12;
9377                 } else if (StringNCmp (title, "PUTATIVE PSEUDOGENE: ", 21) == 0) {
9378                   title += 21;
9379                 }
9380                 if (StringICmp (bufplus, title) != 0) {
9381                   olditemid = gcp->itemID;
9382                   olditemtype = gcp->thistype;
9383                   if (vnp->extended != 0) {
9384                     ovp = (ObjValNodePtr) vnp;
9385                     gcp->itemID = ovp->idx.itemID;
9386                     gcp->thistype = OBJ_SEQDESC;
9387                   }
9388                   ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_InconsistentProteinTitle,
9389                             "Instantiated protein title does not match automatically generated title");
9390                   gcp->itemID = olditemid;
9391                   gcp->thistype = olditemtype;
9392                 }
9393               }
9394             }
9395           }
9396           MemFree (buf);
9397         }
9398       }
9399     }
9400   }
9401 
9402   if (ISA_aa (bsp->mol) && vsp->useSeqMgrIndexes) {
9403     in_nps = FALSE;
9404     if (bsp->idx.parenttype == OBJ_BIOSEQSET) {
9405       bssp = (BioseqSetPtr) bsp->idx.parentptr;
9406       while (bssp != NULL && bssp->_class != BioseqseqSet_class_nuc_prot) {
9407         if (bssp->idx.parenttype == OBJ_BIOSEQSET) {
9408           bssp = (BioseqSetPtr) bssp->idx.parentptr;
9409         } else {
9410           bssp = NULL;
9411         }
9412       }
9413       if (bssp != NULL && bssp->_class == BioseqseqSet_class_nuc_prot) {
9414         in_nps = TRUE;
9415       }
9416     }
9417     if (! in_nps) {
9418       if (isGB || isEMBL || isDDBJ || isRefSeq) {
9419         if (! isGIBBMT && ! isGIBBSQ && ! isPatent && ! isWP && ! isYP) {
9420           olditemid = gcp->itemID;
9421           olditemtype = gcp->thistype;
9422           gcp->itemID = bsp->idx.itemID;
9423           gcp->thistype = OBJ_BIOSEQ;
9424           ValidErr (vsp, SEV_ERROR, ERR_SEQ_PKG_OrphanedProtein, "Orphaned stand-alone protein");
9425           gcp->itemID = olditemid;
9426           gcp->thistype = olditemtype;
9427         }
9428       }
9429     }
9430   }
9431 
9432   vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &context);
9433   if (vnp != NULL) {
9434     mip = (MolInfoPtr) vnp->data.ptrvalue;
9435     if (mip != NULL) {
9436       if (mip->completeness != 1 && isGB) {
9437         buf = MemNew (sizeof (Char) * (4097));
9438         if (buf != NULL && NewCreateDefLineBuf (NULL, bsp, buf, 4096, FALSE, FALSE)) {
9439           if (StringStr (buf, "complete genome") != NULL) {
9440             olditemid = gcp->itemID;
9441             olditemtype = gcp->thistype;
9442             if (vnp->extended != 0) {
9443               ovp = (ObjValNodePtr) vnp;
9444               gcp->itemID = ovp->idx.itemID;
9445               gcp->thistype = OBJ_SEQDESC;
9446             }
9447             ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_CompleteTitleProblem, "Complete genome in title without complete flag set");
9448             gcp->itemID = olditemid;
9449             gcp->thistype = olditemtype;
9450           }
9451         }
9452         MemFree (buf);
9453       }
9454       if (mip->completeness != 1 && bsp->topology == 2 &&
9455           (! IsConWithGaps (bsp)) &&
9456           !vsp->is_embl_ddbj_in_sep) {
9457         olditemid = gcp->itemID;
9458         olditemtype = gcp->thistype;
9459         if (vnp->extended != 0) {
9460           ovp = (ObjValNodePtr) vnp;
9461           gcp->itemID = ovp->idx.itemID;
9462           gcp->thistype = OBJ_SEQDESC;
9463         }
9464         ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_CompleteCircleProblem, "Circular topology without complete flag set");
9465         gcp->itemID = olditemid;
9466         gcp->thistype = olditemtype;
9467       }
9468     }
9469   }
9470 
9471   /* check for N bases at start or stop of sequence */
9472   ValidateBioseqEnds(bsp, vsp, isPatent);
9473 }
9474 
9475 /*****************************************************************************
9476 *
9477 *   ValidatePubdesc(gcp)
9478 *      Check pubdesc for missing information
9479 *
9480 *****************************************************************************/
HasNoText(CharPtr str)9481 static Boolean HasNoText (CharPtr str)
9482 {
9483   Char            ch;
9484 
9485   if (str != NULL) {
9486     ch = *str;
9487     while (ch != '\0') {
9488       if (ch > ' ') {
9489         return FALSE;
9490       }
9491       str++;
9492       ch = *str;
9493     }
9494   }
9495   return TRUE;
9496 }
9497 
HasNoName(ValNodePtr name)9498 static Boolean HasNoName (ValNodePtr name)
9499 {
9500   AuthorPtr       ap;
9501   NameStdPtr      nsp;
9502   PersonIdPtr     pid;
9503 
9504   if (name != NULL) {
9505     ap = name->data.ptrvalue;
9506     if (ap != NULL) {
9507       pid = ap->name;
9508       if (pid != NULL) {
9509         if (pid->choice == 2) {
9510           nsp = pid->data;
9511           if (nsp != NULL) {
9512             if (!HasNoText (nsp->names[0])) {
9513               return FALSE;
9514             }
9515           }
9516         } else if (pid->choice == 3 || pid->choice == 4) {
9517            if (!HasNoText ((CharPtr) pid->data)) {
9518             return FALSE;
9519           }
9520        } else if (pid->choice == 5) {
9521           /* consortium */
9522           if (!HasNoText ((CharPtr) pid->data)) {
9523             return FALSE;
9524           }
9525         }
9526       }
9527     }
9528   }
9529   return TRUE;
9530 }
9531 
/* Reports missing country (and, for country "USA", missing state) on an
   affiliation whose choice is 2.  A NULL affiliation, or one in which every
   checked field pointer is NULL, is silently accepted. */
static void ValidateAffil (ValidStructPtr vsp, AffilPtr ap)

{
  Boolean  hasAnyField;

  if (ap == NULL) return;

  hasAnyField = (Boolean) (ap->affil != NULL || ap->div != NULL || ap->street != NULL ||
                           ap->city != NULL || ap->sub != NULL || ap->postal_code != NULL ||
                           ap->country != NULL || ap->phone != NULL || ap->fax != NULL ||
                           ap->email != NULL);
  if (! hasAnyField) return;   /* no affiliation */

  if (ap->choice != 2) return;

  /* a missing city is not reported - that check is disabled in this file */

  if (StringHasNoText (ap->country)) {
    ValidErr (vsp, SEV_WARNING, ERR_GENERIC_MissingPubInfo, "Submission citation affiliation has no country");
  }
  if (StringCmp (ap->country, "USA") == 0 && StringHasNoText (ap->sub)) {
    ValidErr (vsp, SEV_WARNING, ERR_GENERIC_MissingPubInfo, "Submission citation affiliation has no state");
  }
}
9559 
/* Result codes produced by DateIsBad.  The non-zero values are distinct bits:
   DateIsBad may OR several of them together, and PrintBadDateError tests each
   one with a bitwise AND. */
#define DATE_OKAY        0   /* no problem detected */
#define EMPTY_DATE       1   /* the date pointer itself is NULL */
#define BAD_DATE_STR     2   /* data [0] == 0 and str is NULL or "?" */
#define BAD_DATE_YEAR    4   /* data [0] == 1 and data [1] is 0 */
#define BAD_DATE_MONTH   8   /* data [2] > 12, or 0 when a full date is required */
#define BAD_DATE_DAY    16   /* data [3] > 31, or 0 when a full date is required */
#define BAD_DATE_SEASON 32   /* data [0] == 1 and str has a character other than a letter or '-' */
#define BAD_DATE_OTHER  64   /* data [0] is neither 0 nor 1 */
9568 
/* Checks a Date for problems and returns TRUE if any were found.
 *
 *   dp            date to examine; NULL yields EMPTY_DATE
 *   needFullDate  when set, a zero month or zero day is also flagged
 *   baddatep      optional; receives DATE_OKAY or the combined BAD_DATE_xxx /
 *                 EMPTY_DATE flags describing what was wrong
 */
static Boolean DateIsBad (DatePtr dp, Boolean needFullDate, Int2Ptr baddatep)

{
  Char     letter;
  CharPtr  cursor;
  Int2     flags = DATE_OKAY;

  if (dp == NULL) {
    flags = EMPTY_DATE;
  } else {
    switch (dp->data [0]) {
      case 0 :
        /* only the string is inspected: it must exist and must not be "?" */
        if (dp->str == NULL || StringCmp (dp->str, "?") == 0) {
          flags = BAD_DATE_STR;
        }
        break;
      case 1 :
        /* individual fields are inspected; several flags may accumulate */
        if (dp->data [1] == 0) {
          flags |= BAD_DATE_YEAR;
        }
        if (dp->data [2] > 12 || (needFullDate && dp->data [2] == 0)) {
          flags |= BAD_DATE_MONTH;
        }
        if (dp->data [3] > 31 || (needFullDate && dp->data [3] == 0)) {
          flags |= BAD_DATE_DAY;
        }
        /* an accompanying string may hold only letters and hyphens */
        if (StringDoesHaveText (dp->str)) {
          for (cursor = dp->str; *cursor != '\0'; cursor++) {
            letter = *cursor;
            if (! (IS_ALPHA (letter) || letter == '-')) {
              flags |= BAD_DATE_SEASON;
            }
          }
        }
        break;
      default :
        flags = BAD_DATE_OTHER;
        break;
    }
  }

  if (baddatep != NULL) {
    *baddatep = flags;
  }

  return (Boolean) (flags > 0);
}
9624 
/*
 * PrintBadDateError
 *
 *   Posts one validator message describing the problems encoded in a
 *   DateIsBad result.
 *
 *   vsp          - validation context handed through to ValidErr
 *   baddate      - OR-ed DateIsBad flags
 *   severity     - severity passed to ValidErr
 *   code1, code2 - error code pair passed to ValidErr
 *   mssg         - leading text of the message
 *
 *   The message has the form "<mssg> - <labels>", where <labels> is the
 *   space-separated list of names for every flag set in baddate.
 */
static void PrintBadDateError (ValidStructPtr vsp, Int2 baddate, int severity, int code1, int code2, CharPtr mssg)

{
  /* parallel tables, in the order the labels appear in the message */
  static Int2     flagBits [] = {
    EMPTY_DATE, BAD_DATE_STR, BAD_DATE_YEAR, BAD_DATE_MONTH,
    BAD_DATE_DAY, BAD_DATE_SEASON, BAD_DATE_OTHER, 0
  };
  static CharPtr  flagNames [] = {
    "EMPTY_DATE ", "BAD_STR ", "BAD_YEAR ", "BAD_MONTH ",
    "BAD_DAY ", "BAD_SEASON ", "BAD_OTHER ", NULL
  };
  Char            buf [256];
  Int2            idx;

  buf [0] = '\0';

  for (idx = 0; flagNames [idx] != NULL; idx++) {
    if ((baddate & flagBits [idx]) != 0) {
      StringCat (buf, flagNames [idx]);
    }
  }

  /* drop the trailing blank left by the last label */
  TrimSpacesAroundString (buf);

  ValidErr (vsp, severity, code1, code2, "%s - %s", mssg, buf);
}
9658 
/*
 * ValidateCitSub
 *
 *   Checks a submission citation for authors, affiliation and date.
 *
 *   vsp - validation context; nothing is done when NULL
 *   csp - submission citation; nothing is done when NULL
 *
 *   Most problems are reported at SEV_REJECT, lowered to SEV_WARNING when
 *   any of the is_refseq_in_sep, is_insd_in_sep, is_htg_in_sep or
 *   is_pdb_in_sep flags is set on vsp.  A missing state for a "USA"
 *   affiliation is always a warning; date problems are always SEV_ERROR.
 *   The "no affiliation" message is suppressed when is_patent_in_sep is set.
 */
static void ValidateCitSub (ValidStructPtr vsp, CitSubPtr csp)
{
  AffilPtr     affil;
  AuthListPtr  authors;
  DatePtr      date;
  Int2         dateFlags;
  Boolean      foundAffil = FALSE;
  Boolean      foundName = FALSE;
  ErrSev       sev = SEV_REJECT;
  ValNodePtr   vnp;

  if (vsp == NULL || csp == NULL) return;

  if (vsp->is_refseq_in_sep || vsp->is_insd_in_sep ||
      vsp->is_htg_in_sep || vsp->is_pdb_in_sep) {
    sev = SEV_WARNING;
  }

  authors = csp->authors;
  if (authors == NULL) {
    //LCOV_EXCL_START
    //not valid ASN.1
    ValidErr (vsp, sev, ERR_GENERIC_MissingPubInfo, "Submission citation affiliation has no authors");
    //LCOV_EXCL_STOP
  } else {
    /* look for at least one author entry that carries a name */
    switch (authors->choice) {
      case 1 :
        for (vnp = authors->names; vnp != NULL; vnp = vnp->next) {
          if (! HasNoName (vnp)) {
            foundName = TRUE;
          }
        }
        break;
      case 2 :
      case 3 :
        for (vnp = authors->names; vnp != NULL; vnp = vnp->next) {
          if (! HasNoText ((CharPtr) vnp->data.ptrvalue)) {
            foundName = TRUE;
          }
        }
        break;
      default :
        break;
    }

    affil = authors->affil;
    if (affil != NULL) {
      if (affil->affil == NULL && affil->div == NULL && affil->street == NULL &&
          affil->city == NULL && affil->sub == NULL && affil->postal_code == NULL &&
          affil->country == NULL && affil->phone == NULL && affil->fax == NULL &&
          affil->email == NULL) {
        /* affiliation record present but completely empty */
        if (sev == SEV_REJECT) {
          ValidErr (vsp, sev, ERR_GENERIC_MissingPubInfo, "Submission has no affiliation");
        }
      } else {
        foundAffil = TRUE;
        if (affil->choice == 2) {
          if (StringHasNoText (affil->country)) {
            ValidErr (vsp, sev, ERR_GENERIC_MissingPubInfo, "Submission citation affiliation has no country");
          }
          if (StringHasNoText (affil->div) && StringHasNoText (affil->affil)) {
            ValidErr (vsp, sev, ERR_GENERIC_MissingPubInfo, "Submission citation affiliation has no institution");
          }
          if (StringCmp (affil->country, "USA") == 0 && StringHasNoText (affil->sub)) {
            ValidErr (vsp, SEV_WARNING, ERR_GENERIC_MissingPubInfo, "Submission citation affiliation has no state");
          }
        }
      }
    }
  }

  if (! foundName) {
    //LCOV_EXCL_START
    //BasicCleanup inserts a "?" if there are no authors
    ValidErr (vsp, sev, ERR_GENERIC_MissingPubInfo, "Submission citation has no author names");
    //LCOV_EXCL_STOP
  }

  if (! foundAffil && ! vsp->is_patent_in_sep) {
    ValidErr (vsp, sev, ERR_GENERIC_MissingPubInfo, "Submission citation has no affiliation");
  }

  date = csp->date;
  if (date == NULL) {
    ValidErr (vsp, SEV_ERROR, ERR_GENERIC_MissingPubInfo, "Submission citation has no date");
  } else if (DateIsBad (date, FALSE, &dateFlags)) {
    PrintBadDateError (vsp, dateFlags, SEV_ERROR, ERR_GENERIC_BadDate, "Submission citation date has error");
  }
}
9758 
/*
 * LookForMultiplePubs
 *
 *   Warns when a later publication descriptor repeats the muid or pmid of
 *   the given one.
 *
 *   vsp - validation context handed through to ValidErr
 *   gcp - gather context; only tested for NULL here
 *   sdp - publication descriptor to start from
 *
 *   Nothing is done unless sdp is a Seq_descr_pub descriptor with a nonzero
 *   extended field and both vsp and gcp are non-NULL.  The descriptor must
 *   hold a positive muid or pmid together with at least one pub of another
 *   kind.  Following descriptors are obtained with
 *   GetNextDescriptorUnindexed using a zero-filled Bioseq; at most one
 *   warning is posted however many matches are found.
 */
static void LookForMultiplePubs (ValidStructPtr vsp, GatherContextPtr gcp, SeqDescrPtr sdp)

{
  Bioseq       dummy;
  Boolean      hasOtherPub = FALSE;
  Boolean      sameId = FALSE;
  SeqDescrPtr  later;
  Int4         muid = 0;
  PubdescPtr   pdp;
  Int4         pmid = 0;
  ValNodePtr   pub;

  if (sdp == NULL || sdp->choice != Seq_descr_pub || sdp->extended == 0) return;
  if (vsp == NULL || gcp == NULL) return;

  MemSet ((Pointer) &dummy, 0, sizeof (Bioseq));

  pdp = (PubdescPtr) sdp->data.ptrvalue;
  if (pdp == NULL) return;

  /* collect the identifiers carried by the starting descriptor */
  for (pub = pdp->pub; pub != NULL; pub = pub->next) {
    switch (pub->choice) {
      case PUB_Muid :
        muid = pub->data.intvalue;
        break;
      case PUB_PMid :
        pmid = pub->data.intvalue;
        break;
      default :
        hasOtherPub = TRUE;
        break;
    }
  }

  if (! hasOtherPub) return;
  if (muid <= 0 && pmid <= 0) return;

  /* compare against every later publication descriptor */
  for (later = GetNextDescriptorUnindexed (&dummy, Seq_descr_pub, sdp);
       later != NULL;
       later = GetNextDescriptorUnindexed (&dummy, Seq_descr_pub, later)) {
    pdp = (PubdescPtr) later->data.ptrvalue;
    if (pdp == NULL) continue;
    for (pub = pdp->pub; pub != NULL; pub = pub->next) {
      if (pub->choice == PUB_Muid) {
        if (muid > 0 && muid == pub->data.intvalue) {
          sameId = TRUE;
        }
      } else if (pub->choice == PUB_PMid) {
        if (pmid > 0 && pmid == pub->data.intvalue) {
          sameId = TRUE;
        }
      }
    }
  }

  if (sameId) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_CollidingPublications, "Multiple publications with same identifier");
  }
}
9815 
/*
 * LookForMultipleUnpubPubs
 *
 *   Warns about publication descriptors on a Bioseq whose idx.scratch
 *   strings compare equal ignoring case.
 *
 *   vsp - validation context handed through to ValidErr
 *   gcp - not used by this function
 *   bsp - Bioseq whose Seq_descr_pub descriptors are visited
 *
 *   The scratch strings are copied into a list, sorted with
 *   SortVnpByString, and each entry equal to the one before it produces one
 *   warning.  The text shown in the warning is cut at 100 characters and
 *   followed by "...".
 *
 *   NOTE(review): idx.scratch is presumably a per-publication key string
 *   filled in elsewhere - confirm against the code that sets it.
 */
static void LookForMultipleUnpubPubs (ValidStructPtr vsp, GatherContextPtr gcp, BioseqPtr bsp)

{
  Char               buf [2048];
  CharPtr            curr, prev;
  SeqMgrDescContext  dcontext;
  ValNodePtr         keys = NULL;
  ObjValNodePtr      ovp;
  SeqDescrPtr        sdp;
  ValNodePtr         vnp;

  for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_pub, &dcontext);
       sdp != NULL;
       sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_pub, &dcontext)) {
    if (sdp->data.ptrvalue == NULL) continue;
    ovp = (ObjValNodePtr) sdp;
    if (ovp->idx.scratch != NULL) {
      ValNodeCopyStr (&keys, 0, ovp->idx.scratch);
    }
  }

  if (keys == NULL) return;

  /* after sorting, equal strings sit next to each other */
  keys = ValNodeSort (keys, SortVnpByString);
  prev = (CharPtr) keys->data.ptrvalue;
  for (vnp = keys->next; vnp != NULL; vnp = vnp->next) {
    curr = (CharPtr) vnp->data.ptrvalue;
    if (StringICmp (prev, curr) != 0) {
      prev = curr;
      continue;
    }
    StringNCpy_0 (buf, curr, sizeof (buf));
    /* written at a fixed offset; only visible when the text reaches 100 characters */
    StringCpy (buf + 100, "...");
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_CollidingPublications,
              "Multiple equivalent publications annotated on this sequence [%s]", buf);
  }

  ValNodeFreeData (keys);
}
9860 
/*
 * BadCharsInAuth
 *
 *   Scans one component of an author name for characters that are not
 *   allowed.
 *
 *   str         - text to scan; a string with no text is accepted
 *   badauthor   - set to str when a bad character is found
 *   allowcomma  - TRUE to accept ','
 *   allowperiod - TRUE to accept '.' anywhere
 *   last        - TRUE when str is a last name; the period of a leading
 *                 "St." or "de M." (matched ignoring case) is then accepted
 *                 even if allowperiod is FALSE
 *
 *   Letters, '-', '\'' and blank are always accepted.  When any other
 *   character is met and the remainder of the string from that point is
 *   exactly "2nd", "3rd", "4th", "5th" or "6th", the whole string is
 *   accepted.
 *
 *   Returns TRUE if a bad character was found, FALSE otherwise.
 */
static Boolean BadCharsInAuth (CharPtr str, CharPtr PNTR badauthor, Boolean allowcomma, Boolean allowperiod, Boolean last)
{
  Char     ch;
  CharPtr  cp;
  CharPtr  okPeriod = NULL;

  if (StringHasNoText (str)) return FALSE;

  if (last) {
    okPeriod = StringISearch (str, "St.");
    if (okPeriod == str) {
      okPeriod += 2;    /* the period of "St." */
    } else {
      okPeriod = StringISearch (str, "de M.");
      if (okPeriod == str) {
        okPeriod += 4;  /* the period of "de M." */
      }
    }
  }

  for (cp = str; *cp != '\0'; cp++) {
    ch = *cp;

    /* each of these tests accepts the current character */
    if (IS_ALPHA (ch)) continue;
    if (ch == '-' || ch == '\'' || ch == ' ') continue;
    if (ch == ',' && allowcomma) continue;
    if (ch == '.' && (allowperiod || okPeriod == cp)) continue;

    /* an ordinal suffix ending the string makes the whole name acceptable */
    if (StringCmp (cp, "2nd") == 0 ||
        StringCmp (cp, "3rd") == 0 ||
        StringCmp (cp, "4th") == 0 ||
        StringCmp (cp, "5th") == 0 ||
        StringCmp (cp, "6th") == 0) {
      return FALSE;
    }

    /* bad character found */
    *badauthor = str;
    return TRUE;
  }

  return FALSE;
}
9901 
BadCharsInName(ValNodePtr name,CharPtr PNTR badauthor,BoolPtr last_name_badP)9902 static Boolean BadCharsInName (ValNodePtr name, CharPtr PNTR badauthor, BoolPtr last_name_badP)
9903 
9904 {
9905   AuthorPtr    ap;
9906   NameStdPtr   nsp;
9907   PersonIdPtr  pid;
9908 
9909   if (name == NULL) return FALSE;
9910   ap = name->data.ptrvalue;
9911   if (ap == NULL) return FALSE;
9912   pid = ap->name;
9913   if (pid == NULL) return FALSE;
9914 
9915   if (pid->choice == 2) {
9916     nsp = pid->data;
9917     if (nsp == NULL) return FALSE;
9918     if (StringICmp (nsp->names [0], "et al.") == 0) return FALSE;
9919     if (BadCharsInAuth (nsp->names [0], badauthor, FALSE, FALSE, TRUE)) {
9920       if (last_name_badP != NULL) {
9921         *last_name_badP = TRUE;
9922       }
9923       return TRUE; /* last    */
9924     }
9925     if (BadCharsInAuth (nsp->names [1], badauthor, FALSE, FALSE, FALSE)) return TRUE; /* first    */
9926     if (BadCharsInAuth (nsp->names [4], badauthor, FALSE, TRUE, FALSE)) return TRUE;  /* initials */
9927     if (BadCharsInAuth (nsp->names [5], badauthor, FALSE, TRUE, FALSE)) return TRUE;  /* suffix */
9928   }
9929 
9930   return FALSE;
9931 }
9932 
9933 static CharPtr suffixList [] = {
9934   "Jr.", "Sr.", "II", "III", "IV", "V", "VI", "2nd", "3rd", "4th", "5th", "6th", NULL
9935 };
9936 
/*
 * ValidateSuffix
 *
 *   Warns when the suffix of a structured author name is not one of the
 *   entries in suffixList.
 *
 *   vsp  - validation context handed through to ValidErr
 *   gcp  - not used by this function
 *   pdp  - not used by this function
 *   name - list node whose data.ptrvalue is an Author
 *
 *   Only person ids with choice == 2 are examined, and a suffix
 *   (names [5]) with no text is accepted.  The comparison ignores case.
 */
static void ValidateSuffix (ValidStructPtr vsp, GatherContextPtr gcp, PubdescPtr pdp, ValNodePtr name)

{
  AuthorPtr    author;
  Int2         idx = 0;
  PersonIdPtr  person;
  NameStdPtr   std;
  CharPtr      suffix;

  if (name == NULL) return;
  author = name->data.ptrvalue;
  if (author == NULL) return;
  person = author->name;
  if (person == NULL) return;

  if (person->choice != 2) return;

  std = person->data;
  if (std == NULL) return;
  suffix = std->names [5];
  if (StringHasNoText (suffix)) return;

  while (suffixList [idx] != NULL) {
    if (StringICmp (suffix, suffixList [idx]) == 0) return;
    idx++;
  }

  ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadAuthorSuffix, "Bad author suffix %s", suffix);
}
9963 
StringAlreadyInList(ValNodePtr head,CharPtr str)9964 static Boolean StringAlreadyInList (
9965   ValNodePtr head,
9966   CharPtr str
9967 )
9968 
9969 {
9970   ValNodePtr  vnp;
9971 
9972   for (vnp = head; vnp != NULL; vnp = vnp->next) {
9973     if (StringICmp ((CharPtr) vnp->data.ptrvalue, str) == 0) return TRUE;
9974   }
9975 
9976   return FALSE;
9977 }
9978 
ValidatePubdesc(ValidStructPtr vsp,GatherContextPtr gcp,PubdescPtr pdp)9979 static void ValidatePubdesc (ValidStructPtr vsp, GatherContextPtr gcp, PubdescPtr pdp)
9980 {
9981   AuthListPtr     alp;
9982   AuthorPtr       ap;
9983   CharPtr         badauthor;
9984   Int2            baddate;
9985   CitArtPtr       cap = NULL;
9986   CitGenPtr       cgp;
9987   CitJourPtr      cjp = NULL;
9988   ValNodePtr      conslist = NULL;
9989   CitSubPtr       csp;
9990   DatePtr         dp;
9991   Boolean         hasName, hasTitle, hasIsoJTA = FALSE,
9992                   inPress = FALSE, electronic_journal = FALSE,
9993                   conflicting_pmids = FALSE, redundant_pmids = FALSE,
9994                   conflicting_muids = FALSE, redundant_muids = FALSE,
9995                   unpub = FALSE;
9996   ImprintPtr      imp;
9997   Boolean         last_name_bad;
9998   Int4            muid = 0;
9999   Boolean         noVol = FALSE, noPages = FALSE;
10000   ValNodePtr      name;
10001   PersonIdPtr     pid;
10002   Int4            pmid = 0;
10003   CharPtr         ptr;
10004   ErrSev          sev;
10005   Int4            start;
10006   Int4            stop;
10007   CharPtr         str;
10008   Char            temp [64];
10009   Int4            thepmid = 0;
10010   ValNodePtr      title;
10011   Int4            uid = 0;
10012   long int        val;
10013   ValNodePtr      vnp;
10014 
10015   if (vsp == NULL || pdp == NULL) return;
10016 
10017   for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
10018     if (vnp->choice == PUB_PMid) {
10019       thepmid = vnp->data.intvalue;
10020     }
10021   }
10022   for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
10023     switch (vnp->choice) {
10024     case PUB_Gen:
10025       cgp = (CitGenPtr) vnp->data.ptrvalue;
10026       hasName = FALSE;
10027       if (cgp != NULL) {
10028         if (StringDoesHaveText (cgp->cit)) {
10029           /* skip if just BackBone id number */
10030           if (StringNICmp (cgp->cit, "BackBone id_pub = ", 18) == 0 && cgp->journal == NULL && cgp->date == NULL && cgp->serial_number < 0) break;
10031           if (StringNICmp (cgp->cit, "submitted", 8) == 0 ||
10032               StringNICmp (cgp->cit, "unpublished", 11) == 0 ||
10033               StringNICmp (cgp->cit, "Online Publication", 18) == 0 ||
10034               StringNICmp (cgp->cit, "Published Only in DataBase", 26) == 0) {
10035             unpub = TRUE;
10036           } else if (StringNICmp (cgp->cit, "(er) ", 5) == 0) {
10037             unpub = TRUE;
10038           } else {
10039             ValidErr (vsp, SEV_ERROR, ERR_GENERIC_MissingPubInfo, "Unpublished citation text invalid");
10040           }
10041           if (StringStr (cgp->cit, "Title=") != NULL) {
10042             ValidErr (vsp, SEV_ERROR, ERR_GENERIC_StructuredCitGenCit, "Unpublished citation has embedded Title");
10043           }
10044           if (StringStr (cgp->cit, "Journal=") != NULL) {
10045             ValidErr (vsp, SEV_ERROR, ERR_GENERIC_StructuredCitGenCit, "Unpublished citation has embedded Journal");
10046           }
10047         }
10048         /* skip if just serial number */
10049         if (cgp->cit == NULL && cgp->journal == NULL && cgp->date == NULL && cgp->serial_number > -1) break;
10050         dp = cgp->date;
10051         if (dp == NULL) {
10052           if (! unpub) {
10053             ValidErr (vsp, SEV_WARNING, ERR_GENERIC_MissingPubInfo, "Publication date missing");
10054           }
10055         } else if (dp->str != NULL) {
10056           if (StringCmp (dp->str, "?") == 0) {
10057             ValidErr (vsp, SEV_WARNING, ERR_GENERIC_MissingPubInfo, "Publication date marked as '?'");
10058           }
10059         } else if (dp->data [1] == 0) {
10060           ValidErr (vsp, SEV_WARNING, ERR_GENERIC_MissingPubInfo, "Publication date not set");
10061         } else if (DateIsBad (dp, FALSE, &baddate)) {
10062           PrintBadDateError (vsp, baddate, SEV_ERROR, ERR_GENERIC_BadDate, "Publication date has error");
10063         }
10064         alp = cgp->authors;
10065         if (alp != NULL) {
10066           if (alp->choice == 1) {
10067             for (name = alp->names; name != NULL; name = name->next) {
10068               if (!HasNoName (name)) {
10069                 hasName = TRUE;
10070               }
10071             }
10072           } else if (alp->choice == 2 || alp->choice == 3) {
10073             for (name = alp->names; name != NULL; name = name->next) {
10074               if (!HasNoText ((CharPtr) name->data.ptrvalue)) {
10075                 hasName = TRUE;
10076               }
10077             }
10078           }
10079         }
10080         if (!hasName) {
10081           sev = SEV_ERROR;
10082           if (vsp->is_refseq_in_sep) {
10083             sev = SEV_WARNING;
10084           }
10085           ValidErr (vsp, sev, ERR_GENERIC_MissingPubInfo, "Publication has no author names");
10086         }
10087       }
10088       break;
10089     case PUB_PMid:
10090       if (pmid == 0) {
10091         pmid = vnp->data.intvalue;
10092       } else if (pmid != vnp->data.intvalue) {
10093         conflicting_pmids = TRUE;
10094       } else {
10095         redundant_pmids = TRUE;
10096       }
10097       if (uid == 0) {
10098         uid = vnp->data.intvalue;
10099       }
10100       break;
10101     case PUB_Muid:
10102       if (muid == 0) {
10103         muid = vnp->data.intvalue;
10104       } else if (muid != vnp->data.intvalue) {
10105         conflicting_muids = TRUE;
10106       } else {
10107         redundant_muids = TRUE;
10108       }
10109       if (uid == 0) {
10110         uid = vnp->data.intvalue;
10111       }
10112       break;
10113     case PUB_Sub :
10114       csp = (CitSubPtr) vnp->data.ptrvalue;
10115       if (csp != NULL) {
10116         ValidateCitSub (vsp, csp);
10117       }
10118       break;
10119     case PUB_Medline:
10120       ValidErr (vsp, SEV_ERROR, ERR_GENERIC_MedlineEntryPub, "Publication is medline entry");
10121       break;
10122     case PUB_Article:
10123       cap = (CitArtPtr) vnp->data.ptrvalue;
10124       hasName = FALSE;
10125       hasTitle = FALSE;
10126       if (cap != NULL) {
10127         for (title = cap->title; title != NULL; title = title->next) {
10128           if (!HasNoText ((CharPtr) title->data.ptrvalue)) {
10129             hasTitle = TRUE;
10130           }
10131         }
10132         if (!hasTitle) {
10133           ValidErr (vsp, SEV_ERROR, ERR_GENERIC_MissingPubInfo, "Publication has no title");
10134         }
10135         alp = cap->authors;
10136         if (alp != NULL) {
10137           if (alp->choice == 1) {
10138             for (name = alp->names; name != NULL; name = name->next) {
10139               if (!HasNoName (name)) {
10140                 hasName = TRUE;
10141               }
10142             }
10143           } else if (alp->choice == 2 || alp->choice == 3) {
10144             for (name = alp->names; name != NULL; name = name->next) {
10145               if (!HasNoText ((CharPtr) name->data.ptrvalue)) {
10146                 hasName = TRUE;
10147               }
10148             }
10149           }
10150         }
10151         if (!hasName) {
10152           sev = SEV_ERROR;
10153           if (vsp->is_refseq_in_sep) {
10154             sev = SEV_WARNING;
10155           }
10156           ValidErr (vsp, sev, ERR_GENERIC_MissingPubInfo, "Publication has no author names");
10157         }
10158       }
10159 
10160       if (cap != NULL) {
10161         switch (cap->from) {
10162         case 1:
10163           cjp = (CitJourPtr) cap->fromptr;
10164           if (cjp != NULL) {
10165             hasTitle = FALSE;
10166             for (title = cjp->title; title != NULL; title = title->next) {
10167               if (title->choice == Cit_title_iso_jta) {
10168                 hasIsoJTA = TRUE;
10169               }
10170               if (!HasNoText ((CharPtr) title->data.ptrvalue)) {
10171                 hasTitle = TRUE;
10172                 if (title->choice == Cit_title_name) {
10173                   if (StringNCmp ((CharPtr) title->data.ptrvalue, "(er)", 4) == 0) {
10174                     electronic_journal = TRUE;
10175                   }
10176                 }
10177               }
10178             }
10179             if (!hasTitle) {
10180               ValidErr (vsp, SEV_ERROR, ERR_GENERIC_MissingPubInfo, "Journal title missing");
10181             }
10182             imp = cjp->imp;
10183             if (imp != NULL) {
10184               if (imp->pubstatus == PUBSTATUS_epublish || imp->pubstatus == PUBSTATUS_aheadofprint) {
10185                 electronic_journal = TRUE;
10186               }
10187               if (imp->prepub == 2) {
10188                 inPress = TRUE;
10189                 if (StringDoesHaveText (imp->pages)) {
10190                   ValidErr (vsp, SEV_WARNING, ERR_GENERIC_PublicationInconsistency, "In-press is not expected to have page numbers");
10191                 }
10192                 dp = imp->date;
10193                 if (dp == NULL || StringCmp (dp->str, "?") == 0) {
10194                   ValidErr (vsp, SEV_WARNING, ERR_GENERIC_MissingPubInfo, "In-press is missing the date");
10195                 }
10196               }
10197               if (imp->prepub == 0 && imp->pubstatus != PUBSTATUS_aheadofprint) {
10198                 noVol = StringHasNoText (imp->volume);
10199                 noPages = StringHasNoText (imp->pages);
10200                 if (noVol) {
10201                   if (electronic_journal) {
10202                     ValidErr (vsp, SEV_INFO, ERR_GENERIC_MissingVolumeEpub, "Electronic journal volume missing");
10203                   } else {
10204                     ValidErr (vsp, SEV_WARNING, ERR_GENERIC_MissingVolume, "Journal volume missing");
10205                   }
10206                 }
10207                 if (noPages) {
10208                   if (electronic_journal) {
10209                     ValidErr (vsp, SEV_INFO, ERR_GENERIC_MissingPagesEpub, "Electronic journal pages missing");
10210                   } else {
10211                     ValidErr (vsp, SEV_WARNING, ERR_GENERIC_MissingPages, "Journal pages missing");
10212                   }
10213                 }
10214                 if ((! noPages) && (! electronic_journal)) {
10215                   sev = SEV_WARNING;
10216                   StringNCpy_0 (temp, imp->pages, sizeof (temp));
10217                   ptr = StringChr (temp, '-');
10218                   if (ptr != NULL) {
10219                     *ptr = '\0';
10220                     ptr++;
10221                     if (sscanf (temp, "%ld", &val) == 1) {
10222                       start = (Int4) val;
10223                       if (sscanf (ptr, "%ld", &val) == 1) {
10224                         stop = (Int4) val;
10225                         if (start == 0 || stop == 0) {
10226                           ValidErr (vsp, sev, ERR_GENERIC_BadPageNumbering, "Page numbering has zero value");
10227                         } else if (start < 0 || stop < 0) {
10228                           ValidErr (vsp, sev, ERR_GENERIC_BadPageNumbering, "Page numbering has negative value");
10229                         } else if (start > stop) {
10230                           ValidErr (vsp, sev, ERR_GENERIC_BadPageNumbering, "Page numbering out of order");
10231                         } else if (stop > start + 50) {
10232                           ValidErr (vsp, sev, ERR_GENERIC_BadPageNumbering, "Page numbering greater than 50");
10233                         }
10234                       } else {
10235                         ValidErr (vsp, sev, ERR_GENERIC_BadPageNumbering, "Page numbering stop looks strange");
10236                       }
10237                     } else if (! IS_ALPHA (temp [0])) {
10238                       ValidErr (vsp, sev, ERR_GENERIC_BadPageNumbering, "Page numbering start looks strange");
10239                     }
10240                   }
10241                 }
10242                 dp = imp->date;
10243                 if (dp == NULL) {
10244                   ValidErr (vsp, SEV_WARNING, ERR_GENERIC_MissingPubInfo, "Publication date missing");
10245                 } else if (dp->str != NULL) {
10246                   if (StringCmp (dp->str, "?") == 0) {
10247                     ValidErr (vsp, SEV_WARNING, ERR_GENERIC_MissingPubInfo, "Publication date marked as '?'");
10248                   }
10249                 } else if (dp->data [1] == 0) {
10250                   ValidErr (vsp, SEV_WARNING, ERR_GENERIC_MissingPubInfo, "Publication date not set");
10251                 } else if (DateIsBad (dp, FALSE, &baddate)) {
10252                   PrintBadDateError (vsp, baddate, SEV_ERROR, ERR_GENERIC_BadDate, "Publication date has error");
10253                 }
10254               }
10255               if (imp->pubstatus == PUBSTATUS_aheadofprint && imp->prepub != 2) {
10256                 if (noVol || noPages) {
10257                 } else if (thepmid == 0) {
10258                   ValidErr (vsp, SEV_WARNING, ERR_GENERIC_PublicationInconsistency, "Ahead-of-print without in-press");
10259                 }
10260               }
10261               if (imp->pubstatus == PUBSTATUS_epublish && imp->prepub == 2) {
10262                 //LCOV_EXCL_START
10263                 //BasicCleanup removes prepub = 2 if pubstatus is epublish
10264                 ValidErr (vsp, SEV_WARNING, ERR_GENERIC_PublicationInconsistency, "Electronic-only publication should not also be in-press");
10265                 //LCOV_EXCL_STOP
10266               }
10267               if (imp->pubstatus == PUBSTATUS_epublish || imp->pubstatus == PUBSTATUS_ppublish || imp->pubstatus == PUBSTATUS_aheadofprint) {
10268                 if (StringDoesHaveText (pdp->comment)) {
10269                   if (StringStr (pdp->comment, "Publication Status") != NULL ||
10270                       StringStr (pdp->comment, "Publication-Status") != NULL ||
10271                       StringStr (pdp->comment, "Publication_Status") != NULL) {
10272                     ValidErr (vsp, SEV_WARNING, ERR_GENERIC_UnexpectedPubStatusComment, "Publication status is in comment for pmid %ld", (long) thepmid);
10273                   }
10274                 }
10275               }
10276             }
10277           }
10278           break;
10279         default:
10280           break;
10281         }
10282       }
10283       break;
10284     case PUB_Equiv:
10285       ValidErr (vsp, SEV_WARNING, ERR_GENERIC_UnnecessaryPubEquiv, "Publication has unexpected internal Pub-equiv");
10286       break;
10287     default:
10288       break;
10289     }
10290   }
10291 
10292   if (conflicting_pmids) {
10293     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_CollidingPublications, "Multiple conflicting pmids in a single publication");
10294   } else if (redundant_pmids) {
10295     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_CollidingPublications, "Multiple redundant pmids in a single publication");
10296   }
10297   if (conflicting_muids) {
10298     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_CollidingPublications, "Multiple conflicting muids in a single publication");
10299   } else if (redundant_muids) {
10300     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_CollidingPublications, "Multiple redundant muids in a single publication");
10301   }
10302 
10303   if (cap != NULL && cjp != NULL && (uid > 0 || inPress || vsp->alwaysRequireIsoJTA)) {
10304     if (! hasIsoJTA) {
10305       if (! electronic_journal) {
10306         ValidErr (vsp, SEV_WARNING, ERR_GENERIC_MissingISOJTA, "ISO journal title abbreviation missing");
10307       }
10308     }
10309   }
10310 
10311   alp = GetAuthListPtr (pdp, NULL);
10312   if (alp != NULL) {
10313     sev = SEV_ERROR;
10314     if (vsp->is_refseq_in_sep) {
10315       sev = SEV_WARNING;
10316     }
10317     if (alp->choice == 1) {
10318       for (name = alp->names; name != NULL; name = name->next) {
10319         badauthor = NULL;
10320         last_name_bad = FALSE;
10321         if (BadCharsInName (name, &badauthor, &last_name_bad)) {
10322           if (StringHasNoText (badauthor)) {
10323             badauthor = "?";
10324           }
10325           if (last_name_bad) {
10326             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadCharInAuthorLastName, "Bad characters in author %s", badauthor);
10327           } else {
10328             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadCharInAuthorName, "Bad characters in author %s", badauthor);
10329           }
10330         }
10331         ValidateSuffix (vsp, gcp, pdp, name);
10332         ap = (AuthorPtr) name->data.ptrvalue;
10333         if (ap == NULL) continue;
10334         pid = ap->name;
10335         if (pid == NULL) continue;
10336         if (pid->choice == 5) {
10337           str = (CharPtr) pid->data;
10338           if (StringHasNoText (str)) {
10339             //LCOV_EXCL_START
10340             //BasicCleanup converts empty consortium to std author with last name "?"
10341             ValidErr (vsp, SEV_WARNING, ERR_GENERIC_PublicationInconsistency, "Empty consortium");
10342             continue;
10343             //LCOV_EXCL_STOP
10344           }
10345           if (StringAlreadyInList (conslist, str)) {
10346             ValidErr (vsp, SEV_WARNING, ERR_GENERIC_PublicationInconsistency, "Duplicate consortium '%s'", str);
10347             continue;
10348           }
10349           ValNodeAddPointer (&conslist, 0, (Pointer) str);
10350         }
10351       }
10352     } else if (alp->choice == 2 || alp->choice == 3) {
10353       for (name = alp->names; name != NULL; name = name->next) {
10354         badauthor = NULL;
10355         if (BadCharsInAuth ((CharPtr) name->data.ptrvalue, &badauthor, TRUE, TRUE, FALSE)) {
10356           if (StringHasNoText (badauthor)) {
10357             badauthor = "?";
10358           }
10359           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadCharInAuthorName, "Bad characters in author %s", badauthor);
10360         }
10361       }
10362     }
10363   }
10364   ValNodeFree (conslist);
10365 }
10366 
/*
 * ValidateSfpCit
 *   Posts ERR_SEQ_FEAT_UnnecessaryCitPubEquiv (warning) when the citation
 *   set attached to a feature contains an entry whose choice is PUB_Equiv.
 *   Scanning stops at the first such entry, so at most one message is
 *   posted per feature.
 *
 *   vsp  validation state used for error reporting; NULL is a no-op
 *   gcp  not used by this function
 *   sfp  feature whose ->cit is examined; NULL or no citation is a no-op
 */
static void ValidateSfpCit (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPtr sfp)
{
  ValNodePtr      citset;
  ValNodePtr      pub;

  if (vsp == NULL || sfp == NULL) return;

  citset = sfp->cit;
  if (citset == NULL) return;

  pub = (ValNodePtr) citset->data.ptrvalue;
  while (pub != NULL) {
    if (pub->choice == PUB_Equiv) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnnecessaryCitPubEquiv, "Citation on feature has unexpected internal Pub-equiv");
      break;
    }
    pub = pub->next;
  }
}
10384 
10385 typedef struct bioseqvalid
10386 {
10387   ValidStructPtr  vsp;
10388   Boolean         is_aa;         /* bioseq is protein? */
10389   Boolean         is_mrna;       /* molinfo is mrna? */
10390   Boolean         is_prerna;     /* molinfo is precursor rna? */
10391   Boolean         is_artificial; /* biosource origin is artificial */
10392   Boolean         is_synthetic;  /* biosource origin synthetic */
10393   Boolean         is_syn_constr; /* is organism name synthetic construct, plasmid, vector, or SYN division? */
10394   Boolean         got_a_pub;
10395   int             last_na_mol, last_na_mod, last_organelle, last_partialness, last_left_right,
10396                   last_biomol, last_tech, last_completeness,
10397                   num_full_length_src_feat, num_full_length_prot_ref,
10398                   num_justprot, num_preprot, num_matpep, num_sigpep, num_transpep;
10399   ValNodePtr      last_gb, last_embl, last_prf, last_pir, last_sp, last_pdb,
10400                   last_create, last_update, last_biosrc, last_orgref;
10401   OrgRefPtr       last_org;
10402   GatherContextPtr gcp;
10403   BioseqPtr        bsp;
10404 }
10405 BioseqValidStr , PNTR BioseqValidStrPtr;
10406 
10407 // Used by DeltaOrFarSeg
/*
 * Set-visitor callback: when the visited set is of class nuc-prot, stores
 * TRUE through userdata, which must point at a Boolean owned by the caller.
 * The flag is never cleared here.
 */
static void CheckForNucProt (BioseqSetPtr bssp, Pointer userdata)
{
  if (bssp->_class != BioseqseqSet_class_nuc_prot) return;

  *((BoolPtr) userdata) = TRUE;
}
10417 
10418 //LCOV_EXCL_START
10419 // Only for SegSets
/*
 * Set-visitor callback: when the visited set is of class parts, stores
 * TRUE through userdata, which must point at a Boolean owned by the caller.
 * The flag is never cleared here.
 */
static void CheckForParts (BioseqSetPtr bssp, Pointer userdata)
{
  if (bssp->_class != BioseqseqSet_class_parts) return;

  *((BoolPtr) userdata) = TRUE;
}
10429 //LCOV_EXCL_STOP
10430 
DeltaOrFarSeg(SeqEntryPtr sep,SeqLocPtr location)10431 static Boolean DeltaOrFarSeg (SeqEntryPtr sep, SeqLocPtr location)
10432 {
10433   BioseqPtr       bsp;
10434   Boolean         hasParts = FALSE;
10435 
10436   bsp = BioseqFindFromSeqLoc (location);
10437   if (bsp != NULL) {
10438     if (bsp->repr == Seq_repr_delta) {
10439       VisitSetsInSep (sep, (Pointer) &hasParts, CheckForNucProt);
10440       if (!hasParts)
10441         return TRUE;
10442     }
10443     if (bsp->repr == Seq_repr_seg) {
10444 //LCOV_EXCL_START
10445 // Only for SegSets
10446       VisitSetsInSep (sep, (Pointer) &hasParts, CheckForParts);
10447       if (!hasParts)
10448         return TRUE;
10449 //LCOV_EXCL_STOP
10450     }
10451   }
10452   return FALSE;
10453 }
10454 
10455 
ConsistentWithA(Char ch)10456 static Boolean ConsistentWithA (Char ch)
10457 
10458 {
10459   return (Boolean) (StringChr ("ANRMWHVD", ch) != NULL);
10460 }
10461 
ConsistentWithC(Char ch)10462 static Boolean ConsistentWithC (Char ch)
10463 
10464 {
10465   return (Boolean) (StringChr ("CNYMSHBV", ch) != NULL);
10466 }
10467 
ConsistentWithG(Char ch)10468 static Boolean ConsistentWithG (Char ch)
10469 
10470 {
10471   return (Boolean) (StringChr ("GNRKSBVD", ch) != NULL);
10472 }
10473 
ConsistentWithT(Char ch)10474 static Boolean ConsistentWithT (Char ch)
10475 
10476 {
10477   return (Boolean) (StringChr ("TNYKWHBD", ch) != NULL);
10478 }
10479 
10480 static void
ValidateIntronEndsAtSpliceSiteOrGap(ValidStructPtr vsp,SeqLocPtr slp)10481 ValidateIntronEndsAtSpliceSiteOrGap
10482 (ValidStructPtr vsp,
10483  SeqLocPtr slp)
10484 {
10485   BioseqPtr          bsp;
10486   SeqIdPtr           sip;
10487   Uint1              strand;
10488   Int4               strt, stop, pos;
10489   Boolean            partial5, partial3;
10490   Char               buf[3];
10491   Char               ch0, ch1;
10492   Char               id_buf[150];
10493   SeqFeatPtr         rna;
10494   SeqMgrFeatContext  rcontext;
10495 
10496   if (vsp == NULL || slp == NULL) return;
10497   CheckSeqLocForPartial (slp, &partial5, &partial3);
10498   if (partial5 && partial3) return;
10499 
10500   /* suppress if contained by rRNA - different consensus splice site */
10501 
10502   rna = SeqMgrGetOverlappingFeature (slp, 0, vsp->rrna_array, vsp->numrrna,
10503                                      NULL, CONTAINED_WITHIN, &rcontext);
10504   if (rna != NULL) return;
10505 
10506   /* suppress if contained by tRNA - different consensus splice site */
10507 
10508   rna = SeqMgrGetOverlappingFeature (slp, 0, vsp->trna_array, vsp->numtrna,
10509                                      NULL, CONTAINED_WITHIN, &rcontext);
10510   if (rna != NULL) return;
10511 
10512 
10513   sip = SeqLocId (slp);
10514   if (sip == NULL)
10515     return;
10516 
10517   bsp = NULL;
10518   if (sip != NULL && (sip->choice != SEQID_GI || sip->data.intvalue > 0)) {
10519     bsp = BioseqLockById (sip);
10520   }
10521   if (bsp == NULL)
10522     return;
10523 
10524   if (IsBioseqOrganelle(bsp)) {
10525     BioseqUnlock (bsp);
10526     return;
10527   }
10528 
10529   BioseqLabel (bsp, id_buf, sizeof (id_buf) - 1, OM_LABEL_CONTENT);
10530 
10531   strt = SeqLocStart (slp);
10532   stop = SeqLocStop (slp);
10533 
10534   strand = SeqLocStrand (slp);
10535 
10536   if (!partial5) {
10537     if (strand == Seq_strand_minus) {
10538       SeqPortStreamInt (bsp, stop - 1, stop, Seq_strand_minus, EXPAND_GAPS_TO_DASHES, (Pointer) buf, NULL);
10539       pos = stop;
10540     } else {
10541       SeqPortStreamInt (bsp, strt, strt + 1, Seq_strand_plus, EXPAND_GAPS_TO_DASHES, (Pointer) buf, NULL);
10542       pos = strt;
10543     }
10544     ch0 = buf[0];
10545     ch1 = buf[1];
10546     if (ch0 == '-' && ch1 == '-') {
10547     } else if (ConsistentWithG (ch0) && ConsistentWithT (ch1)) {
10548     } else if (ch0 == 'G' && ch1 == 'C') {
10549     } else if (pos == 0 || pos == bsp->length - 1) {
10550       ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_NotSpliceConsensusDonor,
10551                 "Splice donor consensus (GT) not found at start of terminal intron, position %ld of %s", (long) (pos + 1), id_buf);
10552     } else {
10553       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_NotSpliceConsensusDonor,
10554                 "Splice donor consensus (GT) not found at start of intron, position %ld of %s", (long) (pos + 1), id_buf);
10555     }
10556   }
10557   if (!partial3) {
10558     if (strand == Seq_strand_minus) {
10559       SeqPortStreamInt (bsp, strt, strt + 1, Seq_strand_minus, EXPAND_GAPS_TO_DASHES, (Pointer) buf, NULL);
10560       pos = strt;
10561     } else {
10562       SeqPortStreamInt (bsp, stop - 1, stop, Seq_strand_plus, EXPAND_GAPS_TO_DASHES, (Pointer) buf, NULL);
10563       pos = stop;
10564     }
10565     ch0 = buf[0];
10566     ch1 = buf[1];
10567     if (ch0 == '-' && ch1 == '-') {
10568     } else if (ConsistentWithA (ch0) && ConsistentWithG (ch1)) {
10569     } else if (pos == 0 || pos == bsp->length - 1) {
10570       ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_NotSpliceConsensusAcceptor,
10571                 "Splice acceptor consensus (AG) not found at end of terminal intron, position %ld of %s, but at end of sequence", (long) (pos + 1), id_buf);
10572     } else {
10573       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_NotSpliceConsensusAcceptor,
10574                 "Splice acceptor consensus (AG) not found at end of intron, position %ld of %s", (long) (pos + 1), id_buf);
10575     }
10576   }
10577   BioseqUnlock (bsp);
10578 }
10579 
10580 //LCOV_EXCL_START
10581 //farloc is never set during indexing, so this function is never called
IsLocInSmallGenomeSet(SeqLocPtr loc)10582 static Boolean IsLocInSmallGenomeSet (
10583   SeqLocPtr loc
10584 )
10585 
10586 {
10587   BioseqPtr  bsp;
10588   SeqIdPtr   sip;
10589   SeqLocPtr  slp;
10590 
10591   if (loc == NULL) return FALSE;
10592 
10593   slp = SeqLocFindNext (loc, NULL);
10594   while (slp != NULL) {
10595     sip = SeqLocId (slp);
10596     if (sip == NULL) return FALSE;
10597     bsp = BioseqFind (sip);
10598     if (bsp == NULL) return FALSE;
10599     slp = SeqLocFindNext (loc, slp);
10600   }
10601 
10602   return TRUE;
10603 }
10604 
10605 //farloc is never set during indexing, so this function is never called
AllPartsInSmallGenomeSet(SeqLocPtr loc,ValidStructPtr vsp,BioseqPtr bsp)10606 static Boolean AllPartsInSmallGenomeSet (
10607   SeqLocPtr loc,
10608   ValidStructPtr vsp,
10609   BioseqPtr bsp
10610 )
10611 
10612 {
10613   BioseqSetPtr  bssp;
10614   SeqEntryPtr   oldscope;
10615   Boolean       rsult = FALSE;
10616   SeqEntryPtr   sep;
10617 
10618   if (loc == NULL || vsp == NULL || bsp == NULL) return FALSE;
10619 
10620   sep = vsp->sep;
10621   if (sep == NULL) return FALSE;
10622   if (! IS_Bioseq_set (sep)) return FALSE;
10623   bssp = (BioseqSetPtr) sep->data.ptrvalue;
10624   if (bssp == NULL) return FALSE;
10625 
10626   /* if genbank set wraps everything, go down one set level */
10627   if (bssp->_class == BioseqseqSet_class_genbank) {
10628     sep = bssp->seq_set;
10629     if (sep == NULL) return FALSE;
10630     if (! IS_Bioseq_set (sep)) return FALSE;
10631     bssp = (BioseqSetPtr) sep->data.ptrvalue;
10632   }
10633 
10634   /* check for small genome set */
10635   if (bssp == NULL || bssp->_class != BioseqseqSet_class_small_genome_set) return FALSE;
10636 
10637   /* scope within small genome set for subsequent BioseqFind calls */
10638   oldscope = SeqEntrySetScope (sep);
10639 
10640   rsult = IsLocInSmallGenomeSet (loc);
10641 
10642   SeqEntrySetScope (oldscope);
10643 
10644   return rsult;
10645 }
10646 //LCOV_EXCL_STOP
10647 
10648 
HasTerminalException(CdRegionPtr crp)10649 static Boolean HasTerminalException (CdRegionPtr crp)
10650 {
10651   CodeBreakPtr cbp;
10652   Boolean rval = FALSE;
10653 
10654   for (cbp = crp->code_break; cbp != NULL && !rval; cbp = cbp->next) {
10655     if (cbp->aa.choice == 1 && cbp->aa.value.intvalue == 42) {
10656       rval = TRUE;
10657     }
10658   }
10659   return TRUE;
10660 }
10661 
10662 
IsAmbiguous(Char ch)10663 static Boolean IsAmbiguous(Char ch)
10664 {
10665   if (ch == 'A' || ch == 'T' || ch == 'G' || ch == 'C' || ch == 'U') {
10666     return FALSE;
10667   } else {
10668     return TRUE;
10669   }
10670 }
10671 
10672 
/*
 * CheckCommentForAmbiguityPhrase
 *   For a coding-region feature whose comment contains the phrase
 *   "ambiguity in stop codon" and which passes HasTerminalException,
 *   streams the sequence under the feature location and looks at the bases
 *   of the final (possibly incomplete) codon.  If none of them is flagged
 *   by IsAmbiguous, ERR_SEQ_FEAT_BadComment is posted at SEV_ERROR.
 *
 *   Does nothing when sfp or vsp is NULL, the feature is not a CDS, the
 *   phrase is absent, or the coding region has no terminal exception.
 *
 *   Two different lengths are involved and must not be mixed up:
 *     loc_len     number of residues streamed for the whole location; this
 *                 sizes the buffer and locates the last base
 *     coding_len  loc_len minus the bases skipped by the reading frame; this
 *                 only determines how many bases the last codon contains
 *   Using the frame-adjusted length for the buffer and the index (as was
 *   done before) under-allocates by one byte for frame 3 and inspects bases
 *   one or two positions before the real end for frames 2 and 3.
 */
static void CheckCommentForAmbiguityPhrase(SeqFeatPtr sfp, ValidStructPtr vsp)
{
  CdRegionPtr crp;
  Int4 loc_len, coding_len, last_codon_len, j;
  CharPtr underlying;
  Boolean has_ambig = FALSE;

  if (sfp == NULL || vsp == NULL || sfp->data.choice != SEQFEAT_CDREGION
      || StringSearch (sfp->comment, "ambiguity in stop codon") == NULL
      || (crp = (CdRegionPtr)(sfp->data.value.ptrvalue)) == NULL
      || !HasTerminalException(crp)) {
    return;
  }

  loc_len = SeqLocLen(sfp->location);

  /* bases before the first full codon do not take part in codon counting */
  coding_len = loc_len;
  if (crp->frame == 2) {
    coding_len -= 1;
  } else if (crp->frame == 3) {
    coding_len -= 2;
  }

  if (coding_len > 0) {
    last_codon_len = coding_len % 3;
    if (last_codon_len == 0) {
      last_codon_len = 3;
    }
  } else {
    /* no coding bases at all (or length unavailable): nothing to inspect */
    last_codon_len = 0;
  }

  if (last_codon_len > 0) {
    /* room for every streamed residue plus the terminating NUL */
    underlying = MemNew (sizeof (Char) * (loc_len + 2));
    if (underlying != NULL) {
      SeqPortStreamLoc (sfp->location, EXPAND_GAPS_TO_DASHES, (Pointer) underlying, NULL);
      /* last_codon_len <= coding_len <= loc_len, so the index stays >= 0 */
      for (j = 0; j < last_codon_len && !has_ambig; j++) {
        has_ambig = IsAmbiguous(*(underlying + loc_len - 1 - j));
      }
      underlying = MemFree (underlying);
    }
  }

  if (!has_ambig) {
    ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_BadComment,
              "Feature comment indicates ambiguity in stop codon but no ambiguities are present in stop codon.");
  }
}
10712 
10713 
10714 /*****************************************************************************
10715 *
*   ValidateSeqFeatCommon(sfp, bvsp, vsp, left, right, numivals, featitemid, farloc, bsp)
*      Helper shared by the feature callbacks for validating a feature in the context of its Bioseq
10718 *
10719 *****************************************************************************/
/*
 * ValidateSeqFeatCommon
 *   Context checks applied to one feature on one Bioseq:
 *     - feature types that are invalid on protein / nucleotide / mRNA /
 *       pre-RNA Bioseqs, and counting of full-length protein features
 *     - 'far' feature locations
 *     - serial-number references and the stop-codon ambiguity phrase in the
 *       feature comment
 *     - features placed directly on a segmented Bioseq
 *     - intron splice-site consensus
 *     - pseudogene qualifier agreement between a CDS/mRNA and its gene
 *
 *   sfp         feature being validated
 *   bvsp        per-Bioseq state (flags are read, counters and got_a_pub
 *               are updated)
 *   vsp         validator state; vsp->sfp is set and vsp->descr cleared
 *   left,right  extremes of the feature on the Bioseq
 *   numivals    interval count of the feature location
 *   featitemid  when > 0, temporarily installed in vsp->gcp (together with
 *               type OBJ_SEQFEAT) so messages point at this feature; the
 *               previous values are restored before returning
 *   farloc      caller's flag that the location refers to a far sequence
 *   bsp         Bioseq the feature is indexed on; may be NULL
 *
 *   Always returns TRUE.
 */
static Boolean ValidateSeqFeatCommon (SeqFeatPtr sfp, BioseqValidStrPtr bvsp, ValidStructPtr vsp,
                                      Int4 left, Int4 right, Int2 numivals, Uint4 featitemid, Boolean farloc, BioseqPtr bsp)
{
  BioseqSetPtr      parentset;
  Boolean           gene_not_allowed;
  GatherContextPtr  gcp = NULL;
  ImpFeatPtr        ifp;
  Uint2             saved_itemtype = 0;
  Uint4             saved_itemid = 0;
  ProtRefPtr        prp;
  RnaRefPtr         rrp;
  SeqLocPtr         slp;
  SeqIdPtr          sip;
  TextSeqIdPtr      tsip;
  Boolean           on_seg = FALSE;
  Boolean           is_emb = FALSE;
  Boolean           is_nc = FALSE;
  Boolean           is_refseq = FALSE;
  ErrSev            sev;
  GBQualPtr         gbq;
  GeneRefPtr        grp;
  CharPtr           sfp_pseudo, gene_pseudo;
  SeqFeatPtr        gene;


  vsp->descr = NULL;
  vsp->sfp = sfp;

  /* point error reporting at this feature for the duration of the call */
  if (featitemid > 0) {
    gcp = vsp->gcp;
  }
  if (gcp != NULL) {
    saved_itemid = gcp->itemID;
    saved_itemtype = gcp->thistype;
    gcp->itemID = featitemid;
    gcp->thistype = OBJ_SEQFEAT;
  }

  /* classify the Bioseq by its ids */
  /* NOTE(review): is_nc is driven by an "NT_" accession prefix - confirm that is intended */
  if (bsp != NULL) {
    for (sip = bsp->id; sip != NULL; sip = sip->next) {
      if (sip->choice == SEQID_EMBL) {
        is_emb = TRUE;
        continue;
      }
      if (sip->choice != SEQID_OTHER) continue;
      is_refseq = TRUE;
      tsip = (TextSeqIdPtr) sip->data.ptrvalue;
      if (tsip == NULL || tsip->accession == NULL) continue;
      if (StringNICmp (tsip->accession, "NT_", 3) == 0) {
        is_nc = TRUE;
      }
    }
  }

  if (bvsp->is_aa) {
    /* tally protein features that span the whole protein, by processing state */
    if (sfp->data.choice == SEQFEAT_PROT &&
        (left == 0) && (right == ((vsp->bsp->length) - 1))) {
      bvsp->num_full_length_prot_ref++;
      prp = (ProtRefPtr) sfp->data.value.ptrvalue;
      if (prp != NULL) {
        if (prp->processed == 0) {
          bvsp->num_justprot++;
        } else if (prp->processed == 1) {
          bvsp->num_preprot++;
        } else if (prp->processed == 2) {
          bvsp->num_matpep++;
        } else if (prp->processed == 3) {
          bvsp->num_sigpep++;
        } else if (prp->processed == 4) {
          bvsp->num_transpep++;
        }
      }
    }

    switch (sfp->data.choice) {
    case SEQFEAT_CDREGION:
    case SEQFEAT_RNA:
    case SEQFEAT_RSITE:
    case SEQFEAT_TXINIT:
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidForType, "Invalid feature for a protein Bioseq.");
      break;
    case SEQFEAT_GENE:
      if (bsp == NULL) break;
      /* a gene on a protein is an error only inside certain enclosing set classes */
      gene_not_allowed = FALSE;
      parentset = NULL;
      if (bsp->idx.parenttype == OBJ_BIOSEQSET) {
        parentset = (BioseqSetPtr) bsp->idx.parentptr;
      }
      while (parentset != NULL) {
        switch (parentset->_class) {
        case BioseqseqSet_class_nuc_prot :
        case BioseqseqSet_class_mut_set :
        case BioseqseqSet_class_pop_set :
        case BioseqseqSet_class_phy_set :
        case BioseqseqSet_class_eco_set :
        case BioseqseqSet_class_gen_prod_set :
          gene_not_allowed = TRUE;
          break;
        default :
          break;
        }
        if (parentset->idx.parenttype != OBJ_BIOSEQSET) {
          parentset = NULL;
        } else {
          parentset = (BioseqSetPtr) parentset->idx.parentptr;
        }
      }
      if (gene_not_allowed) {
        ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidForType, "Invalid feature for a protein Bioseq.");
      }
      break;
    default:
      break;
    }

  } else if (sfp->data.choice == SEQFEAT_PROT || sfp->data.choice == SEQFEAT_PSEC_STR) {
    ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidForType, "Invalid feature for a nucleotide Bioseq.");
  }

  if (bvsp->is_mrna) {
    if (sfp->data.choice == SEQFEAT_CDREGION) {
      /* multi-interval CDS is tolerated only with a ribosomal slippage exception */
      if (numivals > 1 &&
          ((! sfp->excpt) ||
           (StringISearch (sfp->except_text, "ribosomal slippage") == NULL))) {
        sev = is_refseq ? SEV_WARNING : SEV_ERROR;
        ValidErr (vsp, sev, ERR_SEQ_FEAT_InvalidForType, "Multi-interval CDS feature is invalid on an mRNA (cDNA) Bioseq.");
      }
    } else if (sfp->data.choice == SEQFEAT_RNA) {
      rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
      if (rrp != NULL && rrp->type == 2) {
        ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidForType, "mRNA feature is invalid on an mRNA (cDNA) Bioseq.");
      }
    } else if (sfp->data.choice == SEQFEAT_IMP) {
      ifp = (ImpFeatPtr) sfp->data.value.ptrvalue;
      if (ifp != NULL && ifp->key != NULL && (!HasNoText (ifp->key))) {
        if (StringCmp (ifp->key, "intron") == 0 || StringCmp (ifp->key, "CAAT_signal") == 0) {
          ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidForType, "Invalid feature for an mRNA Bioseq.");
        }
      }
    }
  } else if (bvsp->is_prerna) {
    if (sfp->data.choice == SEQFEAT_IMP) {
      ifp = (ImpFeatPtr) sfp->data.value.ptrvalue;
      if (ifp != NULL && ifp->key != NULL && (!HasNoText (ifp->key))) {
        if (StringCmp (ifp->key, "CAAT_signal") == 0) {
          //LCOV_EXCL_START
          //BasicCleanup converts this feature to regulatory class, code not reachable
          ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidForType, "Invalid feature for an pre-RNA Bioseq.");
          //LCOV_EXCL_STOP
        }
      }
    }
  }

  if (farloc && (! is_nc) && (! is_emb) && (! AllPartsInSmallGenomeSet (sfp->location, vsp, bsp))) {
    //LCOV_EXCL_START
    //farloc is never set during indexing
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FarLocation, "Feature has 'far' location - accession not packaged in record");
    //LCOV_EXCL_STOP
  }

  if (sfp->data.choice == SEQFEAT_PUB || sfp->cit != NULL) {
    bvsp->got_a_pub = TRUE;
  }

  if (SerialNumberInString ((CharPtr) sfp->comment)) {
    ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_SerialInComment,
              "Feature comment may refer to reference by serial number - attach reference specific comments to the reference REMARK instead.");
  }
  CheckCommentForAmbiguityPhrase(sfp, vsp);

  if (bsp != NULL && bsp->repr == Seq_repr_seg) {
//LCOV_EXCL_START
// Only for SegSets
    for (slp = SeqLocFindNext (sfp->location, NULL);
         slp != NULL;
         slp = SeqLocFindNext (sfp->location, slp)) {
      sip = SeqLocId (slp);
      if (sip != NULL && SeqIdIn (sip, bsp->id)) {
        on_seg = TRUE;
      }
    }
    if (on_seg) {
      sev = is_nc ? SEV_WARNING : SEV_ERROR;
      if (! DeltaOrFarSeg (vsp->sep, sfp->location)) {
        ValidErr (vsp, sev, ERR_SEQ_FEAT_LocOnSegmentedBioseq, "Feature location on segmented bioseq, not on parts");
      }
    }
//LCOV_EXCL_STOP
  }

  /* splice sites are checked unless a nonconsensus exception is declared */
  if (sfp->idx.subtype == FEATDEF_intron) {
    if ((! sfp->excpt) ||
        StringISearch (sfp->except_text, "nonconsensus splice site") == NULL) {
      ValidateIntronEndsAtSpliceSiteOrGap (vsp, sfp->location);
    }
  }

  if (sfp->idx.subtype == FEATDEF_CDS || sfp->idx.subtype == FEATDEF_mRNA) {
    /* locate the gene: by overlap when there is no xref, else via the xref */
    gene = NULL;
    grp = SeqMgrGetGeneXref (sfp);
    if (grp == NULL) {
      gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
    } else if (! SeqMgrGeneIsSuppressed (grp)) {
      if (StringDoesHaveText (grp->locus_tag)) {
        gene = SeqMgrGetGeneByLocusTag (bsp, grp->locus_tag, NULL);
      } else if (StringDoesHaveText (grp->locus)) {
        gene = SeqMgrGetFeatureByLabel (bsp, grp->locus, SEQFEAT_GENE, 0, NULL);
      }
    }

    if (gene != NULL && gene->pseudo && sfp->pseudo) {
      /* the last non-empty pseudogene qualifier wins on each side */
      sfp_pseudo = "unqualified";
      for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
        if (StringICmp (gbq->qual, "pseudogene") == 0 && ! StringHasNoText (gbq->val)) {
          sfp_pseudo = gbq->val;
        }
      }
      gene_pseudo = "unqualified";
      for (gbq = gene->qual; gbq != NULL; gbq = gbq->next) {
        if (StringICmp (gbq->qual, "pseudogene") == 0 && ! StringHasNoText (gbq->val)) {
          gene_pseudo = gbq->val;
        }
      }

      if (StringCmp (sfp_pseudo, gene_pseudo) != 0) {
        if (sfp->idx.subtype == FEATDEF_CDS) {
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InconsistentPseudogeneValue,
                    "Different pseudogene values on CDS (%s) and gene (%s)",
                    sfp_pseudo, gene_pseudo);
        } else if (sfp->idx.subtype == FEATDEF_mRNA) {
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InconsistentPseudogeneValue,
                    "Different pseudogene values on mRNA (%s) and gene (%s)",
                    sfp_pseudo, gene_pseudo);
        }
      }
    }
  }

  /* put the gather context back the way it was found */
  if (gcp != NULL) {
    gcp->itemID = saved_itemid;
    gcp->thistype = saved_itemtype;
  }

  return TRUE;
}
11007 
GeneSpansOrigin(SeqMgrFeatContextPtr context,Int4 bsplength)11008 static Boolean GeneSpansOrigin (SeqMgrFeatContextPtr context, Int4 bsplength)
11009 
11010 {
11011   Int4Ptr  ivals;
11012 
11013   if (context == NULL || bsplength < 1) return FALSE;
11014   ivals = context->ivals;
11015   if (ivals == NULL || context->numivals != 2) return FALSE;
11016   if (context->strand == Seq_strand_minus) {
11017     if (ivals [1] == 0 && ivals [2] == bsplength - 1) return TRUE;
11018   } else {
11019     if (ivals [2] == 0 && ivals [1] == bsplength - 1) return TRUE;
11020   }
11021   return FALSE;
11022 }
11023 
/*
 * CheckMultiIntervalGene
 *   Reports gene features whose location has more than one interval.
 *
 *   No report is made when the gene carries a "trans-splicing" exception,
 *   or when the Bioseq is circular (topology 2) and the two intervals are
 *   accepted by GeneSpansOrigin.  When the location has no single Seq-id
 *   the gene is only considered on a segmented Bioseq, and then only if the
 *   location still has two or more pieces after SeqLocMerge.
 *
 *   While the message is posted, gcp (if given) is pointed at this feature
 *   and vsp->sfp is set; gcp is restored and vsp->sfp cleared afterwards.
 *   Severity is SEV_WARNING on every path through this function.
 */
static void CheckMultiIntervalGene (SeqFeatPtr sfp, SeqMgrFeatContextPtr context, ValidStructPtr vsp, GatherContextPtr gcp)

{
  BioseqPtr     bsp;
  Int4          npieces;
  SeqLocPtr     merged = NULL;
  Uint2         saved_itemtype = 0;
  Uint4         saved_itemid = 0;
  Boolean       segmented = FALSE;
  ErrSev        sev = /* SEV_ERROR */ SEV_WARNING;
  SeqIdPtr      sip;
  SeqLocPtr     slp;
  TextSeqIdPtr  tsip;

  if (sfp == NULL || context == NULL || vsp == NULL) return;
  if (context->numivals < 2) return;
  if (sfp->excpt && StringISearch (sfp->except_text, "trans-splicing") != NULL) return;

  if (SeqLocId (sfp->location) == NULL) {
    bsp = context->bsp;
    if (bsp == NULL || bsp->repr != Seq_repr_seg) return;
//LCOV_EXCL_START
// Only for SegSets
    merged = SeqLocMerge (bsp, sfp->location, NULL, FALSE, TRUE, FALSE);
    if (merged == NULL) return;
    npieces = 0;
    for (slp = SeqLocFindNext (merged, NULL); slp != NULL; slp = SeqLocFindNext (merged, slp)) {
      npieces++;
    }
    SeqLocFree (merged);
    if (npieces < 2) return;
    segmented = TRUE;
//LCOV_EXCL_STOP
  }

  bsp = context->bsp;
  if (bsp != NULL) {
    for (sip = bsp->id; sip != NULL; sip = sip->next) {
      if (sip->choice == SEQID_EMBL || sip->choice == SEQID_DDBJ) {
        sev = SEV_WARNING;
      } else if (sip->choice == SEQID_OTHER) {
        tsip = (TextSeqIdPtr) sip->data.ptrvalue;
        if (tsip != NULL && tsip->accession != NULL &&
            StringNICmp (tsip->accession, "NC_", 3) == 0) {
          sev = SEV_WARNING;
        }
      }
    }
    if (bsp->topology == 2) {
      /* a two-interval gene wrapping the origin of a circular molecule is fine */
      if (context->numivals == 2 && GeneSpansOrigin (context, bsp->length)) return;
      sev = SEV_WARNING;
    }
  }

  if (gcp != NULL) {
    saved_itemid = gcp->itemID;
    saved_itemtype = gcp->thistype;
    gcp->itemID = context->itemID;
    gcp->thistype = OBJ_SEQFEAT;
  }

  vsp->sfp = sfp;
  if (segmented) {
    //LCOV_EXCL_START
    //segmented sets are obsolete
    ValidErr (vsp, sev, ERR_SEQ_FEAT_SegmentedGeneProblem,
              "Gene feature on segmented sequence should cover all bases within its extremes");
    //LCOV_EXCL_STOP
  } else if (vsp->is_small_genome_set) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_MultiIntervalGene,
              "Multiple interval gene feature in small genome set - set trans-splicing exception if appropriate");
  } else {
    ValidErr (vsp, sev, ERR_SEQ_FEAT_MultiIntervalGene,
              "Gene feature on non-segmented sequence should not have multiple intervals");
  }
  vsp->sfp = NULL;

  if (gcp != NULL) {
    gcp->itemID = saved_itemid;
    gcp->thistype = saved_itemtype;
  }
}
11112 
ValidateSeqFeatIndexed(SeqFeatPtr sfp,SeqMgrFeatContextPtr context)11113 static Boolean LIBCALLBACK ValidateSeqFeatIndexed (SeqFeatPtr sfp, SeqMgrFeatContextPtr context)
11114 {
11115   ValidStructPtr  vsp;
11116   BioseqValidStrPtr bvsp;
11117 
11118   bvsp = (BioseqValidStrPtr) context->userdata;
11119   vsp = bvsp->vsp;
11120 
11121   if (sfp->data.choice == SEQFEAT_GENE) {
11122     CheckMultiIntervalGene (sfp, context, vsp, vsp->gcp);
11123   }
11124 
11125   return ValidateSeqFeatCommon (sfp, bvsp, vsp, context->left, context->right, context->numivals, context->itemID, context->farloc, context->bsp);
11126 }
11127 
11128 //LCOV_EXCL_START
/*
 * ValidateSeqFeatContext
 *   Gather-based entry point: takes the per-Bioseq state from gcp->userdata
 *   and the feature from gcp->thisitem, then runs ValidateSeqFeatCommon
 *   with the gathered extremes.  No interval count, item id, far-location
 *   flag or Bioseq is supplied (0, 0, FALSE, NULL), and the result of the
 *   common routine is discarded.
 */
static void ValidateSeqFeatContext (GatherContextPtr gcp)
{
  BioseqValidStrPtr  bvsp = (BioseqValidStrPtr) (gcp->userdata);
  SeqFeatPtr         sfp = (SeqFeatPtr) (gcp->thisitem);

  ValidateSeqFeatCommon (sfp, bvsp, bvsp->vsp,
                         gcp->extremes.left, gcp->extremes.right,
                         0, 0, FALSE, NULL);
}
11141 //LCOV_EXCL_STOP
11142 
11143 /*****************************************************************************
11144 *
11145 *   CountryIsValid(name)
11146 *      Validates subsource country against official country names
11147 *
11148 *****************************************************************************/
11149 
11150 static CharPtr  Nlm_valid_country_codes [] = {
11151   "Afghanistan",
11152   "Albania",
11153   "Algeria",
11154   "American Samoa",
11155   "Andorra",
11156   "Angola",
11157   "Anguilla",
11158   "Antarctica",
11159   "Antigua and Barbuda",
11160   "Arctic Ocean",
11161   "Argentina",
11162   "Armenia",
11163   "Aruba",
11164   "Ashmore and Cartier Islands",
11165   "Atlantic Ocean",
11166   "Australia",
11167   "Austria",
11168   "Azerbaijan",
11169   "Bahamas",
11170   "Bahrain",
11171   "Baker Island",
11172   "Baltic Sea",
11173   "Bangladesh",
11174   "Barbados",
11175   "Bassas da India",
11176   "Belarus",
11177   "Belgium",
11178   "Belize",
11179   "Benin",
11180   "Bermuda",
11181   "Bhutan",
11182   "Bolivia",
11183   "Borneo",
11184   "Bosnia and Herzegovina",
11185   "Botswana",
11186   "Bouvet Island",
11187   "Brazil",
11188   "British Virgin Islands",
11189   "Brunei",
11190   "Bulgaria",
11191   "Burkina Faso",
11192   "Burundi",
11193   "Cambodia",
11194   "Cameroon",
11195   "Canada",
11196   "Cape Verde",
11197   "Cayman Islands",
11198   "Central African Republic",
11199   "Chad",
11200   "Chile",
11201   "China",
11202   "Christmas Island",
11203   "Clipperton Island",
11204   "Cocos Islands",
11205   "Colombia",
11206   "Comoros",
11207   "Cook Islands",
11208   "Coral Sea Islands",
11209   "Costa Rica",
11210   "Cote d'Ivoire",
11211   "Croatia",
11212   "Cuba",
11213   "Curacao",
11214   "Cyprus",
11215   "Czech Republic",
11216   "Democratic Republic of the Congo",
11217   "Denmark",
11218   "Djibouti",
11219   "Dominica",
11220   "Dominican Republic",
11221   "Ecuador",
11222   "Egypt",
11223   "El Salvador",
11224   "Equatorial Guinea",
11225   "Eritrea",
11226   "Estonia",
11227   "Ethiopia",
11228   "Europa Island",
11229   "Falkland Islands (Islas Malvinas)",
11230   "Faroe Islands",
11231   "Fiji",
11232   "Finland",
11233   "France",
11234   "French Guiana",
11235   "French Polynesia",
11236   "French Southern and Antarctic Lands",
11237   "Gabon",
11238   "Gambia",
11239   "Gaza Strip",
11240   "Georgia",
11241   "Germany",
11242   "Ghana",
11243   "Gibraltar",
11244   "Glorioso Islands",
11245   "Greece",
11246   "Greenland",
11247   "Grenada",
11248   "Guadeloupe",
11249   "Guam",
11250   "Guatemala",
11251   "Guernsey",
11252   "Guinea",
11253   "Guinea-Bissau",
11254   "Guyana",
11255   "Haiti",
11256   "Heard Island and McDonald Islands",
11257   "Honduras",
11258   "Hong Kong",
11259   "Howland Island",
11260   "Hungary",
11261   "Iceland",
11262   "India",
11263   "Indian Ocean",
11264   "Indonesia",
11265   "Iran",
11266   "Iraq",
11267   "Ireland",
11268   "Isle of Man",
11269   "Israel",
11270   "Italy",
11271   "Jamaica",
11272   "Jan Mayen",
11273   "Japan",
11274   "Jarvis Island",
11275   "Jersey",
11276   "Johnston Atoll",
11277   "Jordan",
11278   "Juan de Nova Island",
11279   "Kazakhstan",
11280   "Kenya",
11281   "Kerguelen Archipelago",
11282   "Kingman Reef",
11283   "Kiribati",
11284   "Kosovo",
11285   "Kuwait",
11286   "Kyrgyzstan",
11287   "Laos",
11288   "Latvia",
11289   "Lebanon",
11290   "Lesotho",
11291   "Liberia",
11292   "Libya",
11293   "Liechtenstein",
11294   "Line Islands",
11295   "Lithuania",
11296   "Luxembourg",
11297   "Macau",
11298   "Macedonia",
11299   "Madagascar",
11300   "Malawi",
11301   "Malaysia",
11302   "Maldives",
11303   "Mali",
11304   "Malta",
11305   "Marshall Islands",
11306   "Martinique",
11307   "Mauritania",
11308   "Mauritius",
11309   "Mayotte",
11310   "Mediterranean Sea",
11311   "Mexico",
11312   "Micronesia",
11313   "Midway Islands",
11314   "Moldova",
11315   "Monaco",
11316   "Mongolia",
11317   "Montenegro",
11318   "Montserrat",
11319   "Morocco",
11320   "Mozambique",
11321   "Myanmar",
11322   "Namibia",
11323   "Nauru",
11324   "Navassa Island",
11325   "Nepal",
11326   "Netherlands",
11327   "New Caledonia",
11328   "New Zealand",
11329   "Nicaragua",
11330   "Niger",
11331   "Nigeria",
11332   "Niue",
11333   "Norfolk Island",
11334   "North Korea",
11335   "North Sea",
11336   "Northern Mariana Islands",
11337   "Norway",
11338   "Oman",
11339   "Pacific Ocean",
11340   "Pakistan",
11341   "Palau",
11342   "Palestine",
11343   "Palmyra Atoll",
11344   "Panama",
11345   "Papua New Guinea",
11346   "Paracel Islands",
11347   "Paraguay",
11348   "Peru",
11349   "Philippines",
11350   "Pitcairn Islands",
11351   "Poland",
11352   "Portugal",
11353   "Puerto Rico",
11354   "Qatar",
11355   "Republic of the Congo",
11356   "Reunion",
11357   "Romania",
11358   "Ross Sea",
11359   "Russia",
11360   "Rwanda",
11361   "Saint Barthelemy",
11362   "Saint Helena",
11363   "Saint Kitts and Nevis",
11364   "Saint Lucia",
11365   "Saint Martin",
11366   "Saint Pierre and Miquelon",
11367   "Saint Vincent and the Grenadines",
11368   "Samoa",
11369   "San Marino",
11370   "Sao Tome and Principe",
11371   "Saudi Arabia",
11372   "Senegal",
11373   "Serbia",
11374   "Seychelles",
11375   "Sierra Leone",
11376   "Singapore",
11377   "Sint Maarten",
11378   "Slovakia",
11379   "Slovenia",
11380   "Solomon Islands",
11381   "Somalia",
11382   "South Africa",
11383   "South Georgia and the South Sandwich Islands",
11384   "South Korea",
11385   "South Sudan",
11386   "Southern Ocean",
11387   "Spain",
11388   "Spratly Islands",
11389   "Sri Lanka",
11390   "State of Palestine",
11391   "Sudan",
11392   "Suriname",
11393   "Svalbard",
11394   "Swaziland",
11395   "Sweden",
11396   "Switzerland",
11397   "Syria",
11398   "Taiwan",
11399   "Tajikistan",
11400   "Tanzania",
11401   "Tasman Sea",
11402   "Thailand",
11403   "Timor-Leste",
11404   "Togo",
11405   "Tokelau",
11406   "Tonga",
11407   "Trinidad and Tobago",
11408   "Tromelin Island",
11409   "Tunisia",
11410   "Turkey",
11411   "Turkmenistan",
11412   "Turks and Caicos Islands",
11413   "Tuvalu",
11414   "Uganda",
11415   "Ukraine",
11416   "United Arab Emirates",
11417   "United Kingdom",
11418   "Uruguay",
11419   "USA",
11420   "Uzbekistan",
11421   "Vanuatu",
11422   "Venezuela",
11423   "Viet Nam",
11424   "Virgin Islands",
11425   "Wake Island",
11426   "Wallis and Futuna",
11427   "West Bank",
11428   "Western Sahara",
11429   "Yemen",
11430   "Zambia",
11431   "Zimbabwe",
11432   NULL
11433 };
11434 
11435 static CharPtr  Nlm_formerly_valid_country_codes [] = {
11436   "Belgian Congo",
11437   "British Guiana",
11438   "Burma",
11439   "Czechoslovakia",
11440   "East Timor",
11441   "Korea",
11442   "Netherlands Antilles",
11443   "Serbia and Montenegro",
11444   "Siam",
11445   "USSR",
11446   "Yugoslavia",
11447   "Zaire",
11448   NULL
11449 };
11450 
11451 //LCOV_EXCL_START
GetValidCountryList(void)11452 NLM_EXTERN CharPtr PNTR GetValidCountryList(void)
11453 
11454 {
11455   return (CharPtr PNTR) Nlm_valid_country_codes;
11456 }
11457 
GetFormerCountryList(void)11458 NLM_EXTERN CharPtr PNTR GetFormerCountryList (void)
11459 
11460 {
11461   return (CharPtr PNTR) Nlm_formerly_valid_country_codes;
11462 }
11463 //LCOV_EXCL_STOP
11464 
CountryIsValid(CharPtr name,BoolPtr old_countryP,BoolPtr bad_capP)11465 NLM_EXTERN Boolean CountryIsValid (CharPtr name, BoolPtr old_countryP, BoolPtr bad_capP)
11466 {
11467   Int2     L, R, mid;
11468   CharPtr  ptr;
11469   Char     str [256];
11470 
11471   if (StringHasNoText (name)) return FALSE;
11472 
11473   StringNCpy_0 (str, name, sizeof (str));
11474   ptr = StringChr (str, ':');
11475   if (ptr != NULL) {
11476     *ptr = '\0';
11477   }
11478 
11479   L = 0;
11480   R = sizeof (Nlm_valid_country_codes) / sizeof (Nlm_valid_country_codes [0]) - 1; /* -1 because now NULL terminated */
11481 
11482   while (L < R) {
11483     mid = (L + R) / 2;
11484     if (StringICmp (Nlm_valid_country_codes [mid], str) < 0) {
11485       L = mid + 1;
11486     } else {
11487       R = mid;
11488     }
11489   }
11490 
11491   if (StringICmp (Nlm_valid_country_codes [R], str) == 0) {
11492     if (bad_capP != NULL) {
11493       if (StringCmp (Nlm_valid_country_codes [R], str) != 0) {
11494         *bad_capP = TRUE;
11495       }
11496     }
11497     return TRUE;
11498   }
11499 
11500   L = 0;
11501   R = sizeof (Nlm_formerly_valid_country_codes) / sizeof (Nlm_formerly_valid_country_codes [0]) - 1; /* -1 because now NULL terminated */
11502 
11503   while (L < R) {
11504     mid = (L + R) / 2;
11505     if (StringICmp (Nlm_formerly_valid_country_codes [mid], str) < 0) {
11506       L = mid + 1;
11507     } else {
11508       R = mid;
11509     }
11510   }
11511 
11512   if (StringICmp (Nlm_formerly_valid_country_codes [R], str) == 0) {
11513     if (old_countryP != NULL) {
11514       *old_countryP = TRUE;
11515     }
11516     if (bad_capP != NULL) {
11517       if (StringCmp (Nlm_formerly_valid_country_codes [R], str) != 0) {
11518         *bad_capP = TRUE;
11519       }
11520     }
11521     return FALSE;
11522   }
11523 
11524   return FALSE;
11525 }
11526 
11527 
11528 //LCOV_EXCL_START
GetCorrectedCountryCapitalization(CharPtr name)11529 NLM_EXTERN CharPtr GetCorrectedCountryCapitalization (CharPtr name)
11530 {
11531   Int2     L, R, mid;
11532   CharPtr  ptr;
11533   Char     str [256];
11534 
11535   if (StringHasNoText (name)) return NULL;
11536 
11537   StringNCpy_0 (str, name, sizeof (str));
11538   ptr = StringChr (str, ':');
11539   if (ptr != NULL) {
11540     *ptr = '\0';
11541   }
11542 
11543   L = 0;
11544   R = sizeof (Nlm_valid_country_codes) / sizeof (Nlm_valid_country_codes [0]) - 1; /* -1 because now NULL terminated */
11545 
11546   while (L < R) {
11547     mid = (L + R) / 2;
11548     if (StringICmp (Nlm_valid_country_codes [mid], str) < 0) {
11549       L = mid + 1;
11550     } else {
11551       R = mid;
11552     }
11553   }
11554 
11555   if (StringICmp (Nlm_valid_country_codes [R], str) == 0) {
11556     return Nlm_valid_country_codes[R];
11557   }
11558 
11559   return NULL;
11560 }
11561 
11562 static CharPtr bodiesOfWater [] = {
11563   "Basin",
11564   "Bay",
11565   "Bight",
11566   "Canal",
11567   "Channel",
11568   "Coastal",
11569   "Cove",
11570   "Estuary",
11571   "Fjord",
11572   "Freshwater",
11573   "Gulf",
11574   "Harbor",
11575   "Inlet",
11576   "Lagoon",
11577   "Lake",
11578   "Narrows",
11579   "Ocean",
11580   "Offshore",
11581   "Passage",
11582   "Passages",
11583   "Reef",
11584   "River",
11585   "Sea",
11586   "Seawater",
11587   "Sound",
11588   "Strait",
11589   "Trench",
11590   "Trough",
11591   "Water",
11592   "Waters",
11593   NULL
11594 };
11595 
GetBodiesOfWaterFSA(void)11596 static TextFsaPtr GetBodiesOfWaterFSA (void)
11597 
11598 
11599 {
11600   TextFsaPtr  fsa;
11601   Int2        i;
11602   CharPtr     prop = "BodiesOfWaterFSA";
11603 
11604   fsa = (TextFsaPtr) GetAppProperty (prop);
11605   if (fsa != NULL) return fsa;
11606 
11607   fsa = TextFsaNew ();
11608   if (fsa != NULL) {
11609     for (i = 0; bodiesOfWater [i] != NULL; i++) {
11610       TextFsaAdd (fsa, bodiesOfWater [i]);
11611     }
11612   }
11613 
11614   SetAppProperty (prop, (Pointer) fsa);
11615 
11616   return fsa;
11617 }
11618 
StringContainsBodyOfWater(CharPtr str)11619 NLM_EXTERN Boolean StringContainsBodyOfWater (CharPtr str)
11620 
11621 {
11622   Char        ch;
11623   TextFsaPtr  fsa;
11624   CharPtr     ptr;
11625   Int4        state;
11626   ValNodePtr  matches;
11627 
11628   if (StringHasNoText (str)) return FALSE;
11629 
11630   fsa = GetBodiesOfWaterFSA ();
11631   if (fsa == NULL) return FALSE;
11632 
11633   state = 0;
11634   ptr = str;
11635   ch = *ptr;
11636 
11637   while (ch != '\0') {
11638     matches = NULL;
11639     state = TextFsaNext (fsa, state, ch, &matches);
11640     ptr++;
11641     ch = *ptr;
11642     if (ch == '\0' || ch == ',' || ch == ':' || ch == ';' || ch == ' ') {
11643       if (matches != NULL) return TRUE;
11644       state = 0;
11645     }
11646   }
11647 
11648   return FALSE;
11649 }
11650 //LCOV_EXCL_STOP
11651 
11652 /* BEGINNING OF NEW LATITUDE-LONGITUDE COUNTRY VALIDATION CODE */
11653 
11654 /* latitude-longitude to country conversion */
11655 
11656 typedef struct ctyblock {
11657   CharPtr  name;    /* name of country or country: subregion */
11658   CharPtr  level0;  /* just the country */
11659   CharPtr  level1;  /* just the subregion */
11660   Int4     area;    /* pixel area for choosing smallest overlapping subregion */
11661   Int4     minlat;  /* minimum latitude */
11662   Int4     maxlat;  /* maximum latitude */
11663   Int4     minlon;  /* minimum longitude */
11664   Int4     maxlon;  /* maximum longitude */
11665 } CtyBlock, PNTR CtyBlockPtr;
11666 
11667 typedef struct latblock {
11668   CtyBlockPtr  landmass;   /* points to instance in countries list */
11669   Int4         lat;        /* latitude (integer in 10ths of a degree) */
11670   Int4         minlon;     /* minimum longitude */
11671   Int4         maxlon;     /* maximum longitude */
11672 } LatBlock, PNTR LatBlockPtr;
11673 
11674 typedef struct ctryset {
11675   ValNodePtr        ctyblocks;      /* linked list of country blocks */
11676   CtyBlockPtr PNTR  ctyarray;       /* country blocks sorted by name */
11677   Int4              numCtyBlocks;
11678   ValNodePtr        latblocks;      /* linked list of latitude blocks */
11679   LatBlockPtr PNTR  latarray;       /* latitude blocks sorted by latitude then longitude */
11680   Int4              numLatBlocks;
11681   FloatHi           scale;
11682 } CtrySet, PNTR CtrySetPtr;
11683 
SortByCountry(VoidPtr ptr1,VoidPtr ptr2)11684 static int LIBCALLBACK SortByCountry (
11685   VoidPtr ptr1,
11686   VoidPtr ptr2
11687 )
11688 
11689 {
11690   CtyBlockPtr  cbp1;
11691   CtyBlockPtr  cbp2;
11692   int          cmp;
11693   ValNodePtr   vnp1;
11694   ValNodePtr   vnp2;
11695 
11696   if (ptr1 == NULL || ptr2 == NULL) return 0;
11697   vnp1 = *((ValNodePtr PNTR) ptr1);
11698   vnp2 = *((ValNodePtr PNTR) ptr2);
11699   if (vnp1 == NULL || vnp2 == NULL) return 0;
11700   cbp1 = (CtyBlockPtr) vnp1->data.ptrvalue;
11701   cbp2 = (CtyBlockPtr) vnp2->data.ptrvalue;
11702   if (cbp1 == NULL || cbp2 == NULL) return 0;
11703 
11704   cmp = StringICmp (cbp1->name, cbp2->name);
11705   if (cmp > 0) {
11706     return 1;
11707   } else if (cmp < 0) {
11708     return -1;
11709   }
11710 
11711   return 0;
11712 }
11713 
SortByLatLon(VoidPtr ptr1,VoidPtr ptr2)11714 static int LIBCALLBACK SortByLatLon (
11715   VoidPtr ptr1,
11716   VoidPtr ptr2
11717 )
11718 
11719 {
11720   CtyBlockPtr  cbp1;
11721   CtyBlockPtr  cbp2;
11722   int          cmp;
11723   LatBlockPtr  lbp1;
11724   LatBlockPtr  lbp2;
11725   ValNodePtr   vnp1;
11726   ValNodePtr   vnp2;
11727 
11728   if (ptr1 == NULL || ptr2 == NULL) return 0;
11729   vnp1 = *((ValNodePtr PNTR) ptr1);
11730   vnp2 = *((ValNodePtr PNTR) ptr2);
11731   if (vnp1 == NULL || vnp2 == NULL) return 0;
11732   lbp1 = (LatBlockPtr) vnp1->data.ptrvalue;
11733   lbp2 = (LatBlockPtr) vnp2->data.ptrvalue;
11734   if (lbp1 == NULL || lbp2 == NULL) return 0;
11735 
11736   if (lbp1->lat < lbp2->lat) {
11737     return -1;
11738   } else if (lbp1->lat > lbp2->lat) {
11739     return 1;
11740   }
11741 
11742   if (lbp1->minlon < lbp2->minlon) {
11743     return -1;
11744   } else if (lbp1->minlon > lbp2->minlon) {
11745     return 1;
11746   }
11747 
11748   if (lbp1->maxlon < lbp2->maxlon) {
11749     return 1;
11750   } else if (lbp1->maxlon > lbp2->maxlon) {
11751     return -1;
11752   }
11753 
11754   cbp1 = lbp1->landmass;
11755   cbp2 = lbp2->landmass;
11756   if (cbp1 == NULL || cbp2 == NULL) return 0;
11757 
11758   if (cbp1->area < cbp2->area) {
11759     return -1;
11760   } else if (cbp1->area > cbp2->area) {
11761     return 1;
11762   }
11763 
11764   cmp = StringICmp (cbp1->name, cbp2->name);
11765   if (cmp > 0) {
11766     return 1;
11767   } else if (cmp < 0) {
11768     return -1;
11769   }
11770 
11771   return 0;
11772 }
11773 
11774 #define EPSILON 0.001
11775 
ConvertLat(FloatHi lat,FloatHi scale)11776 static Int4 ConvertLat (FloatHi lat, FloatHi scale) {
11777 
11778   Int4  val = 0;
11779 
11780   if (lat < -90.0) {
11781     lat = -90.0;
11782   }
11783   if (lat > 90.0) {
11784     lat = 90.0;
11785   }
11786 
11787   if (lat > 0) {
11788     val = (Int4) (lat * scale + EPSILON);
11789   } else {
11790     val = (Int4) (-(-lat * scale + EPSILON));
11791   }
11792 
11793   return val;
11794 }
11795 
ConvertLon(FloatHi lon,FloatHi scale)11796 static Int4 ConvertLon (FloatHi lon, FloatHi scale) {
11797 
11798   Int4  val = 0;
11799 
11800   if (lon < -180.0) {
11801     lon = -180.0;
11802   }
11803   if (lon > 180.0) {
11804     lon = 180.0;
11805   }
11806 
11807   if (lon > 0) {
11808     val = (Int4) (lon * scale + EPSILON);
11809   } else {
11810     val = (Int4) (-(-lon * scale + EPSILON));
11811   }
11812 
11813   return val;
11814 }
11815 
/*
 * Releases a CtrySet and everything it owns: the three strings saved in each
 * CtyBlock, both linked lists with their data, both lookup arrays, and the
 * set itself.  Accepts NULL.  Always returns NULL.
 */
static CtrySetPtr FreeLatLonCountryData (
  CtrySetPtr csp
)

{
  ValNodePtr   node;
  CtyBlockPtr  region;

  if (csp != NULL) {

    /* the strings must go first; ValNodeFreeData below frees the blocks that hold them */
    node = csp->ctyblocks;
    while (node != NULL) {
      region = (CtyBlockPtr) node->data.ptrvalue;
      if (region != NULL) {
        MemFree (region->name);
        MemFree (region->level0);
        MemFree (region->level1);
      }
      node = node->next;
    }

    ValNodeFreeData (csp->ctyblocks);
    ValNodeFreeData (csp->latblocks);

    MemFree (csp->ctyarray);
    MemFree (csp->latarray);

    MemFree (csp);
  }

  return NULL;
}
11844 
11845 /* Original data source is Natural Earth.  Free vector and raster map data @ http://naturalearthdata.com */
11846 
/*
 * Fetches the next input line from either of two sources.
 *
 *   fcp      file cache to read from, or NULL
 *   buf      caller's buffer that receives the line
 *   bufsize  size of buf
 *   local    NULL-terminated array of built-in lines, or NULL
 *   idxP     position in local; advanced by one on every call that uses local
 *
 * When both local and idxP are supplied, the built-in array decides the
 * result even if a file line was also read.  Returns NULL when the chosen
 * source is exhausted.
 */
static CharPtr LatLonCountryReadNextLine (
  FileCache PNTR fcp,
  CharPtr buf,
  size_t bufsize,
  CharPtr PNTR local,
  Int4Ptr idxP
)

{
  Int4     pos;
  CharPtr  result = NULL;

  if (fcp != NULL) {
    result = FileCacheReadLine (fcp, buf, bufsize, NULL);
  }

  if (local == NULL || idxP == NULL) return result;

  /* step the caller's cursor whether or not an entry is present */
  pos = *idxP;
  *idxP = pos + 1;

  if (local [pos] == NULL) return NULL;

  /* copy the built-in text so the caller is free to modify it */
  StringNCpy_0 (buf, local [pos], bufsize);

  return buf;
}
11876 
ReadLatLonCountryData(CharPtr prop,CharPtr file,CharPtr PNTR local)11877 static CtrySetPtr ReadLatLonCountryData (
11878   CharPtr prop,
11879   CharPtr file,
11880   CharPtr PNTR local
11881 )
11882 
11883 {
11884   Char              buf [128];
11885   Char              ch;
11886   CtyBlockPtr       cbp = NULL;
11887   CtrySetPtr        csp = NULL;
11888   CtyBlockPtr PNTR  ctyarray;
11889   ValNodePtr        ctyblocks = NULL;
11890   FileCache         fc;
11891   FileCache PNTR    fcp = NULL;
11892   FILE              *fp = NULL;
11893   Int4              i;
11894   Int4              idx = 0;
11895   ValNodePtr        lastlatblock = NULL;
11896   ValNodePtr        lastctyblock = NULL;
11897   FloatHi           latitude;
11898   LatBlockPtr PNTR  latarray;
11899   ValNodePtr        latblocks = NULL;
11900   LatBlockPtr       lbp;
11901   Char              line [1024];
11902   FloatHi           maxlongitude;
11903   FloatHi           minlongitude;
11904   Char              path [PATH_MAX];
11905   CharPtr           ptr;
11906   CharPtr           recentCountry = NULL;
11907   FloatHi           scale = 0.0;
11908   Boolean           scale_not_set = TRUE;
11909   ErrSev            sev;
11910   CharPtr           str;
11911   Char              tmp [128];
11912   double            val;
11913   ValNodePtr        vnp;
11914   CharPtr           wrk;
11915 
11916   if (FindPath ("ncbi", "ncbi", "data", path, sizeof (path))) {
11917     FileBuildPath (path, NULL, file);
11918     sev = ErrSetMessageLevel (SEV_ERROR);
11919     fp = FileOpen (path, "r");
11920     ErrSetMessageLevel (sev);
11921   }
11922 
11923   if (fp != NULL) {
11924     FileCacheSetup (&fc, fp);
11925     fcp = &fc;
11926     local = NULL;
11927   } else if (local == NULL) {
11928     return NULL;
11929   }
11930 
11931   for (str = LatLonCountryReadNextLine (fcp, line, sizeof (line), local, &idx);
11932        str != NULL;
11933        str = LatLonCountryReadNextLine (fcp, line, sizeof (line), local, &idx)) {
11934     if (StringHasNoText (str)) continue;
11935 
11936     /* if reading from local copy, str cannot be modified, so copy to local buf and reset pointer */
11937 
11938     StringNCpy_0 (buf, str, sizeof (buf));
11939     str = buf;
11940 
11941     ch = str [0];
11942 
11943     /* ignore comment lines starting with hyphen */
11944 
11945     if (ch == '-') continue;
11946 
11947     /* Scale should be at top of file, after comments */
11948 
11949     if (IS_DIGIT (ch)) {
11950       if (scale_not_set && sscanf (str, "%lf", &val) == 1) {
11951         scale = (FloatHi) val;
11952         scale_not_set = FALSE;
11953       }
11954 
11955       continue;
11956     }
11957 
11958     /* Country starts on first column */
11959 
11960     if (IS_ALPHA (ch)) {
11961 
11962       if (scale_not_set) {
11963         scale = 20.0;
11964         scale_not_set = FALSE;
11965       }
11966 
11967       ptr = StringChr (str, '\t');
11968       if (ptr != NULL) {
11969         *ptr = '\0';
11970       }
11971 
11972       if (StringCmp (str, recentCountry) == 0) continue;
11973 
11974       cbp = (CtyBlockPtr) MemNew (sizeof (CtyBlock));
11975       if (cbp == NULL) continue;
11976 
11977       TrimSpacesAroundString (str);
11978       cbp->name = StringSave (str);
11979       StringNCpy_0 (tmp, str, sizeof (tmp));
11980       ptr = StringChr (tmp, ':');
11981       if (ptr != NULL) {
11982         *ptr = '\0';
11983         ptr++;
11984         TrimSpacesAroundString (ptr);
11985         if (StringDoesHaveText (ptr)) {
11986           cbp->level1 = StringSave (ptr);
11987         }
11988         TrimSpacesAroundString (tmp);
11989         cbp->level0 = StringSave (tmp);
11990       } else {
11991         TrimSpacesAroundString (str);
11992         cbp->level0 = StringSave (str);
11993       }
11994       cbp->area = 0;
11995       cbp->minlat = INT4_MAX;
11996       cbp->maxlat = INT4_MIN;
11997       cbp->minlon = INT4_MAX;
11998       cbp->maxlon = INT4_MIN;
11999       vnp = ValNodeAddPointer (&lastctyblock, 0, (Pointer) cbp);
12000       if (ctyblocks == NULL) {
12001         ctyblocks = vnp;
12002       }
12003       lastctyblock = vnp;
12004 
12005       recentCountry = cbp->name;
12006 
12007       continue;
12008     }
12009 
12010     /* Latitude with longitude min/max pairs on line starting with tab */
12011 
12012     if (ch != '\t') continue;
12013 
12014     wrk = StringSave (str + 1);
12015     if (wrk == NULL) continue;
12016 
12017     ptr = StringChr (wrk, '\t');
12018     if (ptr != NULL) {
12019       *ptr = '\0';
12020       ptr++;
12021       if (sscanf (wrk, "%lf", &val) == 1) {
12022         latitude = (FloatHi) val;
12023 
12024         str = ptr;
12025         while (StringDoesHaveText (str)) {
12026           ptr = StringChr (str, '\t');
12027           if (ptr != NULL) {
12028             *ptr = '\0';
12029             ptr++;
12030           }
12031           if (sscanf (str, "%lf", &val) != 1) {
12032             /* prevent infinite loop if it fails */
12033             str = NULL;
12034           } else {
12035             minlongitude = (FloatHi) val;
12036             str = ptr;
12037             if (StringDoesHaveText (str)) {
12038               ptr = StringChr (str, '\t');
12039               if (ptr != NULL) {
12040                 *ptr = '\0';
12041                 ptr++;
12042               }
12043               if (sscanf (str, "%lf", &val) == 1) {
12044                 maxlongitude = (FloatHi) val;
12045 
12046                 lbp = (LatBlockPtr) MemNew (sizeof (LatBlock));
12047                 if (lbp != NULL) {
12048                   lbp->landmass = cbp;
12049                   lbp->lat = ConvertLat (latitude, scale);
12050                   lbp->minlon = ConvertLon (minlongitude, scale);
12051                   lbp->maxlon = ConvertLon (maxlongitude, scale);
12052 
12053                   vnp = ValNodeAddPointer (&lastlatblock, 0, (Pointer) lbp);
12054                   if (latblocks == NULL) {
12055                     latblocks = vnp;
12056                   }
12057                   lastlatblock = vnp;
12058                 }
12059               }
12060             }
12061             str = ptr;
12062           }
12063         }
12064       }
12065     }
12066 
12067     MemFree (wrk);
12068   }
12069 
12070   if (fp != NULL) {
12071     FileClose (fp);
12072   }
12073 
12074   if (ctyblocks == NULL || latblocks == NULL) {
12075     return NULL;
12076   }
12077 
12078   csp = (CtrySetPtr) MemNew (sizeof (CtrySet));
12079   if (csp == NULL) return NULL;
12080 
12081   for (vnp = latblocks; vnp != NULL; vnp = vnp->next) {
12082     lbp = (LatBlockPtr) vnp->data.ptrvalue;
12083     if (lbp == NULL) continue;
12084     cbp = lbp->landmass;
12085     if (cbp == NULL) continue;
12086     cbp->area += lbp->maxlon - lbp->minlon + 1;
12087     if (cbp->minlat > lbp->lat) {
12088       cbp->minlat = lbp->lat;
12089     }
12090     if (cbp->maxlat < lbp->lat) {
12091       cbp->maxlat = lbp->lat;
12092     }
12093     if (cbp->minlon > lbp->minlon) {
12094       cbp->minlon = lbp->minlon;
12095     }
12096     if (cbp->maxlon < lbp->maxlon) {
12097       cbp->maxlon = lbp->maxlon;
12098     }
12099   }
12100 
12101   ctyblocks = ValNodeSort (ctyblocks, SortByCountry);
12102   csp->ctyblocks = ctyblocks;
12103   csp->numCtyBlocks = ValNodeLen (ctyblocks);
12104 
12105   latblocks = ValNodeSort (latblocks, SortByLatLon);
12106   csp->latblocks = latblocks;
12107   csp->numLatBlocks = ValNodeLen (latblocks);
12108 
12109   if (scale_not_set) {
12110     scale = 20.0;
12111   }
12112   csp->scale = scale;
12113 
12114   ctyarray = (CtyBlockPtr PNTR) MemNew (sizeof (CtyBlockPtr) * (csp->numCtyBlocks + 1));
12115   if (ctyarray != NULL) {
12116     for (vnp = ctyblocks, i = 0; vnp != NULL; vnp = vnp->next, i++) {
12117       cbp = (CtyBlockPtr) vnp->data.ptrvalue;
12118       ctyarray [i] = cbp;
12119     }
12120 
12121     csp->ctyarray = ctyarray;
12122   }
12123 
12124   latarray = (LatBlockPtr PNTR) MemNew (sizeof (LatBlockPtr) * (csp->numLatBlocks + 1));
12125   if (latarray != NULL) {
12126     for (vnp = latblocks, i = 0; vnp != NULL; vnp = vnp->next, i++) {
12127       lbp = (LatBlockPtr) vnp->data.ptrvalue;
12128       latarray [i] = lbp;
12129     }
12130 
12131     csp->latarray = latarray;
12132   }
12133 
12134 /*
12135 {
12136   FILE *fp;
12137   fp = FileOpen ("ctrymap.txt", "w");
12138   if (fp != NULL) {
12139     for (vnp = latblocks; vnp != NULL; vnp = vnp->next) {
12140       lbp = (LatBlockPtr) vnp->data.ptrvalue;
12141       if (lbp == NULL) continue;
12142       cbp = lbp->landmass;
12143       if (cbp == NULL) continue;
12144       fprintf (fp, "%s\t[%d]\t%d\t%d\t%d\n", cbp->name, (int) cbp->area,
12145                (int) lbp->lat, (int) lbp->minlon, (int) lbp->maxlon);
12146     }
12147     FileClose (fp);
12148   }
12149 }
12150 */
12151 
12152   return csp;
12153 }
12154 
12155 static Boolean ctryset_not_found = FALSE;
12156 static Boolean watrset_not_found = FALSE;
12157 
12158 extern CharPtr latlon_onedegree [];
12159 extern CharPtr water_onedegree [];
12160 
GetLatLonCountryData(void)12161 static CtrySetPtr GetLatLonCountryData (void)
12162 
12163 {
12164   CtrySetPtr  csp = NULL;
12165   CharPtr     prop = "CountryLatLonData";
12166 
12167   csp = (CtrySetPtr) GetAppProperty (prop);
12168   if (csp != NULL) return csp;
12169 
12170   if (ctryset_not_found) return NULL;
12171 
12172   csp = ReadLatLonCountryData (prop, "lat_lon_country.txt", latlon_onedegree);
12173 
12174   if (csp == NULL) {
12175     ctryset_not_found = TRUE;
12176     return NULL;
12177   }
12178 
12179   SetAppProperty (prop, (Pointer) csp);
12180 
12181   return csp;
12182 }
12183 
GetLatLonWaterData(void)12184 static CtrySetPtr GetLatLonWaterData (void)
12185 
12186 {
12187   CtrySetPtr  csp = NULL;
12188   CharPtr     prop = "WaterLatLonData";
12189 
12190   csp = (CtrySetPtr) GetAppProperty (prop);
12191   if (csp != NULL) return csp;
12192 
12193   if (watrset_not_found) return NULL;
12194 
12195   csp = ReadLatLonCountryData (prop, "lat_lon_water.txt", water_onedegree);
12196 
12197   if (csp == NULL) {
12198     watrset_not_found = TRUE;
12199     return NULL;
12200   }
12201 
12202   SetAppProperty (prop, (Pointer) csp);
12203 
12204   return csp;
12205 }
12206 
GetEntryInLatLonListIndex(CharPtr country,CtrySetPtr csp)12207 static CtyBlockPtr GetEntryInLatLonListIndex (
12208   CharPtr country,
12209   CtrySetPtr csp
12210 )
12211 
12212 {
12213   CtyBlockPtr PNTR  array;
12214   CtyBlockPtr       cbp;
12215   Int2              L, R, mid;
12216 
12217   if (StringHasNoText (country)) return NULL;
12218   if (csp == NULL) return NULL;
12219 
12220   array = csp->ctyarray;
12221   if (array == NULL) return NULL;
12222 
12223   L = 0;
12224   R = csp->numCtyBlocks - 1;
12225 
12226   while (L < R) {
12227     mid = (L + R) / 2;
12228     cbp = array [mid];
12229     if (cbp != NULL && cbp->name != NULL && StringICmp (cbp->name, country) < 0) {
12230       L = mid + 1;
12231     } else {
12232       R = mid;
12233     }
12234   }
12235 
12236   cbp = array [R];
12237   if (cbp != NULL && cbp->name != NULL && StringICmp (cbp->name, country) == 0) return cbp;
12238 
12239   return NULL;
12240 }
12241 
CountryIsInLatLonList(CharPtr country)12242 NLM_EXTERN Boolean CountryIsInLatLonList (
12243   CharPtr country
12244 )
12245 
12246 {
12247   CtyBlockPtr  cbp;
12248   CtrySetPtr   csp;
12249 
12250   if (StringHasNoText (country)) return FALSE;
12251   csp = GetLatLonCountryData ();
12252   if (csp == NULL) return FALSE;
12253 
12254   cbp = GetEntryInLatLonListIndex (country, csp);
12255   if (cbp != NULL && cbp->name != NULL && StringICmp (cbp->name, country) == 0) return TRUE;
12256 
12257   return FALSE;
12258 }
12259 
12260 //LCOV_EXCL_START
IsCountryInLatLonList(CharPtr country)12261 NLM_EXTERN Boolean IsCountryInLatLonList (
12262   CharPtr country
12263 )
12264 
12265 {
12266   return CountryIsInLatLonList (country);
12267 }
12268 //LCOV_EXCL_STOP
12269 
WaterIsInLatLonList(CharPtr country)12270 NLM_EXTERN Boolean WaterIsInLatLonList (
12271   CharPtr country
12272 )
12273 
12274 {
12275   CtyBlockPtr  cbp;
12276   CtrySetPtr   csp;
12277 
12278   if (StringHasNoText (country)) return FALSE;
12279   csp = GetLatLonWaterData ();
12280   if (csp == NULL) return FALSE;
12281 
12282   cbp = GetEntryInLatLonListIndex (country, csp);
12283   if (cbp != NULL && cbp->name != NULL && StringICmp (cbp->name, country) == 0) return TRUE;
12284 
12285   return FALSE;
12286 }
12287 
LatLonCmp(LatBlockPtr lbp,Int2 latitude)12288 static int LatLonCmp (
12289   LatBlockPtr lbp,
12290   Int2 latitude
12291 )
12292 
12293 {
12294   if (lbp == NULL) return 0;
12295 
12296   if (lbp->lat < latitude) {
12297     return -1;
12298   } else if (lbp->lat > latitude) {
12299     return 1;
12300   }
12301 
12302   return 0;
12303 }
12304 
GetLatLonIndex(CtrySetPtr csp,LatBlockPtr PNTR array,Int2 latitude)12305 static Int4 GetLatLonIndex (
12306   CtrySetPtr csp,
12307   LatBlockPtr PNTR array,
12308   Int2 latitude
12309 )
12310 
12311 {
12312   LatBlockPtr  lbp;
12313   Int4         L, R, mid;
12314 
12315   if (csp == NULL || array == NULL) return 0;
12316 
12317   L = 0;
12318   R = csp->numLatBlocks - 1;
12319 
12320   while (L < R) {
12321     mid = (L + R) / 2;
12322     lbp = array [mid];
12323     if (lbp != NULL && LatLonCmp (lbp, latitude) < 0) {
12324       L = mid + 1;
12325     } else {
12326       R = mid;
12327     }
12328   }
12329 
12330   return R;
12331 }
12332 
SubregionStringICmp(CharPtr region,CharPtr country)12333 static Boolean SubregionStringICmp (
12334   CharPtr region,
12335   CharPtr country
12336 )
12337 
12338 {
12339   Char     possible [256];
12340   CharPtr  ptr;
12341 
12342   if (StringHasNoText (region) || StringHasNoText (country)) return FALSE;
12343   StringNCpy_0 (possible, region, sizeof (possible));
12344   ptr = StringChr (possible, ':');
12345   if (ptr == NULL) return FALSE;
12346   *ptr = '\0';
12347   if (StringICmp (possible, country) == 0) return TRUE;
12348   return FALSE;
12349 }
12350 
RegionContainsLatLon(CharPtr country,FloatHi lat,FloatHi lon,CtrySetPtr csp)12351 static Boolean RegionContainsLatLon (
12352   CharPtr country,
12353   FloatHi lat,
12354   FloatHi lon,
12355   CtrySetPtr csp
12356 )
12357 
12358 {
12359   LatBlockPtr PNTR  array;
12360   CtyBlockPtr       cbp;
12361   Int4              latitude;
12362   Int4              longitude;
12363   LatBlockPtr       lbp;
12364   Int4              R;
12365 
12366   if (StringHasNoText (country)) return FALSE;
12367   if (csp == NULL) return FALSE;
12368 
12369   array = csp->latarray;
12370   if (array == NULL) return FALSE;
12371 
12372   latitude = ConvertLat (lat, csp->scale);
12373   longitude = ConvertLon (lon, csp->scale);
12374 
12375   for (R = GetLatLonIndex (csp, array, latitude); R < csp->numLatBlocks; R++) {
12376     lbp = array [R];
12377     if (lbp == NULL) break;
12378     if (latitude != lbp->lat) break;
12379 
12380     if (longitude < lbp->minlon) continue;
12381     if (longitude > lbp->maxlon) continue;
12382 
12383     cbp = lbp->landmass;
12384     if (cbp == NULL) continue;
12385     if (StringICmp (cbp->name, country) == 0) return TRUE;
12386     if (SubregionStringICmp (cbp->name, country)) return TRUE;
12387   }
12388 
12389   return FALSE;
12390 }
12391 
CountryContainsLatLon(CharPtr country,FloatHi lat,FloatHi lon)12392 NLM_EXTERN Boolean CountryContainsLatLon (
12393   CharPtr country,
12394   FloatHi lat,
12395   FloatHi lon
12396 )
12397 
12398 {
12399   CtrySetPtr  csp;
12400 
12401   if (StringHasNoText (country)) return FALSE;
12402 
12403   csp = GetLatLonCountryData ();
12404   if (csp == NULL) return FALSE;
12405 
12406   return RegionContainsLatLon (country, lat, lon, csp);
12407 }
12408 
12409 //LCOV_EXCL_START
TestLatLonForCountry(CharPtr country,FloatHi lat,FloatHi lon)12410 NLM_EXTERN Boolean TestLatLonForCountry (
12411   CharPtr country,
12412   FloatHi lat,
12413   FloatHi lon
12414 )
12415 
12416 {
12417   return CountryContainsLatLon (country, lat, lon);
12418 }
12419 //LCOV_EXCL_STOP
12420 
WaterContainsLatLon(CharPtr country,FloatHi lat,FloatHi lon)12421 NLM_EXTERN Boolean WaterContainsLatLon (
12422   CharPtr country,
12423   FloatHi lat,
12424   FloatHi lon
12425 )
12426 
12427 {
12428   CtrySetPtr  csp;
12429 
12430   if (StringHasNoText (country)) return FALSE;
12431 
12432   csp = GetLatLonWaterData ();
12433   if (csp == NULL) return FALSE;
12434 
12435   return RegionContainsLatLon (country, lat, lon, csp);
12436 }
12437 
/*
 * NewLatLonCandidateIsBetter
 *
 * Decides whether candidate block cbp should replace the current choice
 * best, given an optional preferred country and province.
 *
 *   country           preferred level0 name, or NULL for no preference
 *   province          preferred level1 name, or NULL for no preference
 *   best              current choice (NULL means nothing chosen yet)
 *   cbp               new candidate (NULL is never better)
 *   newer_is_smaller  caller-supplied tie-break: TRUE if the candidate wins
 *                     when the name preferences do not decide
 *
 * Ranking, highest priority first:
 *   1. a block matching the preferred country beats one that does not
 *   2. within the preferred country, a block matching the preferred
 *      province beats one that does not
 *   3. otherwise newer_is_smaller decides
 *
 * Rule 2 is applied in both directions.  Previously a best that matched
 * the preferred province could be displaced by a non-matching candidate
 * whenever newer_is_smaller was TRUE, which made the outcome depend on the
 * order in which candidates were visited.
 */
static Boolean NewLatLonCandidateIsBetter (
  CharPtr country,
  CharPtr province,
  CtyBlockPtr best,
  CtyBlockPtr cbp,
  Boolean newer_is_smaller
)

{
  Boolean  bestHasProvince;
  Boolean  newHasProvince;

  if (cbp == NULL) return FALSE;
  if (best == NULL) return TRUE;

  /* if no preferred country, just use the caller's tie-break */
  if (country == NULL) {
    return newer_is_smaller;
  }

  /* if match to preferred country */
  if (StringICmp (country, cbp->level0) == 0) {

    /* if best was not preferred country, take new match */
    if (StringICmp (country, best->level0) != 0) return TRUE;

    if (province != NULL) {
      newHasProvince = (Boolean) (StringICmp (province, cbp->level1) == 0);
      bestHasProvince = (Boolean) (StringICmp (province, best->level1) == 0);

      /* exactly one of the two matches the preferred province: it wins */
      if (newHasProvince && ! bestHasProvince) return TRUE;
      if (bestHasProvince && ! newHasProvince) return FALSE;
    }

    /* if both match province, or neither does, or no preferred province, use tie-break */
    return newer_is_smaller;
  }

  /* if best matches preferred country, keep */
  if (StringICmp (country, best->level0) == 0) return FALSE;

  /* neither matches the preferred country: use tie-break */
  return newer_is_smaller;
}
12478 
LookupRegionByLatLon(FloatHi lat,FloatHi lon,CharPtr country,CharPtr province,CtrySetPtr csp)12479 static CtyBlockPtr LookupRegionByLatLon (
12480   FloatHi lat,
12481   FloatHi lon,
12482   CharPtr country,
12483   CharPtr province,
12484   CtrySetPtr csp
12485 )
12486 
12487 {
12488   LatBlockPtr PNTR  array;
12489   CtyBlockPtr       cbp, best = NULL;
12490   Int4              latitude;
12491   Int4              longitude;
12492   LatBlockPtr       lbp;
12493   Int4              R;
12494 
12495   if (csp == NULL) return NULL;
12496 
12497   array = csp->latarray;
12498   if (array == NULL) return NULL;
12499 
12500   latitude = ConvertLat (lat, csp->scale);
12501   longitude = ConvertLon (lon, csp->scale);
12502 
12503   for (R = GetLatLonIndex (csp, array, latitude); R < csp->numLatBlocks; R++) {
12504     lbp = array [R];
12505     if (lbp == NULL) break;
12506     if (latitude != lbp->lat) break;
12507 
12508     if (longitude < lbp->minlon) continue;
12509     if (longitude > lbp->maxlon) continue;
12510 
12511     cbp = lbp->landmass;
12512     if (cbp == NULL) continue;
12513 
12514     if (best == NULL || NewLatLonCandidateIsBetter (country, province, best, cbp, (Boolean) (cbp->area < best->area))) {
12515       best = cbp;
12516     }
12517   }
12518 
12519   return best;
12520 }
12521 
GuessCountryByLatLon(FloatHi lat,FloatHi lon,CharPtr country,CharPtr province)12522 static CtyBlockPtr GuessCountryByLatLon (
12523   FloatHi lat,
12524   FloatHi lon,
12525   CharPtr country,
12526   CharPtr province
12527 )
12528 
12529 {
12530   CtrySetPtr  csp;
12531 
12532   csp = GetLatLonCountryData ();
12533   if (csp == NULL) return NULL;
12534 
12535   return LookupRegionByLatLon (lat, lon, country, province, csp);
12536 }
12537 
GuessWaterByLatLon(FloatHi lat,FloatHi lon,CharPtr country)12538 static CtyBlockPtr GuessWaterByLatLon (
12539   FloatHi lat,
12540   FloatHi lon,
12541   CharPtr country
12542 )
12543 
12544 {
12545   CtrySetPtr  csp;
12546 
12547   csp = GetLatLonWaterData ();
12548   if (csp == NULL) return NULL;
12549 
12550   return LookupRegionByLatLon (lat, lon, country, NULL, csp);
12551 }
12552 
12553 //LCOV_EXCL_START
LookupCountryByLatLon(FloatHi lat,FloatHi lon)12554 NLM_EXTERN CharPtr LookupCountryByLatLon(
12555   FloatHi lat,
12556   FloatHi lon
12557 )
12558 
12559 {
12560   CtyBlockPtr  cbp;
12561 
12562   cbp = GuessCountryByLatLon (lat, lon, NULL, NULL);
12563   if (cbp == NULL) return NULL;
12564 
12565   return cbp->name;
12566 }
12567 
GuessCountryForLatLon(FloatHi lat,FloatHi lon)12568 NLM_EXTERN CharPtr GuessCountryForLatLon(
12569   FloatHi lat,
12570   FloatHi lon
12571 )
12572 
12573 {
12574   return LookupCountryByLatLon (lat, lon);
12575 }
12576 
LookupWaterByLatLon(FloatHi lat,FloatHi lon)12577 NLM_EXTERN CharPtr LookupWaterByLatLon(
12578   FloatHi lat,
12579   FloatHi lon
12580 )
12581 
12582 {
12583   CtyBlockPtr  cbp;
12584 
12585   cbp = GuessWaterByLatLon (lat, lon, NULL);
12586   if (cbp == NULL) return NULL;
12587 
12588   return cbp->name;
12589 }
12590 
CountryDataScaleIs(void)12591 NLM_EXTERN FloatHi CountryDataScaleIs(void)
12592 
12593 {
12594   CtrySetPtr  csp;
12595 
12596   csp = GetLatLonCountryData ();
12597   if (csp == NULL) return 0.0;
12598 
12599   return csp->scale;
12600 }
12601 
WaterDataScaleIs(void)12602 NLM_EXTERN FloatHi WaterDataScaleIs(void)
12603 
12604 {
12605   CtrySetPtr  csp;
12606 
12607   csp = GetLatLonWaterData ();
12608   if (csp == NULL) return 0.0;
12609 
12610   return csp->scale;
12611 }
12612 
12613 
RegionExtremesOverlap(CharPtr first,CharPtr second,CtrySetPtr csp)12614 static Boolean RegionExtremesOverlap(
12615   CharPtr first,
12616   CharPtr second,
12617   CtrySetPtr csp
12618 )
12619 
12620 {
12621   CtyBlockPtr  cbp1, cbp2;
12622 
12623   if (StringHasNoText (first) || StringHasNoText (second)) return FALSE;
12624   if (csp == NULL) return FALSE;
12625 
12626   cbp1 = GetEntryInLatLonListIndex (first, csp);
12627   if (cbp1 == NULL || cbp1->name == NULL || StringICmp (cbp1->name, first) != 0) return FALSE;
12628 
12629   cbp2 = GetEntryInLatLonListIndex (second, csp);
12630   if (cbp2 == NULL || cbp2->name == NULL || StringICmp (cbp2->name, second) != 0) return FALSE;
12631 
12632   if (cbp1->minlat > cbp2->maxlat) return FALSE;
12633   if (cbp2->minlat > cbp1->maxlat) return FALSE;
12634   if (cbp1->minlon > cbp2->maxlon) return FALSE;
12635   if (cbp2->minlon > cbp1->maxlon) return FALSE;
12636 
12637   return TRUE;
12638 }
12639 
CountryExtremesOverlap(CharPtr first,CharPtr second)12640 NLM_EXTERN Boolean CountryExtremesOverlap (
12641   CharPtr first,
12642   CharPtr second
12643 )
12644 
12645 {
12646   CtrySetPtr  csp;
12647 
12648   if (StringHasNoText (first) || StringHasNoText (second)) return FALSE;
12649   csp = GetLatLonCountryData ();
12650   if (csp == NULL) return FALSE;
12651 
12652   return RegionExtremesOverlap (first, second, csp);
12653 }
12654 
CountryBoxesOverlap(CharPtr country1,CharPtr country2)12655 NLM_EXTERN Boolean CountryBoxesOverlap (
12656   CharPtr country1,
12657   CharPtr country2
12658 )
12659 
12660 {
12661   return CountryExtremesOverlap (country1, country2);
12662 }
12663 
WaterExtremesOverlap(CharPtr first,CharPtr second)12664 NLM_EXTERN Boolean WaterExtremesOverlap (
12665   CharPtr first,
12666   CharPtr second
12667 )
12668 
12669 {
12670   CtrySetPtr  csp;
12671 
12672   if (StringHasNoText (first) || StringHasNoText (second)) return FALSE;
12673   csp = GetLatLonWaterData ();
12674   if (csp == NULL) return FALSE;
12675 
12676   return RegionExtremesOverlap (first, second, csp);
12677 }
12678 //LCOV_EXCL_STOP
12679 
12680 
/*
Distance on a spherical surface calculation adapted from
http://www.linuxjournal.com/magazine/
work-shell-calculating-distance-between-two-latitudelongitude-points

The two constants below are used by the distance helpers that follow:
EARTH_RADIUS scales the angular result into a distance, and CONST_PI is
used for the degree-to-radian conversion and, together with EARTH_RADIUS,
as the initial "farther than anything" bound in the nearest-region
searches.
*/

#define EARTH_RADIUS 6371.0 /* average radius of non-spherical earth in kilometers */
/* pi written out to 11 decimal places */
#define CONST_PI 3.14159265359
12689 
/*
 * DegreesToRadians
 *
 * Converts an angle in degrees to radians by multiplying with
 * CONST_PI / 180.
 */
static double DegreesToRadians (FloatHi degrees)

{
  const double  radiansPerDegree = CONST_PI / 180.0;

  return degrees * radiansPerDegree;
}
12697 
/*
 * DistanceOnGlobe
 *
 * Great-circle distance between (latA, lonA) and (latB, lonB), computed
 * with the haversine formula on a sphere of radius EARTH_RADIUS.
 *
 *   latA, lonA, latB, lonB   coordinates in degrees
 *
 * Returns the distance in the unit of EARTH_RADIUS (kilometers).
 *
 * The haversine term is clamped to [0, 1] before the square roots are
 * taken.  Mathematically it never leaves that interval for valid
 * latitudes, but rounding can push it just above 1 for near-antipodal
 * points, and a latitude beyond +/-90 degrees can make it negative;
 * either case would otherwise feed a negative value to sqrt and turn the
 * result into NaN.
 */
static FloatHi DistanceOnGlobe (
  FloatHi latA,
  FloatHi lonA,
  FloatHi latB,
  FloatHi lonB
)

{
  double  lat1, lon1, lat2, lon2;
  double  sinHalfDLat, sinHalfDLon;
  double  a, c;

  lat1 = DegreesToRadians (latA);
  lon1 = DegreesToRadians (lonA);
  lat2 = DegreesToRadians (latB);
  lon2 = DegreesToRadians (lonB);

  sinHalfDLat = sin ((lat2 - lat1) / 2);
  sinHalfDLon = sin ((lon2 - lon1) / 2);

  a = sinHalfDLat * sinHalfDLat +
      cos (lat1) * cos (lat2) * sinHalfDLon * sinHalfDLon;

  /* keep both sqrt arguments non-negative */
  if (a < 0.0) {
    a = 0.0;
  } else if (a > 1.0) {
    a = 1.0;
  }

  c = 2 * atan2 (sqrt (a), sqrt (1 - a));

  return (FloatHi) (EARTH_RADIUS * c);
}
12723 
/*
 * ErrorDistance
 *
 * Distance between the point (latA, lonA) and the point that lies
 * 1/scale degrees further in both latitude and longitude, i.e. the
 * diagonal of one grid step at the given scale.  The result is in the
 * unit returned by DistanceOnGlobe.
 *
 * scale is used as a divisor and is not checked here.
 */
static FloatHi ErrorDistance (FloatHi latA, FloatHi lonA, FloatHi scale)

{
  FloatHi  shiftedLat;
  FloatHi  shiftedLon;

  shiftedLat = latA + (1.0 / scale);
  shiftedLon = lonA + (1.0 / scale);

  return DistanceOnGlobe (latA, lonA, shiftedLat, shiftedLon);
}
12747 
12748 
RegionClosestToLatLon(FloatHi lat,FloatHi lon,FloatHi range,FloatHi PNTR distanceP,CtrySetPtr csp)12749 static CtyBlockPtr RegionClosestToLatLon (
12750   FloatHi lat,
12751   FloatHi lon,
12752   FloatHi range,
12753   FloatHi PNTR distanceP,
12754   CtrySetPtr csp
12755 )
12756 
12757 {
12758   LatBlockPtr PNTR  array;
12759   CtyBlockPtr       cbp, best = NULL;
12760   FloatHi           closest = EARTH_RADIUS * CONST_PI * 2;
12761   FloatHi           delta;
12762   Int4              latitude;
12763   Int4              longitude;
12764   Int4              maxDelta;
12765   LatBlockPtr       lbp;
12766   Int4              R;
12767   Int4              x;
12768   Int4              y;
12769   Boolean           is_geographically_better;
12770 
12771   if (distanceP != NULL) {
12772     *distanceP = 0.0;
12773   }
12774 
12775   if (csp == NULL) return NULL;
12776 
12777   array = csp->latarray;
12778   if (array == NULL) return NULL;
12779 
12780   latitude = ConvertLat (lat, csp->scale);
12781   longitude = ConvertLon (lon, csp->scale);
12782 
12783   maxDelta = (Int4) (range * csp->scale + EPSILON);
12784 
12785   for (R = GetLatLonIndex (csp, array, latitude - maxDelta); R < csp->numLatBlocks; R++) {
12786     lbp = array [R];
12787     if (lbp == NULL) break;
12788     if (latitude + maxDelta < lbp->lat) break;
12789 
12790     if (longitude < lbp->minlon - maxDelta) continue;
12791     if (longitude > lbp->maxlon + maxDelta) continue;
12792 
12793     cbp = lbp->landmass;
12794     if (cbp == NULL) continue;
12795 
12796     if (longitude < lbp->minlon) {
12797       x = lbp->minlon;
12798     } else if (longitude > lbp->maxlon) {
12799       x = lbp->maxlon;
12800     } else {
12801       x = longitude;
12802     }
12803 
12804     y = lbp->lat;
12805 
12806     delta = DistanceOnGlobe (lat, lon, (FloatHi) (y / csp->scale), (FloatHi) (x / csp->scale));
12807 
12808     is_geographically_better = FALSE;
12809     if (delta < closest) {
12810       is_geographically_better = TRUE;
12811     } else if (delta - closest < 0.000001) {
12812       if (best == NULL || cbp->area < best->area) {
12813         is_geographically_better = TRUE;
12814       }
12815     }
12816 
12817     if (best == NULL || NewLatLonCandidateIsBetter (NULL, NULL, best, cbp, is_geographically_better)) {
12818       best = cbp;
12819       closest = delta;
12820     }
12821   }
12822 
12823   if (best != NULL) {
12824     if (distanceP != NULL) {
12825       *distanceP = closest;
12826     }
12827   }
12828 
12829   return best;
12830 }
12831 
NearestCountryByLatLon(FloatHi lat,FloatHi lon,FloatHi range,FloatHi PNTR distanceP)12832 static CtyBlockPtr NearestCountryByLatLon (
12833   FloatHi lat,
12834   FloatHi lon,
12835   FloatHi range,
12836   FloatHi PNTR distanceP
12837 )
12838 
12839 {
12840   CtrySetPtr  csp;
12841 
12842   csp = GetLatLonCountryData ();
12843   if (csp == NULL) return NULL;
12844 
12845   return RegionClosestToLatLon (lat, lon, range, distanceP, csp);
12846 }
12847 
12848 //LCOV_EXCL_START
12849 //map used during regression is too good, no areas of data insufficiency
NearestWaterByLatLon(FloatHi lat,FloatHi lon,FloatHi range,FloatHi PNTR distanceP)12850 static CtyBlockPtr NearestWaterByLatLon (
12851   FloatHi lat,
12852   FloatHi lon,
12853   FloatHi range,
12854   FloatHi PNTR distanceP
12855 )
12856 
12857 {
12858   CtrySetPtr  csp;
12859 
12860   csp = GetLatLonWaterData ();
12861   if (csp == NULL) return NULL;
12862 
12863   return RegionClosestToLatLon (lat, lon, range, distanceP, csp);
12864 }
12865 
12866 
CountryClosestToLatLon(FloatHi lat,FloatHi lon,FloatHi range,FloatHi PNTR distanceP)12867 NLM_EXTERN CharPtr CountryClosestToLatLon (
12868   FloatHi lat,
12869   FloatHi lon,
12870   FloatHi range,
12871   FloatHi PNTR distanceP
12872 )
12873 
12874 {
12875   CtyBlockPtr  cbp;
12876 
12877   cbp = NearestCountryByLatLon (lat, lon, range, distanceP);
12878   if (cbp == NULL) return NULL;
12879 
12880   return cbp->name;
12881 }
12882 
WaterClosestToLatLon(FloatHi lat,FloatHi lon,FloatHi range,FloatHi PNTR distanceP)12883 NLM_EXTERN CharPtr WaterClosestToLatLon (
12884   FloatHi lat,
12885   FloatHi lon,
12886   FloatHi range,
12887   FloatHi PNTR distanceP
12888 )
12889 
12890 {
12891   CtyBlockPtr  cbp;
12892 
12893   cbp = NearestWaterByLatLon (lat, lon, range, distanceP);
12894   if (cbp == NULL) return NULL;
12895 
12896   return cbp->name;
12897 }
12898 //LCOV_EXCL_STOP
12899 
12900 
RegionIsNearLatLon(CharPtr country,CharPtr province,FloatHi lat,FloatHi lon,FloatHi range,FloatHi PNTR distanceP,CtrySetPtr csp)12901 static CtyBlockPtr RegionIsNearLatLon (
12902   CharPtr country,
12903   CharPtr province,
12904   FloatHi lat,
12905   FloatHi lon,
12906   FloatHi range,
12907   FloatHi PNTR distanceP,
12908   CtrySetPtr csp
12909 )
12910 
12911 {
12912   LatBlockPtr PNTR  array;
12913   CtyBlockPtr       cbp, best = NULL;
12914   FloatHi           closest = EARTH_RADIUS * CONST_PI * 2;
12915   FloatHi           delta;
12916   Int4              latitude;
12917   Int4              longitude;
12918   Int4              maxDelta;
12919   LatBlockPtr       lbp;
12920   Int4              R;
12921   Int4              x;
12922   Int4              y;
12923 
12924   if (distanceP != NULL) {
12925     *distanceP = 0.0;
12926   }
12927 
12928   if (StringHasNoText (country)) return NULL;
12929   if (csp == NULL) return NULL;
12930 
12931   array = csp->latarray;
12932   if (array == NULL) return NULL;
12933 
12934   latitude = ConvertLat (lat, csp->scale);
12935   longitude = ConvertLon (lon, csp->scale);
12936 
12937   maxDelta = (Int4) (range * csp->scale + EPSILON);
12938 
12939   for (R = GetLatLonIndex (csp, array, latitude - maxDelta); R < csp->numLatBlocks; R++) {
12940     lbp = array [R];
12941     if (lbp == NULL) break;
12942     if (latitude + maxDelta < lbp->lat) break;
12943 
12944     if (longitude < lbp->minlon - maxDelta) continue;
12945     if (longitude > lbp->maxlon + maxDelta) continue;
12946 
12947     cbp = lbp->landmass;
12948     if (cbp == NULL) continue;
12949 
12950     if (StringICmp (country, cbp->level0) != 0) continue;
12951     if (/* province != NULL && */ StringICmp (province, cbp->level1) != 0) continue;
12952 
12953     if (longitude < lbp->minlon) {
12954       x = lbp->minlon;
12955     } else if (longitude > lbp->maxlon) {
12956       x = lbp->maxlon;
12957     } else {
12958       x = longitude;
12959     }
12960 
12961     y = lbp->lat;
12962 
12963     delta = DistanceOnGlobe (lat, lon, (FloatHi) (y / csp->scale), (FloatHi) (x / csp->scale));
12964 
12965     if (best == NULL || delta < closest) {
12966       best = cbp;
12967       closest = delta;
12968     }
12969   }
12970 
12971   if (best != NULL) {
12972     if (distanceP != NULL) {
12973       *distanceP = closest;
12974     }
12975   }
12976 
12977   return best;
12978 }
12979 
CountryToLatLonDistance(CharPtr country,CharPtr province,FloatHi lat,FloatHi lon,FloatHi range,FloatHi PNTR distanceP)12980 static CtyBlockPtr CountryToLatLonDistance (
12981   CharPtr country,
12982   CharPtr province,
12983   FloatHi lat,
12984   FloatHi lon,
12985   FloatHi range,
12986   FloatHi PNTR distanceP
12987 )
12988 
12989 {
12990   CtrySetPtr  csp;
12991 
12992   csp = GetLatLonCountryData ();
12993   if (csp == NULL) return NULL;
12994 
12995   return RegionIsNearLatLon (country, province, lat, lon, range, distanceP, csp);
12996 }
12997 
WaterToLatLonDistance(CharPtr country,FloatHi lat,FloatHi lon,FloatHi range,FloatHi PNTR distanceP)12998 static CtyBlockPtr WaterToLatLonDistance (
12999   CharPtr country,
13000   FloatHi lat,
13001   FloatHi lon,
13002   FloatHi range,
13003   FloatHi PNTR distanceP
13004 )
13005 
13006 {
13007   CtrySetPtr  csp;
13008 
13009   csp = GetLatLonWaterData ();
13010   if (csp == NULL) return NULL;
13011 
13012   return RegionIsNearLatLon (country, NULL, lat, lon, range, distanceP, csp);
13013 }
13014 
13015 //LCOV_EXCL_START
13016 //map used during regression is too good, no areas of data insufficiency
CountryIsNearLatLon(CharPtr country,FloatHi lat,FloatHi lon,FloatHi range,FloatHi PNTR distanceP)13017 NLM_EXTERN Boolean CountryIsNearLatLon (
13018   CharPtr country,
13019   FloatHi lat,
13020   FloatHi lon,
13021   FloatHi range,
13022   FloatHi PNTR distanceP
13023 )
13024 
13025 {
13026   CtyBlockPtr  cbp;
13027 
13028   cbp = CountryToLatLonDistance (country, NULL, lat, lon, range, distanceP);
13029   if (cbp == NULL) return FALSE;
13030 
13031   return TRUE;
13032 }
13033 
13034 //map used during regression is too good, no areas of data insufficiency
WaterIsNearLatLon(CharPtr country,FloatHi lat,FloatHi lon,FloatHi range,FloatHi PNTR distanceP)13035 NLM_EXTERN Boolean WaterIsNearLatLon (
13036   CharPtr country,
13037   FloatHi lat,
13038   FloatHi lon,
13039   FloatHi range,
13040   FloatHi PNTR distanceP
13041 )
13042 
13043 {
13044   CtyBlockPtr  cbp;
13045 
13046   cbp = WaterToLatLonDistance (country, lat, lon, range, distanceP);
13047   if (cbp == NULL) return FALSE;
13048 
13049   return TRUE;
13050 }
13051 //LCOV_EXCL_STOP
13052 
13053 /*
13054 static void WriteLatLonRegionData (
13055   CtrySetPtr csp,
13056   FILE* fp
13057 )
13058 
13059 {
13060   Char         buf [150];
13061   CtyBlockPtr  cbp;
13062   LatBlockPtr  lbp;
13063   ValNodePtr   vnp;
13064 
13065   if (csp == NULL || fp == NULL) return;
13066 
13067   for (vnp = csp->latblocks; vnp != NULL; vnp = vnp->next) {
13068     lbp = (LatBlockPtr) vnp->data.ptrvalue;
13069     if (lbp == NULL) {
13070       fprintf (fp, "NULL LatBlockPtr\n");
13071       continue;
13072     }
13073     cbp = lbp->landmass;
13074     if (cbp == NULL) {
13075       fprintf (fp, "NULL CtyBlockPtr\n");
13076       continue;
13077     }
13078 
13079     if (StringHasNoText (cbp->name)) {
13080       fprintf (fp, "NULL cbp->name\n");
13081       continue;
13082     }
13083 
13084     StringNCpy_0 (buf, cbp->name, 50);
13085     StringCat (buf, "                                                  ");
13086     buf [50] = '\0';
13087 
13088     fprintf (fp, "%s %4d : %4d  .. %4d\n", buf, (int) lbp->lat, (int) lbp->minlon, (int) lbp->maxlon);
13089   }
13090 
13091   fprintf (fp, "\n\n");
13092 }
13093 
13094 static void TestLatLonCountryData (void)
13095 
13096 {
13097   CtrySetPtr  csp;
13098   FILE        *fp;
13099 
13100   fp = FileOpen ("stdout", "w");
13101   if (fp == NULL) {
13102     Message (MSG_OK, "Unable to open output file");
13103     return;
13104   }
13105 
13106   csp = GetLatLonCountryData ();
13107   if (csp == NULL) {
13108     fprintf (fp, "GetLatLonCountryData failed\n");
13109     FileClose (fp);
13110     return;
13111   }
13112 
13113   WriteLatLonRegionData (csp, fp);
13114 
13115   csp = GetLatLonWaterData ();
13116   if (csp == NULL) {
13117     fprintf (fp, "GetLatLonWaterData failed\n");
13118     FileClose (fp);
13119     return;
13120   }
13121 
13122   WriteLatLonRegionData (csp, fp);
13123 
13124   FileClose (fp);
13125 }
13126 */
13127 
13128 /* END OF NEW LATITUDE-LONGITUDE COUNTRY VALIDATION CODE */
13129 
StringListIsUnique(ValNodePtr list)13130 static Boolean StringListIsUnique (ValNodePtr list)
13131 
13132 {
13133   CharPtr     last;
13134   ValNodePtr  next;
13135   CharPtr     str;
13136   ValNodePtr  vnp;
13137 
13138   if (list == NULL) return TRUE;
13139   last = (CharPtr) list->data.ptrvalue;
13140   vnp = list->next;
13141   while (vnp != NULL) {
13142     next = vnp->next;
13143     str = (CharPtr) vnp->data.ptrvalue;
13144     if (StringICmp (last, str) == 0) {
13145       return FALSE;
13146     } else {
13147       last = (CharPtr) vnp->data.ptrvalue;
13148     }
13149     vnp = next;
13150   }
13151 
13152   return TRUE;
13153 }
13154 
13155 static CharPtr modified_base_abbrevs [] = {
13156   "<ac4c>",
13157   "<chm5u>",
13158   "<cm>",
13159   "<cmnm5s2u>",
13160   "<cmnm5u>",
13161   "<d>",
13162   "<fm>",
13163   "<gal q>",
13164   "<gm>",
13165   "<i>",
13166   "<i6a>",
13167   "<m1a>",
13168   "<m1f>",
13169   "<m1g>",
13170   "<m1i>",
13171   "<m22g>",
13172   "<m2a>",
13173   "<m2g>",
13174   "<m3c>",
13175   "<m4c>",
13176   "<m5c>",
13177   "<m6a>",
13178   "<m7g>",
13179   "<mam5u>",
13180   "<mam5s2u>",
13181   "<man q>",
13182   "<mcm5s2u>",
13183   "<mcm5u>",
13184   "<mo5u>",
13185   "<ms2i6a>",
13186   "<ms2t6a>",
13187   "<mt6a>",
13188   "<mv>",
13189   "<o5u>",
13190   "<osyw>",
13191   "<p>",
13192   "<q>",
13193   "<s2c>",
13194   "<s2t>",
13195   "<s2u>",
13196   "<s4u>",
13197   "<t>",
13198   "<t6a>",
13199   "<tm>",
13200   "<um>",
13201   "<yw>",
13202   "<x>",
13203   "<OTHER>",
13204   NULL
13205 };
13206 
/*
 * InitializeModBaseFSA
 *
 * Creates a new text FSA with TextFsaNew, stores it in
 * vsp->modifiedBases, and adds every entry of modified_base_abbrevs to
 * it.  vsp is dereferenced without a NULL check.
 */
static void InitializeModBaseFSA (ValidStructPtr vsp)

{
  CharPtr PNTR  entry;
  TextFsaPtr    fsa;

  fsa = TextFsaNew ();
  vsp->modifiedBases = fsa;

  /* walk the NULL-terminated table */
  for (entry = modified_base_abbrevs; *entry != NULL; entry++) {
    TextFsaAdd (fsa, *entry);
  }
}
13217 
AltitudeIsValid(CharPtr name)13218 NLM_EXTERN Boolean AltitudeIsValid (CharPtr name)
13219 
13220 {
13221   Char        ch;
13222   size_t      len;
13223   CharPtr     ptr;
13224 
13225   if (StringHasNoText (name)) return FALSE;
13226   len = StringLen (name);
13227   if (len < 1) return FALSE;
13228 
13229   ptr = name;
13230   ch = *ptr;
13231 
13232   if (ch == '+' || ch == '-') {
13233     ptr++;
13234     ch = *ptr;
13235   }
13236 
13237   if (! IS_DIGIT (ch)) return FALSE;
13238 
13239   ptr++;
13240   ch = *ptr;
13241   while (IS_DIGIT (ch)) {
13242     ptr++;
13243     ch = *ptr;
13244   }
13245 
13246   if (ch == '.') {
13247     ptr++;
13248     ch = *ptr;
13249     if (! IS_DIGIT (ch)) return FALSE;
13250     ptr++;
13251     ch = *ptr;
13252     while (IS_DIGIT (ch)) {
13253       ptr++;
13254       ch = *ptr;
13255     }
13256   }
13257 
13258   if (ch != ' ') return FALSE;
13259   ptr++;
13260   ch = *ptr;
13261   if (ch != 'm') return FALSE;
13262 
13263   /*
13264   ptr++;
13265   ch = *ptr;
13266   if (ch != '.') return FALSE;
13267   */
13268 
13269   ptr++;
13270   ch = *ptr;
13271   if (ch != '\0') return FALSE;
13272 
13273   return TRUE;
13274 }
13275 
13276 static CharPtr type_prefixes [] = {
13277   "type strain",
13278   "neotype strain",
13279   "holotype",
13280   "paratype",
13281   "neotype",
13282   "allotype",
13283   "hapanotype",
13284   "syntype",
13285   "lectotype",
13286   "paralectotype",
13287   "isotype",
13288   "epitype",
13289   "isosyntype",
13290   "ex-type",
13291   "reference strain",
13292   "type material",
13293   NULL
13294 };
13295 
TypeMaterialIsValid(CharPtr name)13296 NLM_EXTERN Boolean TypeMaterialIsValid (CharPtr name)
13297 
13298 {
13299   Int2     i;
13300   size_t   len;
13301   CharPtr  str;
13302 
13303   if (StringHasNoText (name)) return FALSE;
13304 
13305   for (i = 0; type_prefixes [i] != NULL; i++) {
13306     str = type_prefixes [i];
13307     len = StringLen (str);
13308     if (StringNICmp (name, str, len) == 0) return TRUE;
13309   }
13310 
13311   return FALSE;
13312 }
13313 
PrimerSeqIsValid(ValidStructPtr vsp,CharPtr name,Char PNTR badch)13314 static Boolean PrimerSeqIsValid (ValidStructPtr vsp, CharPtr name, Char PNTR badch)
13315 
13316 {
13317   Char        ch;
13318   TextFsaPtr  fsa;
13319   size_t      len;
13320   ValNodePtr  matches;
13321   CharPtr     ptr;
13322   Int4        state;
13323   Boolean     first;
13324 
13325   if (badch != NULL) {
13326     *badch = '\0';
13327   }
13328 
13329   if (vsp == NULL) return FALSE;
13330   if (vsp->modifiedBases == NULL) {
13331     InitializeModBaseFSA (vsp);
13332   }
13333   fsa = vsp->modifiedBases;
13334   if (fsa == NULL) return FALSE;
13335 
13336   if (StringHasNoText (name)) return FALSE;
13337   len = StringLen (name);
13338   if (len < 1) return FALSE;
13339 
13340   if (StringChr (name, ',') != NULL) {
13341     if (name [0] != '(' || name [len - 1] != ')') return FALSE;
13342   } else {
13343     if (StringChr (name, '(') != NULL) return FALSE;
13344     if (StringChr (name, ')') != NULL) return FALSE;
13345   }
13346 
13347   if (StringChr (name, ';') != NULL) return FALSE;
13348   /* if (StringChr (name, ' ') != NULL) return FALSE; */
13349 
13350   ptr = name;
13351   ch = *ptr;
13352   while (ch != '\0') {
13353     if (ch == '<') {
13354       state = 0;
13355       matches = NULL;
13356       first = TRUE;
13357       while (ch != '\0' && ch != '>' && (first || ch != '<')) {
13358         state = TextFsaNext (fsa, state, ch, &matches);
13359         ptr++;
13360         ch = *ptr;
13361         first = FALSE;
13362       }
13363       if (ch != '>' || ch == '<') {
13364         if (badch != NULL) {
13365           *badch = ch;
13366         }
13367         return FALSE;
13368       }
13369       state = TextFsaNext (fsa, state, ch, &matches);
13370       if (matches == NULL) {
13371         if (badch != NULL) {
13372           *badch = ch;
13373         }
13374         return FALSE;
13375       }
13376     } else {
13377       if (ch != '(' && ch != ')' && ch != ',' && ch != ':') {
13378         if (! (IS_ALPHA (ch))) {
13379           if (badch != NULL) {
13380             *badch = ch;
13381           }
13382           return FALSE;
13383         }
13384         ch = TO_UPPER (ch);
13385         if (StringChr ("ABCDGHKMNRSTVWY", ch) == NULL) {
13386           if (badch != NULL) {
13387             ch = TO_LOWER (ch);
13388             *badch = ch;
13389           }
13390           return FALSE;
13391         }
13392       }
13393     }
13394     ptr++;
13395     ch = *ptr;
13396   }
13397 
13398   return TRUE;
13399 }
13400 
13401 /*
13402 static ValNodePtr ParsePrimerSeqIntoComponents (
13403   CharPtr strs
13404 )
13405 
13406 {
13407   Char        ch;
13408   ValNodePtr  head = NULL;
13409   CharPtr     ptr, str, tmp;
13410 
13411   if (StringHasNoText (strs)) return NULL;
13412 
13413   tmp = StringSave (strs);
13414   if (tmp == NULL) return NULL;
13415 
13416   str = tmp;
13417   while (StringDoesHaveText (str)) {
13418     ptr = str;
13419     ch = *ptr;
13420 
13421     while (ch != '\0' && ch != '(' && ch != ')' && ch != ',' && ch != ';' && ch != ':') {
13422       ptr++;
13423       ch = *ptr;
13424     }
13425     if (ch != '\0' && ptr != NULL) {
13426       *ptr = '\0';
13427       ptr++;
13428     }
13429 
13430     TrimSpacesAroundString (str);
13431     if (StringDoesHaveText (str)) {
13432       ValNodeCopyStr (&head, 0, str);
13433     }
13434 
13435     str = ptr;
13436   }
13437 
13438   MemFree (tmp);
13439   return head;
13440 }
13441 
13442 static Boolean PrimerSeqHasDuplicates (CharPtr name)
13443 
13444 {
13445   ValNodePtr  head;
13446   Boolean     rsult = FALSE;
13447 
13448   if (StringHasNoText (name)) return FALSE;
13449 
13450   head = ParsePrimerSeqIntoComponents (name);
13451   if (head == NULL) return FALSE;
13452   head = ValNodeSort (head, SortVnpByString);
13453   if (! StringListIsUnique (head)) {
13454     rsult = TRUE;
13455   }
13456   ValNodeFreeData (head);
13457 
13458   return rsult;
13459 }
13460 */
13461 
CountDigits(CharPtr str)13462 static Int2 CountDigits (CharPtr str)
13463 
13464 {
13465   Char  ch;
13466   Int2  count = 0;
13467 
13468   if (str == NULL) return count;
13469   ch = *str;
13470   while (IS_DIGIT (ch)) {
13471     count++;
13472     str++;
13473     ch = *str;
13474   }
13475   return count;
13476 }
13477 
LatLonIsValid(CharPtr name)13478 static Boolean LatLonIsValid (CharPtr name)
13479 
13480 {
13481   Char     ch;
13482   Int2     count;
13483   CharPtr  str;
13484 
13485   if (StringHasNoText (name)) return FALSE;
13486   str = name;
13487 
13488   count = CountDigits (str);
13489   if (count < 1 || count > 2) return FALSE;
13490   str += count;
13491 
13492   ch = *str;
13493   if (ch == '.') {
13494     str++;
13495     count = CountDigits (str);
13496     if (count != 2) return FALSE;
13497     str += count;
13498   }
13499 
13500   ch = *str;
13501   if (ch != ' ') return FALSE;
13502   str++;
13503   ch = *str;
13504   if (ch != 'N' && ch != 'S') return FALSE;
13505   str++;
13506   ch = *str;
13507   if (ch != ' ') return FALSE;
13508   str++;
13509 
13510   count = CountDigits (str);
13511   if (count < 1 || count > 3) return FALSE;
13512   str += count;
13513 
13514   ch = *str;
13515   if (ch == '.') {
13516     str++;
13517     count = CountDigits (str);
13518     if (count != 2) return FALSE;
13519     str += count;
13520   }
13521 
13522   ch = *str;
13523   if (ch != ' ') return FALSE;
13524   str++;
13525   ch = *str;
13526   if (ch != 'E' && ch != 'W') return FALSE;
13527   str++;
13528 
13529   ch = *str;
13530   if (ch != '\0') return FALSE;
13531 
13532   return TRUE;
13533 }
13534 
/*
 * NULL-terminated list of structured source-qualifier tags.  Each entry is
 * added to the text-search automaton built by InitializeSourceQualTags and
 * is later reported by ValidateSourceQualTags when it turns up inside a
 * free-text source note.
 *
 * NOTE(review): the eight *_primer_name / *_primer_seq entries carry no
 * trailing ':' whereas every other entry does - confirm this is intentional.
 */
static CharPtr source_qual_prefixes [] = {
  "acronym:",
  "altitude:",
  "anamorph:",
  "authority:",
  "biotype:",
  "biovar:",
  "bio_material:",
  "breed:",
  "cell_line:",
  "cell_type:",
  "chemovar:",
  "chromosome:",
  "clone:",
  "clone_lib:",
  "collected_by:",
  "collection_date:",
  "common:",
  "country:",
  "cultivar:",
  "culture_collection:",
  "dev_stage:",
  "dosage:",
  "ecotype:",
  "endogenous_virus_name:",
  "environmental_sample:",
  "forma:",
  "forma_specialis:",
  "frequency:",
  "fwd_pcr_primer_name",
  "fwd_pcr_primer_seq",
  "fwd_primer_name",
  "fwd_primer_seq",
  "genotype:",
  "germline:",
  "group:",
  "haplogroup:",
  "haplotype:",
  "identified_by:",
  "insertion_seq_name:",
  "isolate:",
  "isolation_source:",
  "lab_host:",
  "lat_lon:",
  "left_primer:",
  "linkage_group:",
  "map:",
  "mating_type:",
  "metagenome_source:",
  "metagenomic:",
  "nat_host:",
  "pathovar:",
  "phenotype:",
  "placement:",
  "plasmid_name:",
  "plastid_name:",
  "pop_variant:",
  "rearranged:",
  "rev_pcr_primer_name",
  "rev_pcr_primer_seq",
  "rev_primer_name",
  "rev_primer_seq",
  "right_primer:",
  "segment:",
  "serogroup:",
  "serotype:",
  "serovar:",
  "sex:",
  "specimen_voucher:",
  "strain:",
  "subclone:",
  "subgroup:",
  "substrain:",
  "subtype:",
  "sub_species:",
  "synonym:",
  "taxon:",
  "teleomorph:",
  "tissue_lib:",
  "tissue_type:",
  "transgenic:",
  "transposon_name:",
  "type:",
  "variety:",
  "whole_replicon:",
  NULL
};
13622 
/* Builds vsp->sourceQualTags: a text automaton holding every entry of source_qual_prefixes. */
static void InitializeSourceQualTags (ValidStructPtr vsp)

{
  CharPtr PNTR  tag;

  vsp->sourceQualTags = TextFsaNew ();

  /* the table is NULL-terminated, so walk it by pointer until the sentinel */
  tag = source_qual_prefixes;
  while (*tag != NULL) {
    TextFsaAdd (vsp->sourceQualTags, *tag);
    tag++;
  }
}
13633 
/*
 * Scans a free-text source note for embedded structured qualifier tags
 * (the entries loaded into vsp->sourceQualTags) and raises
 * ERR_SEQ_DESCR_StructuredSourceNote for each one found.
 *
 * A tag is only reported when it starts the note or is immediately preceded
 * by white space or ';', so that a tag name embedded in a longer word is
 * ignored.
 *
 *   vsp  - validator state; vsp->sourceQualTags must already be built,
 *          otherwise the function returns without doing anything
 *   gcp  - unused here
 *   biop - unused here
 *   str  - the note text to scan; empty or NULL text is ignored
 */
static void ValidateSourceQualTags (ValidStructPtr vsp, GatherContextPtr gcp, BioSourcePtr biop, CharPtr str)

{
  Int4        before;
  Char        ch;
  CharPtr     hit;
  Int4        hit_len;
  ValNodePtr  matches;
  Boolean     okay;
  CharPtr     ptr;
  Int4        state;

  if (vsp->sourceQualTags == NULL || StringHasNoText (str)) return;

  state = 0;
  for (ptr = str; *ptr != '\0'; ptr++) {
    matches = NULL;
    state = TextFsaNext (vsp->sourceQualTags, state, *ptr, &matches);
    if (matches == NULL) continue;

    hit = (CharPtr) matches->data.ptrvalue;
    if (StringHasNoText (hit)) {
      hit = "?";
    }
    hit_len = (Int4) StringLen (hit);

    /*
     * ptr sits on the last character of the tag, so the character just in
     * front of the tag is at index (ptr - str) - hit_len.  Working with an
     * index avoids forming a pointer before the start of str, and the >= 0
     * test also covers a tag that begins at index 1 (the previous code used
     * a strict comparison and skipped the check for that position).
     */
    before = (Int4) (ptr - str) - hit_len;
    okay = TRUE;
    if (before >= 0) {
      ch = str [before];
      if ((! IS_WHITESP (ch)) && ch != ';') {
        okay = FALSE;
      }
    }

    if (okay) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_StructuredSourceNote,
                "Source note has structured tag '%s'", hit);
    }
  }
}
13674 
13675 
GetOrgModWarning(Uint2 subtype)13676 static CharPtr GetOrgModWarning (Uint2 subtype)
13677 {
13678   CharPtr warning = NULL;
13679 
13680   switch (subtype) {
13681     /*
13682     case ORGMOD_biovar:
13683       warning = "Biovar value specified is not found in taxname";
13684       break;
13685     */
13686     case ORGMOD_forma:
13687       warning = "Forma value specified is not found in taxname";
13688       break;
13689     case ORGMOD_forma_specialis:
13690       warning = "Forma specialis value specified is not found in taxname";
13691       break;
13692     /*
13693     case ORGMOD_pathovar:
13694       warning = "Pathovar value specified is not found in taxname";
13695       break;
13696     */
13697     case ORGMOD_sub_species:
13698       warning = "Subspecies value specified is not found in taxname";
13699       break;
13700     case ORGMOD_variety:
13701       warning = "Variety value specified is not found in taxname";
13702       break;
13703   }
13704   return warning;
13705 }
13706 
13707 
/*
 * Checks that the value of an OrgMod (mod->subname) occurs as a whole word in
 * the part of the taxname that follows its first two space-separated words.
 *
 *   vsp       - validator state used for reporting
 *   mod       - the modifier whose subname is looked up
 *   taxname   - organism name to search
 *   varietyOK - TRUE when a variety modifier was already found in the
 *               taxname; suppresses the warning for a missing sub_species
 *
 * Returns FALSE when vsp or mod is NULL, or when the value is not found and
 * the subtype is one that GetOrgModWarning covers; returns TRUE otherwise
 * (including "not found" for subtypes that are not checked).
 */
static Boolean ValidateOrgModInTaxName (ValidStructPtr vsp, OrgModPtr mod, CharPtr taxname, Boolean varietyOK)
{
  CharPtr cp, f, warn;
  Int4    word_len, name_len;

  if (vsp == NULL || mod == NULL) return FALSE;

  name_len = StringLen (mod->subname);

  /* skip first word */
  word_len = StringCSpn (taxname, " ");
  cp = taxname + word_len;
  cp += StringSpn (cp, " ");
  /* skip second word */
  word_len = StringCSpn (cp, " ");
  cp += word_len;
  cp += StringSpn (cp, " ");

  /*
   * Accept only hits that are not glued to a letter on either side.
   * The casts to unsigned char keep isalpha well defined for characters
   * with the high bit set.
   */
  f = StringSearch (cp, mod->subname);
  while (f != NULL &&
         ((f != cp && isalpha ((unsigned char) *(f - 1))) ||
          isalpha ((unsigned char) *(f + name_len)))) {
    f = StringSearch (f + 1, mod->subname);
  }

  if (f == NULL) {
    warn = GetOrgModWarning (mod->subtype);
    if (warn != NULL) {
      /* variety is sorted before sub_species, so if variety was okay in taxname, can ignore missing sub_species */
      if (mod->subtype == ORGMOD_sub_species && varietyOK) return FALSE;
      /* pass the text as an argument, never as the format string itself */
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "%s", warn);
      return FALSE;
    }
  }

  return TRUE;
}
13742 
13743 /* institution:collection is now stored as a ValNode list of strings, sorted and indexed */
13744 
/* set by SetupInstCollTable when institution_codes.txt cannot be opened, so later calls return at once */
static Boolean inst_code_not_found = FALSE;

/* list of institution code strings read from the file; node->choice holds the type bits */
static ValNodePtr    ic_code_list = NULL;
/* array of pointers to the strings owned by ic_code_list, in sorted list order */
static CharPtr PNTR  ic_code_data = NULL;
/* parallel array: ic_code_type [i] is the type bit mask for ic_code_data [i] */
static Uint1 PNTR    ic_code_type = NULL;
/* number of entries in ic_code_list (and in both arrays) */
static Int4          ic_code_len = 0;

/* bit flags, set from the letters 'b', 'c' and 's' in the second column of the file */
#define BIO_MATERIAL_TYPE       1
#define CULTURE_COLLECTION_TYPE 2
#define SPECIMEN_VOUCHER_TYPE   4
13755 
SetupInstCollTable(void)13756 static void SetupInstCollTable (void)
13757 
13758 {
13759   FileCache   fc;
13760   CharPtr     file = "institution_codes.txt";
13761   FILE        *fp = NULL;
13762   Int4        i;
13763   ValNodePtr  last = NULL;
13764   Char        line [512];
13765   Char        path [PATH_MAX];
13766   CharPtr     ptr;
13767   ErrSev      sev;
13768   CharPtr     str;
13769   CharPtr     tmp;
13770   Uint1       type;
13771   ValNodePtr  vnp;
13772 
13773   if (ic_code_data != NULL) return;
13774   if (inst_code_not_found) return;
13775 
13776   if (FindPath ("ncbi", "ncbi", "data", path, sizeof (path))) {
13777     FileBuildPath (path, NULL, file);
13778     sev = ErrSetMessageLevel (SEV_ERROR);
13779     fp = FileOpen (path, "r");
13780     ErrSetMessageLevel (sev);
13781   }
13782 
13783   if (fp == NULL) {
13784     inst_code_not_found = TRUE;
13785     return;
13786   }
13787 
13788   FileCacheSetup (&fc, fp);
13789 
13790   str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
13791   while (str != NULL) {
13792     if (StringDoesHaveText (str)) {
13793       type = 0;
13794       ptr = StringChr (str, '\t');
13795       if (ptr != NULL) {
13796         *ptr = '\0';
13797         ptr++;
13798         tmp = StringChr (ptr, '\t');
13799         if (tmp != NULL) {
13800           *tmp = '\0';
13801           if (StringChr (ptr, 'b') != NULL) {
13802             type |= BIO_MATERIAL_TYPE;
13803           }
13804           if (StringChr (ptr, 'c') != NULL) {
13805             type |= CULTURE_COLLECTION_TYPE;
13806           }
13807           if (StringChr (ptr, 's') != NULL) {
13808             type |= SPECIMEN_VOUCHER_TYPE;
13809           }
13810         }
13811       }
13812       TrimSpacesAroundString (str);
13813       vnp = ValNodeCopyStr (&last, type, str);
13814       if (ic_code_list == NULL) {
13815         ic_code_list = vnp;
13816       }
13817       last = vnp;
13818     }
13819     str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
13820   }
13821 
13822   FileClose (fp);
13823 
13824   ic_code_len = ValNodeLen (ic_code_list);
13825   if (ic_code_len > 0) {
13826     ic_code_list = ValNodeSort (ic_code_list, SortVnpByString);
13827     ic_code_data = (CharPtr PNTR) MemNew (sizeof (CharPtr) * (ic_code_len + 1));
13828     if (ic_code_data != NULL) {
13829       for (vnp = ic_code_list, i = 0; vnp != NULL; vnp = vnp->next, i++) {
13830         str = (CharPtr) vnp->data.ptrvalue;
13831         ic_code_data [i] = str;
13832       }
13833     }
13834 
13835     ic_code_type = (Uint1 PNTR) MemNew (sizeof (Uint1) * (ic_code_len + 1));
13836     if (ic_code_type != NULL) {
13837       for (vnp = ic_code_list, i = 0; vnp != NULL; vnp = vnp->next, i++) {
13838         ic_code_type [i] = vnp->choice;
13839       }
13840     }
13841   }
13842 }
13843 
/*
 * Looks up an institution code in the sorted table (loading it on first use).
 *
 * This is a lower-bound binary search using a case-insensitive comparison:
 * it returns the first table entry that does not compare below name, or the
 * last entry when all of them do.  The result is therefore the nearest
 * candidate, not necessarily a match - callers compare it with name
 * themselves.
 * NOTE(review): this relies on SortVnpByString ordering the table
 * consistently with StringICmp - confirm.
 *
 *   name  - institution code to look for
 *   typeP - optional; receives the type bit mask of the returned entry,
 *           or 0 when nothing is returned
 *
 * Returns a pointer to the table's own string (not to be freed by the
 * caller), or NULL when the table is empty or could not be built.
 */
static CharPtr CheckInstCollName (CharPtr name, Uint1Ptr typeP)

{
  Int4  L, R, mid;

  SetupInstCollTable ();

  if (typeP != NULL) {
    *typeP = 0;
  }

  /* no table: file missing, empty, or the index array could not be allocated */
  if (ic_code_data == NULL || ic_code_len < 1) return NULL;

  L = 0;
  R = ic_code_len - 1;
  while (L < R) {
    mid = L + (R - L) / 2;
    if (StringICmp (ic_code_data [mid], name) < 0) {
      L = mid + 1;
    } else {
      R = mid;
    }
  }

  /* the type array is allocated separately and may be missing on its own */
  if (typeP != NULL && ic_code_type != NULL) {
    *typeP = ic_code_type [R];
  }

  return ic_code_data [R];
}
13875 
13876 
13877 //LCOV_EXCL_START
FixOrgModVoucher(OrgModPtr mod)13878 NLM_EXTERN Boolean FixOrgModVoucher (OrgModPtr mod)
13879 {
13880   Boolean  rval = FALSE;
13881   CharPtr  cpy, inst = NULL, id = NULL, ptr, ptr2, match;
13882   Uint1    type = 0, allowed_type = 0;
13883 
13884   if (mod == NULL || StringHasNoText (mod->subname)) {
13885     return FALSE;
13886   }
13887 
13888   switch (mod->subtype) {
13889     case ORGMOD_bio_material:
13890       type = BIO_MATERIAL_TYPE;
13891       break;
13892     case ORGMOD_culture_collection:
13893       type = CULTURE_COLLECTION_TYPE;
13894       break;
13895     case ORGMOD_specimen_voucher:
13896       type = SPECIMEN_VOUCHER_TYPE;
13897       break;
13898     default:
13899       break;
13900   }
13901   if (type == 0) {
13902     return FALSE;
13903   }
13904 
13905   cpy = StringSave(mod->subname);
13906   if (ParseStructuredVoucher (cpy, &inst, &id) && inst != NULL && inst[0] != ':') {
13907     /* see if we need to eliminate unnecessary country code */
13908     match = CheckInstCollName (inst, &type);
13909     if (match == NULL || StringCmp (match, inst) != 0) {
13910         if ((ptr = StringChr(inst, '<')) != NULL
13911             && (ptr2 = StringChr(ptr, '>')) != NULL) {
13912           StringCpy (ptr, ptr2 + 1);
13913           match = CheckInstCollName (inst, &type);
13914           if (match != NULL && StringCmp (match, inst) == 0) {
13915             mod->subname = MemFree (mod->subname);
13916             mod->subname = (CharPtr) MemNew(sizeof (Char) * (StringLen(inst) + StringLen (id) + 2));
13917             sprintf (mod->subname, "%s:%s", inst, id);
13918             rval = TRUE;
13919           }
13920         }
13921     }
13922   } else {
13923 #if 0
13924     /* removed from BasicCleanup */
13925     /* add structure if missing */
13926     ptr = cpy;
13927     inst_len = 0;
13928     while (*ptr != 0 && isalpha(*ptr)) {
13929       ++ptr;
13930       ++inst_len;
13931     }
13932 
13933     if (inst_len >= 3) {
13934       /* can only continue if three or more characters in institution code */
13935       while (*ptr != 0 && isspace (*ptr)) {
13936         ++ptr;
13937       }
13938       if (*ptr != 0) {
13939         id = ptr;
13940         while (*ptr != 0 && isdigit(*ptr)) {
13941           ptr++;
13942         }
13943         if (*ptr == 0 && ptr - id > 0) {
13944           /* can only continue if ID is non-empty and all numbers */
13945           inst = (CharPtr) MemNew (sizeof (Char) * inst_len + 1);
13946           StringNCpy (inst, cpy, inst_len);
13947           inst[inst_len] = 0;
13948           match = CheckInstCollName(inst, &allowed_type);
13949           if (match != NULL && StringCmp (match, inst) == 0 && (type & allowed_type)) {
13950             mod->subname = MemFree (mod->subname);
13951             mod->subname = (CharPtr) MemNew(sizeof (Char) * (StringLen(inst) + StringLen (id) + 2));
13952             sprintf (mod->subname, "%s:%s", inst, id);
13953             rval = TRUE;
13954           }
13955           inst = MemFree (inst);
13956         }
13957       }
13958     }
13959 #endif
13960   }
13961   cpy = MemFree (cpy);
13962   return rval;
13963 }
13964 //LCOV_EXCL_STOP
13965 
13966 
/*
 * Validates the institution / collection part of a voucher-style OrgMod
 * (bio_material, culture_collection, specimen_voucher) against the
 * institution code table and reports at most one problem per call past the
 * parsing stage.
 *
 * Checks, in order:
 *   - unstructured value (no ':'): error for culture_collection only
 *   - missing institution code and/or missing identifier
 *   - exact table match: optionally report that the code belongs to a
 *     different voucher type
 *   - match differing only in capitalization
 *   - "personal" collection without a collector name
 *   - code that needs, or should not have, a <COUNTRY> qualifier
 *   - institution:collection where the collection is not in the table
 *     ("DNA" is accepted for any institution, as bio_material)
 *   - code not in the table at all
 *
 *   vsp - validator state used for reporting
 *   mod - modifier to check; only its subtype and subname are read
 */
static void ValidateOrgModVoucher (ValidStructPtr vsp, OrgModPtr mod)

{
  Char     buf [512];
  CharPtr  inst = NULL, id = NULL, coll = NULL, ptr, str;
  size_t   len1, len2;
  Uint1    type;

  if (vsp == NULL || mod == NULL) return;

  /* parse a local copy; ParseStructuredVoucher writes terminators into it */
  StringNCpy_0 (buf, mod->subname, sizeof (buf));
  if (StringChr (buf, ':') == NULL) {
    if (mod->subtype == ORGMOD_culture_collection) {
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_UnstructuredVoucher, "Culture_collection should be structured, but is not");
    }
    return;
  }
  if (! ParseStructuredVoucher (buf, &inst, &id) || inst == NULL || inst[0] == ':') {
    if (StringHasNoText (inst) || inst [0] == ':') {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadInstitutionCode, "Voucher is missing institution code");
    }
    if (StringHasNoText (id)) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadVoucherID, "Voucher is missing specific identifier");
    }
    return;
  }
  /* inst is known to be non-NULL from here on */

  /* str is the nearest table entry, which may or may not equal inst */
  str = CheckInstCollName (inst, &type);
  if (StringCmp (str, inst) == 0) {
    /* known institution: complain only if it is not registered for this voucher type */
    if ((mod->subtype == ORGMOD_bio_material && (type & BIO_MATERIAL_TYPE) == 0) ||
        (mod->subtype == ORGMOD_culture_collection && (type & CULTURE_COLLECTION_TYPE) == 0) ||
        (mod->subtype == ORGMOD_specimen_voucher && (type & SPECIMEN_VOUCHER_TYPE) == 0)) {
      if ((type & BIO_MATERIAL_TYPE) != 0) {
        ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_WrongVoucherType, "Institution code %s should be bio_material", inst);
      } else if ((type & CULTURE_COLLECTION_TYPE) != 0) {
        ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_WrongVoucherType, "Institution code %s should be culture_collection", inst);
      } else if ((type & SPECIMEN_VOUCHER_TYPE) != 0) {
        ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_WrongVoucherType, "Institution code %s should be specimen_voucher", inst);
      }
    }
    return;
  }

  if (StringICmp (str, inst) == 0) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadInstitutionCode,
              "Institution code %s exists, but correct capitalization is %s", inst, str);
    return;
  }

  /* previously ignored personal collections, now complain if name missing */
  if (StringNICmp (inst, "personal", 8) == 0) {
    /* only reported when the table lookup returned an entry */
    if (StringICmp (inst, "personal") == 0 && StringLen (str) > 0) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MissingPersonalCollectionName,
                "Personal collection does not have name of collector");
    }
    return;
  }

  len1 = StringLen (inst);
  len2 = StringLen (str);

  /* nearest table entry is inst followed by '<': the code exists only in qualified form */
  if (len1 < len2) {
    if (StringNICmp (str, inst, len1) == 0 && str [len1] == '<') {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadInstitutionCode, "Institution code %s needs to be qualified with a <COUNTRY> designation", inst);
      return;
    }
  }

  coll = StringChr (inst, ':');
  if (coll == NULL) {
    /* no collection part: see whether dropping a <COUNTRY> qualifier gives a known code */
    ptr = StringChr (inst, '<');
    if (ptr != NULL) {
      *ptr = '\0';
      str = CheckInstCollName (inst, &type);
      if (StringCmp (str, inst) == 0) {
        /* restore the qualifier so the message shows the code as written */
        *ptr = '<';
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadInstitutionCountry, "Institution code %s should not be qualified with a <COUNTRY> designation", inst);
        return;
      }
      *ptr = '<';
    }
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadInstitutionCode, "Institution code %s is not in list", inst);
    return;
  }

  /* split institution:collection and look up the institution on its own */
  *coll = '\0';
  coll++;
  str = CheckInstCollName (inst, &type);
  if (StringCmp (str, inst) == 0) {
    if (StringCmp (coll, "DNA") == 0) {
      /* DNA is a valid collection for any institution (using bio_material) */
      if (mod->subtype != ORGMOD_bio_material) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_WrongVoucherType, "DNA should be bio_material");
      }
    } else {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadCollectionCode,
                "Institution code %s exists, but collection %s:%s is not in list", inst, inst, coll);
    }
    return;
  }

  len1 = StringLen (inst);
  len2 = StringLen (str);

  if (len1 < len2) {
    if (StringNICmp (str, inst, len1) == 0 && str [len1] == '<') {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadInstitutionCode, "Institution code in %s:%s needs to be qualified with a <COUNTRY> designation", inst, coll);
      return;
    }
  }

  ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadInstitutionCode, "Institution code %s:%s is not in list", inst, coll);
}
14081 
/* returns reconstructed institution:id if valid but institution:collection:id is invalid */
14083 //LCOV_EXCL_START
14084 // not part of validation
RemoveBadInstitutionCollection(OrgModPtr mod)14085 NLM_EXTERN CharPtr RemoveBadInstitutionCollection (OrgModPtr mod)
14086 
14087 {
14088   Char     buf [512];
14089   CharPtr  inst = NULL, id = NULL, coll = NULL, ptr, str;
14090   size_t   len, len1, len2;
14091   Uint1    type;
14092 
14093   if (mod == NULL) return NULL;
14094 
14095   if (mod->subtype != ORGMOD_bio_material && mod->subtype != ORGMOD_culture_collection && mod->subtype != ORGMOD_specimen_voucher) {
14096     return NULL;
14097   }
14098 
14099   StringNCpy_0 (buf, mod->subname, sizeof (buf));
14100   if (StringChr (buf, ':') == NULL) {
14101     return NULL;
14102   }
14103   if (! ParseStructuredVoucher (buf, &inst, &id) || inst == NULL || inst[0] == ':') {
14104     return NULL;
14105   }
14106   if (inst == NULL) return NULL;
14107 
14108   str = CheckInstCollName (inst, &type);
14109   if (StringCmp (str, inst) == 0) {
14110     if ((mod->subtype == ORGMOD_bio_material && (type & BIO_MATERIAL_TYPE) == 0) ||
14111         (mod->subtype == ORGMOD_culture_collection && (type & CULTURE_COLLECTION_TYPE) == 0) ||
14112         (mod->subtype == ORGMOD_specimen_voucher && (type & SPECIMEN_VOUCHER_TYPE) == 0)) {
14113     }
14114     return NULL;
14115   }
14116 
14117   if (StringICmp (str, inst) == 0) {
14118     return NULL;
14119   }
14120 
14121   /* previously ignored personal collections, now complain if name missing */
14122   if (StringNICmp (inst, "personal", 8) == 0) {
14123     return NULL;
14124   }
14125 
14126   len1 = StringLen (inst);
14127   len2 = StringLen (str);
14128 
14129   if (len1 < len2) {
14130     if (StringNICmp (str, inst, len1) == 0 && str [len1] == '<') {
14131       return NULL;
14132     }
14133   }
14134 
14135   coll = StringChr (inst, ':');
14136   if (coll == NULL) {
14137     return NULL;
14138   }
14139 
14140   *coll = '\0';
14141   coll++;
14142   str = CheckInstCollName (inst, &type);
14143   if (StringCmp (str, inst) == 0) {
14144     if (StringCmp (coll, "DNA") == 0) {
14145       /* DNA is a valid collection for any institution (using bio_material) */
14146       return NULL;
14147     }
14148     len = StringLen (inst) + StringLen (id) + 10;
14149     ptr = (CharPtr) MemNew (sizeof (Char) * len);
14150     if (ptr != NULL) {
14151       StringCpy (ptr, inst);
14152       StringCat (ptr, ":");
14153       StringCat (ptr, id);
14154       return ptr;
14155     }
14156   }
14157 
14158   return NULL;
14159 }
14160 
/* returns reconstructed institution:id if valid but institution<country>:id is invalid */
14162 // not part of validation
RemoveBadInstitutionCountry(OrgModPtr mod)14163 NLM_EXTERN CharPtr RemoveBadInstitutionCountry (OrgModPtr mod)
14164 
14165 {
14166   Char     buf [512];
14167   CharPtr  inst = NULL, id = NULL, ctry = NULL, ptr, str;
14168   size_t   len, len1, len2;
14169   Uint1    type;
14170 
14171   if (mod == NULL) return NULL;
14172 
14173   if (mod->subtype != ORGMOD_bio_material && mod->subtype != ORGMOD_culture_collection && mod->subtype != ORGMOD_specimen_voucher) {
14174     return NULL;
14175   }
14176 
14177   StringNCpy_0 (buf, mod->subname, sizeof (buf));
14178   if (StringChr (buf, ':') == NULL) {
14179     return NULL;
14180   }
14181   if (! ParseStructuredVoucher (buf, &inst, &id) || inst == NULL || inst[0] == ':') {
14182     return NULL;
14183   }
14184   if (inst == NULL) return NULL;
14185 
14186   str = CheckInstCollName (inst, &type);
14187   if (StringCmp (str, inst) == 0) {
14188     if ((mod->subtype == ORGMOD_bio_material && (type & BIO_MATERIAL_TYPE) == 0) ||
14189         (mod->subtype == ORGMOD_culture_collection && (type & CULTURE_COLLECTION_TYPE) == 0) ||
14190         (mod->subtype == ORGMOD_specimen_voucher && (type & SPECIMEN_VOUCHER_TYPE) == 0)) {
14191     }
14192     return NULL;
14193   }
14194 
14195   if (StringICmp (str, inst) == 0) {
14196     return NULL;
14197   }
14198 
14199   /* previously ignored personal collections, now complain if name missing */
14200   if (StringNICmp (inst, "personal", 8) == 0) {
14201     return NULL;
14202   }
14203 
14204   len1 = StringLen (inst);
14205   len2 = StringLen (str);
14206 
14207   if (len1 < len2) {
14208     if (StringNICmp (str, inst, len1) == 0 && str [len1] == '<') {
14209       return NULL;
14210     }
14211   }
14212 
14213   ctry = StringChr (inst, '<');
14214   if (ctry == NULL) {
14215     return NULL;
14216   }
14217 
14218   *ctry = '\0';
14219   ctry++;
14220   str = CheckInstCollName (inst, &type);
14221   if (StringCmp (str, inst) == 0) {
14222     len = StringLen (inst) + StringLen (id) + 10;
14223     ptr = (CharPtr) MemNew (sizeof (Char) * len);
14224     if (ptr != NULL) {
14225       StringCpy (ptr, inst);
14226       StringCat (ptr, ":");
14227       StringCat (ptr, id);
14228       return ptr;
14229     }
14230   }
14231 
14232   return NULL;
14233 }
14234 
14235 // not part of validation
VoucherInstitutionIsValid(CharPtr inst)14236 NLM_EXTERN Boolean VoucherInstitutionIsValid (CharPtr inst)
14237 
14238 {
14239   CharPtr  str;
14240   Uint1    type;
14241 
14242   if (StringHasNoText (inst)) return FALSE;
14243 
14244   str = CheckInstCollName (inst, &type);
14245   if (StringCmp (str, inst) == 0) return TRUE;
14246 
14247   return FALSE;
14248 }
14249 //LCOV_EXCL_STOP
14250 
14251 /* works on subname copy that it can change */
14252 
ParseStructuredVoucher(CharPtr subname,CharPtr PNTR inst,CharPtr PNTR id)14253 NLM_EXTERN Boolean ParseStructuredVoucher (
14254   CharPtr subname,
14255   CharPtr PNTR inst,
14256   CharPtr PNTR id
14257 )
14258 
14259 {
14260   CharPtr  ptr;
14261   CharPtr  tmp;
14262 
14263   if (StringHasNoText (subname)) return FALSE;
14264   if (StringLen (subname) < 3) return FALSE;
14265   TrimSpacesAroundString (subname);
14266 
14267   ptr = StringChr (subname, ':');
14268   if (ptr == NULL) return FALSE;
14269 
14270   *inst = subname;
14271 
14272   tmp = StringChr (ptr + 1, ':');
14273   if (tmp != NULL) {
14274     *tmp = '\0';
14275     tmp++;
14276     TrimSpacesAroundString (tmp);
14277     *id = tmp;
14278   } else {
14279     *ptr = '\0';
14280     ptr++;
14281     TrimSpacesAroundString (ptr);
14282     *id = ptr;
14283   }
14284 
14285   if (StringHasNoText (*inst) || StringHasNoText (*id)) return FALSE;
14286 
14287   return TRUE;
14288 }
14289 
/*
 * Checks a lat_lon qualifier value and reports format and range problems.
 *
 * If the whole string is not in the expected format but contains a comma,
 * the part before the first comma is checked again; if that part is well
 * formed, an "extra text" warning is issued and the range checks still run
 * on it.  Precision problems are currently not reported.
 */
static void ValidateLatLon (ValidStructPtr vsp, CharPtr lat_lon)

{
  CharPtr  comma;
  Boolean  fmt_ok = FALSE;
  Char     head [128];
  Boolean  lat_ok = FALSE;
  Boolean  lon_ok = FALSE;
  Boolean  prec_ok = FALSE;

  IsCorrectLatLonFormat (lat_lon, &fmt_ok, &prec_ok, &lat_ok, &lon_ok);

  if (! fmt_ok) {
    /* may have comma and then altitude, so just get lat_lon component */
    StringNCpy_0 (head, lat_lon, sizeof (head));
    comma = StringChr (head, ',');
    if (comma != NULL) {
      *comma = '\0';
      IsCorrectLatLonFormat (head, &fmt_ok, &prec_ok, &lat_ok, &lon_ok);
      if (fmt_ok) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_LatLonFormat, "lat_lon format has extra text after correct dd.dd N|S ddd.dd E|W format");
      }
    }
  }

  if (! fmt_ok) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_LatLonFormat, "lat_lon format is incorrect - should be dd.dd N|S ddd.dd E|W");
    return;
  }

  if (! lat_ok) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_LatLonRange, "latitude value is out of range - should be between 90.00 N and 90.00 S");
  }
  if (! lon_ok) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_LatLonRange, "longitude value is out of range - should be between 180.00 E and 180.00 W");
  }

  /* prec_ok is computed but deliberately not acted on: the precision message is disabled */
}
14329 
14330 
/*
 * Consistency checks between molecule type and source for an HIV record:
 *   - a DNA Bioseq whose BioSource genome is not proviral gets a warning
 *   - an RNA Bioseq whose first MolInfo descriptor says mRNA gets an info note
 * Nothing is reported when vsp, biop or bsp is NULL.
 */
static void ValidateLocationForHIV (ValidStructPtr vsp, BioSourcePtr biop, BioseqPtr bsp)
{
  SeqMgrDescContext  dcontext;
  MolInfoPtr         mip;
  SeqDescrPtr        sdp;

  if (vsp == NULL || biop == NULL) return;
  if (bsp == NULL) return;

  if (bsp->mol == Seq_mol_dna) {
    if (biop->genome != GENOME_proviral) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "HIV with moltype DNA should be proviral");
    }
    return;
  }

  if (bsp->mol != Seq_mol_rna) return;

  /* RNA: look at the first MolInfo descriptor that applies to this Bioseq */
  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
  if (sdp == NULL) return;

  mip = (MolInfoPtr) sdp->data.ptrvalue;
  if (mip == NULL) return;

  if (mip->biomol == MOLECULE_TYPE_MRNA) {
    ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_BioSourceInconsistency, "HIV with mRNA molecule type is rare");
  }
}
14356 
14357 /*****************************************************************************
14358 *
14359 *   ValidateSeqDescrContext(gcp)
14360 *      Gather callback helper function for validating context on a Bioseq
14361 *
14362 *****************************************************************************/
UnbalancedParentheses(CharPtr str)14363 static Boolean UnbalancedParentheses (CharPtr str)
14364 
14365 {
14366   Char  ch;
14367   Int4  fwd_par = 0, rev_par = 0, fwd_bkt = 0, rev_bkt = 0;
14368 
14369   if (StringHasNoText (str)) return FALSE;
14370 
14371   ch = *str;
14372   while (ch != '\0') {
14373     if (ch == '(') {
14374       fwd_par++;
14375     } else if (ch == ')') {
14376       rev_par++;
14377     } else if (ch == '[') {
14378       fwd_bkt++;
14379     } else if (ch == ']') {
14380       rev_bkt++;
14381     }
14382     if (fwd_par < rev_par) return TRUE;
14383     if (fwd_bkt < rev_bkt) return TRUE;
14384     str++;
14385     ch = *str;
14386   }
14387 
14388   if (fwd_par != rev_par) return TRUE;
14389   if (fwd_bkt != rev_bkt) return TRUE;
14390 
14391   return FALSE;
14392 }
14393 
/*
 * NULL-terminated list of SGML entity strings (&gt; &lt; &amp; and the
 * Greek-letter entities).  InitializeSgmlStringsFSA loads these into the
 * automaton that StringHasSgml uses as a first-pass filter.
 */
static CharPtr sgml_strings [] = {
  "&gt;",
  "&lt;",
  "&amp;",
  "&agr;",
  "&Agr;",
  "&bgr;",
  "&Bgr;",
  "&ggr;",
  "&Ggr;",
  "&dgr;",
  "&Dgr;",
  "&egr;",
  "&Egr;",
  "&zgr;",
  "&Zgr;",
  "&eegr;",
  "&EEgr;",
  "&thgr;",
  "&THgr;",
  "&igr;",
  "&Igr;",
  "&kgr;",
  "&Kgr;",
  "&lgr;",
  "&Lgr;",
  "&mgr;",
  "&Mgr;",
  "&ngr;",
  "&Ngr;",
  "&xgr;",
  "&Xgr;",
  "&ogr;",
  "&Ogr;",
  "&pgr;",
  "&Pgr;",
  "&rgr;",
  "&Rgr;",
  "&sgr;",
  "&Sgr;",
  "&sfgr;",
  "&tgr;",
  "&Tgr;",
  "&ugr;",
  "&Ugr;",
  "&phgr;",
  "&PHgr;",
  "&khgr;",
  "&KHgr;",
  "&psgr;",
  "&PSgr;",
  "&ohgr;",
  "&OHgr;",
  NULL
};
14449 
/* Builds vsp->sgmlStrings: a text automaton holding every entry of sgml_strings. */
static void InitializeSgmlStringsFSA (ValidStructPtr vsp)

{
  CharPtr PNTR  entity;

  vsp->sgmlStrings = TextFsaNew ();

  /* the table is NULL-terminated, so walk it by pointer until the sentinel */
  entity = sgml_strings;
  while (*entity != NULL) {
    TextFsaAdd (vsp->sgmlStrings, *entity);
    entity++;
  }
}
14460 
StringHasSgml(ValidStructPtr vsp,CharPtr str)14461 static Boolean StringHasSgml (ValidStructPtr vsp, CharPtr str)
14462 
14463 {
14464   Int2        ascii_len;
14465   Char        buf [256];
14466   Char        ch;
14467   TextFsaPtr  fsa;
14468   ValNodePtr  matches;
14469   Boolean     not_sgml;
14470   CharPtr     ptr;
14471   ErrSev      sev;
14472   Int4        state;
14473 
14474   if (StringHasNoText (str)) return FALSE;
14475   if (StringChr (str, '&') == NULL) return FALSE;
14476 
14477   if (vsp == NULL) return FALSE;
14478   if (vsp->sgmlStrings == NULL) {
14479     InitializeSgmlStringsFSA (vsp);
14480   }
14481   fsa = vsp->sgmlStrings;
14482   if (fsa == NULL) return FALSE;
14483 
14484   not_sgml = TRUE;
14485   state = 0;
14486   matches = NULL;
14487   for (ptr = str, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
14488     state = TextFsaNext (fsa, state, ch, &matches);
14489     if (matches != NULL) {
14490       not_sgml = FALSE;
14491     }
14492   }
14493   if (not_sgml) return FALSE;
14494 
14495   sev = ErrSetMessageLevel (SEV_REJECT);
14496   ascii_len = Sgml2AsciiLen (str);
14497   if (ascii_len + 2 >= sizeof (buf)) {
14498     ErrSetMessageLevel (sev);
14499     return FALSE;
14500   }
14501 
14502   buf [0] = '\0';
14503   Sgml2Ascii (str, buf, ascii_len + 1);
14504   ErrSetMessageLevel (sev);
14505 
14506   if (StringHasNoText (buf)) return FALSE;
14507   if (StringCmp (str, buf) == 0) return FALSE;
14508 
14509   return TRUE;
14510 }
14511 
/*
 * NULL-terminated list of accepted values for the sex qualifier.
 * IsValidSexValue compares against these case-insensitively.
 */
static CharPtr valid_sex_values [] = {
  "female",
  "male",
  "hermaphrodite",
  "unisexual",
  "bisexual",
  "asexual",
  "intersex",
  "mixed",
  "monoecious",
  "monecious",
  "dioecious",
  "diecious",
  "neuter",
  "pooled males and females",
  "pooled male and female",
  NULL
};
14530 
IsValidSexValue(CharPtr str)14531 static Boolean IsValidSexValue (CharPtr str)
14532 
14533 {
14534   int  i;
14535 
14536   if (StringHasNoText (str)) return FALSE;
14537 
14538   for (i = 0; valid_sex_values [i] != NULL; i++) {
14539     if (StringICmp (str, valid_sex_values [i]) == 0) return TRUE;
14540   }
14541 
14542   return FALSE;
14543 }
14544 
LatLonInRange(FloatHi lat,FloatHi lon)14545 static Boolean LatLonInRange (
14546   FloatHi lat,
14547   FloatHi lon
14548 )
14549 
14550 {
14551   if (lat < -90.0001 || lat > 90.0001) return FALSE;
14552   if (lon < -180.0001 || lon > 180.0001) return FALSE;
14553 
14554   return TRUE;
14555 }
14556 
/*
 * Decides whether the named region is the closest one to a coordinate.
 *
 * Scans the latitude blocks of csp that lie within range of (lat, lon),
 * measures the distance from the coordinate to the nearest point of each
 * block that has a landmass entry, and remembers the landmass name of the
 * closest block.  On an exact tie, a block whose landmass name equals
 * country is preferred.
 *
 *   country   - region name to test; blank text returns FALSE at once
 *   lat, lon  - coordinate to test
 *   range     - search distance; multiplied by csp->scale to get the
 *               window in the table's integer coordinates
 *   distanceP - optional; set to 0.0 on entry and to the closest distance
 *               found when at least one candidate block was seen
 *   csp       - lookup table; NULL (or a NULL latarray) returns FALSE
 *
 * Returns TRUE when the closest landmass name equals country.
 * NOTE(review): units of range and of the reported distance depend on
 * ConvertLat/ConvertLon/DistanceOnGlobe, which are not visible here.
 */
static Boolean RegionIsClosestToLatLon (
  CharPtr country,
  FloatHi lat,
  FloatHi lon,
  FloatHi range,
  FloatHi PNTR distanceP,
  CtrySetPtr csp
)

{
  LatBlockPtr PNTR  array;
  CtyBlockPtr       cbp;
  /* initial upper bound for the running minimum */
  FloatHi           closest = EARTH_RADIUS * CONST_PI * 2;
  CharPtr           guess = NULL;
  FloatHi           delta;
  Int4              latitude;
  Int4              longitude;
  Int4              maxDelta;
  LatBlockPtr       lbp;
  Int4              R;
  Int4              x;
  Int4              y;


  if (StringHasNoText (country)) return FALSE;

  if (distanceP != NULL) {
    *distanceP = 0.0;
  }

  if (csp == NULL) return FALSE;

  array = csp->latarray;
  if (array == NULL) return FALSE;

  /* convert the coordinate and the search window to the table's scaled integers */
  latitude = ConvertLat (lat, csp->scale);
  longitude = ConvertLon (lon, csp->scale);

  maxDelta = (Int4) (range * csp->scale + EPSILON);

  /* start at the index found for the low edge of the latitude window and walk upward */
  for (R = GetLatLonIndex (csp, array, latitude - maxDelta); R < csp->numLatBlocks; R++) {
    lbp = array [R];
    if (lbp == NULL) break;
    /* past the high edge of the latitude window: stop scanning */
    if (latitude + maxDelta < lbp->lat) break;

    /* block's longitude span, widened by the window, does not reach the point */
    if (longitude < lbp->minlon - maxDelta) continue;
    if (longitude > lbp->maxlon + maxDelta) continue;

    cbp = lbp->landmass;
    if (cbp == NULL) continue;

    /* clamp the longitude to the block's span to get its nearest point */
    if (longitude < lbp->minlon) {
      x = lbp->minlon;
    } else if (longitude > lbp->maxlon) {
      x = lbp->maxlon;
    } else {
      x = longitude;
    }

    y = lbp->lat;

    /* NOTE(review): if csp->scale is an integer type these divisions truncate before the cast - confirm */
    delta = DistanceOnGlobe (lat, lon, (FloatHi) (y / csp->scale), (FloatHi) (x / csp->scale));

    if (delta < closest) {
      guess = cbp->name;
      closest = delta;
    } else if (delta == closest) {
      /* tie: let the region being tested win */
      if (StringCmp (country, cbp->name) == 0) {
        guess = cbp->name;
      }
    }
  }

  if (guess != NULL) {
    if (distanceP != NULL) {
      *distanceP = closest;
    }
  }

  if (StringCmp (guess, country) == 0) return TRUE;

  return FALSE;
}
14640 
14641 
CountryIsClosestToLatLon(CharPtr country,FloatHi lat,FloatHi lon,FloatHi range,FloatHi PNTR distanceP)14642 static Boolean CountryIsClosestToLatLon (
14643   CharPtr country,
14644   FloatHi lat,
14645   FloatHi lon,
14646   FloatHi range,
14647   FloatHi PNTR distanceP
14648 )
14649 
14650 {
14651   CtrySetPtr  csp;
14652 
14653   csp = GetLatLonCountryData ();
14654   if (csp == NULL) return FALSE;
14655 
14656   return RegionIsClosestToLatLon (country, lat, lon, range, distanceP, csp);
14657 }
14658 
14659 
/*
 * Adds a scale-dependent allowance to distance and converts the sum to
 * int after adding 0.5.
 *
 *   scale < 1.1            adds 111.19
 *   19.5 < scale < 20.5    adds 5.56
 *   99.5 < scale < 100.5   adds 1.11
 *   any other scale        adds nothing
 *
 * NOTE(review): the allowances look like the size of one map cell at
 * each supported resolution - confirm against the map data.
 */
static int AdjustAndRoundDistance (
  FloatHi distance,
  FloatHi scale
)

{
  FloatHi  allowance = 0.0;

  if (scale < 1.1) {
    allowance = 111.19;
  } else if (scale > 19.5 && scale < 20.5) {
    allowance = 5.56;
  } else if (scale > 99.5 && scale < 100.5) {
    allowance = 1.11;
  }

  distance += allowance;

  return (int) (distance + 0.5);
}
14676 
/*
 * Result record filled in by CalculateLatLonMap for one coordinate pair.
 *
 *   lat, lon          the coordinates that were looked up
 *   fullguess         name of the region the point maps to
 *   guesscountry      level0 of that region
 *   guessprovince     level1 of that region
 *   guesswater        name of the water block containing the point
 *   closestfull       name of the nearest land region
 *   closestcountry    level0 of the nearest land region
 *   closestprovince   level1 of the nearest land region
 *   closestwater      level0 of the nearest water block
 *   claimedfull       name of the region the submitter claimed, when the
 *                     point is not close enough to be treated as inside it
 *   landdistance      AdjustAndRoundDistance result for the nearest land
 *   waterdistance     AdjustAndRoundDistance result for the nearest water
 *   claimeddistance   AdjustAndRoundDistance result for the claimed region
 *
 * String members point at storage owned elsewhere (map data or the
 * caller's country/province buffers); fields that do not apply are left
 * NULL or 0 by the MemSet in CalculateLatLonMap.
 */
typedef struct latlonmap {
  FloatHi  lat;
  FloatHi  lon;
  CharPtr  fullguess;
  CharPtr  guesscountry;
  CharPtr  guessprovince;
  CharPtr  guesswater;
  CharPtr  closestfull;
  CharPtr  closestcountry;
  CharPtr  closestprovince;
  CharPtr  closestwater;
  CharPtr  claimedfull;
  int      landdistance;
  int      waterdistance;
  int      claimeddistance;
} LatLonMap, PNTR LatLonMapPtr;
14693 
/*
 * Fills *lmp with what the map data says about (lat, lon) relative to the
 * claimed country/province.
 *
 *   1. If the point lies inside a country, record it as the guess.
 *   2. Otherwise, if it lies inside a body of water, record the water and
 *      the nearest land within 5.0 of the point.
 *   3. Otherwise record the nearest land and the nearest water.
 *   4. If none of the above agrees with the claimed country/province,
 *      measure the distance to the claimed region: within ErrorDistance
 *      the claim is adopted as the guess, beyond it the claim and its
 *      rounded distance are recorded in claimedfull/claimeddistance.
 *
 * lmp is zeroed first; a NULL lmp makes the call a no-op.  scale is only
 * forwarded to AdjustAndRoundDistance and ErrorDistance.
 */
static void CalculateLatLonMap (
  FloatHi lat,
  FloatHi lon,
  CharPtr country,
  CharPtr province,
  FloatHi scale,
  LatLonMapPtr lmp
)

{
  CtyBlockPtr  hit;
  FloatHi      distToLand = 0.0, distToWater = 0.0, distToClaimed = 0.0;
  Boolean      matched = FALSE;

  if (lmp == NULL) return;

  /* start from an all-zero record, then store the query coordinates */
  MemSet ((Pointer) lmp, 0, sizeof (LatLonMap));

  lmp->lat = lat;
  lmp->lon = lon;

  /* is the point inside a known country? */
  hit = GuessCountryByLatLon (lat, lon, country, province);
  if (hit != NULL) {
    lmp->fullguess = hit->name;
    lmp->guesscountry = hit->level0;
    lmp->guessprovince = hit->level1;
    if (StringICmp (country, lmp->guesscountry) == 0 && (province == NULL || StringICmp (province, lmp->guessprovince) == 0)) {
      matched = TRUE;
    }
  } else {
    /* not on land: is it inside a known body of water? */
    hit = GuessWaterByLatLon (lat, lon, country);
    if (hit == NULL) {
      //LCOV_EXCL_START
      //map used during regression is too good, no areas of data insufficiency
      /* neither land nor water - possibly a coastal inlet or a gap in the data */
      hit = NearestCountryByLatLon (lat, lon, 5.0, &distToLand);
      if (hit != NULL) {
        lmp->closestfull = hit->name;
        lmp->closestcountry = hit->level0;
        lmp->closestprovince = hit->level1;
        lmp->landdistance = AdjustAndRoundDistance (distToLand, scale);
        if (StringICmp (country, lmp->closestcountry) == 0 && (province == NULL || StringICmp (province, lmp->closestprovince) == 0)) {
          matched = TRUE;
        }
      }
      hit = NearestWaterByLatLon (lat, lon, 5.0, &distToWater);
      if (hit != NULL) {
        lmp->closestwater = hit->level0;
        lmp->waterdistance = AdjustAndRoundDistance (distToWater, scale);
        if (StringICmp (country, lmp->closestwater) == 0) {
          matched = TRUE;
        }
      }
      //LCOV_EXCL_STOP
    } else {
      lmp->guesswater = hit->name;
      if (StringICmp (country, lmp->guesswater) == 0) {
        matched = TRUE;
      }
      /*
      nearby land is needed later for the coastal warning (claimed
      country is land) or the proximity message (claimed country is water)
      */
      hit = NearestCountryByLatLon (lat, lon, 5.0, &distToLand);
      if (hit != NULL) {
        lmp->closestfull = hit->name;
        lmp->closestcountry = hit->level0;
        lmp->closestprovince = hit->level1;
        lmp->landdistance = AdjustAndRoundDistance (distToLand, scale);
        if (StringICmp (country, lmp->closestcountry) == 0 && (province == NULL || StringICmp (province, lmp->closestprovince) == 0)) {
          matched = TRUE;
        }
      }
    }
  }

  /* the claim already agrees with what was found - nothing more to measure */
  if (matched) return;

  hit = CountryToLatLonDistance (country, province, lat, lon, 5.0, &distToClaimed);
  if (hit == NULL) {
    /* the claim may name a body of water, which never has a province */
    if (province != NULL) return;
    hit = WaterToLatLonDistance (country, lat, lon, 5.0, &distToClaimed);
    if (hit != NULL) {
      lmp->claimedfull = hit->name;
      lmp->claimeddistance = AdjustAndRoundDistance (distToClaimed, scale);
    }
    return;
  }

  if (distToClaimed < ErrorDistance (lmp->lat, lmp->lon, scale)) {
    /* close enough to be treated as inside the claimed region */
    lmp->guesscountry = country;
    lmp->guessprovince = province;
    lmp->fullguess = hit->name;
  } else {
    lmp->claimedfull = hit->name;
    lmp->claimeddistance = AdjustAndRoundDistance (distToClaimed, scale);
  }
}
14795 
14796 
/*
 * Bit flags OR-ed together in the Uint4 result of ClassifyLatLonMap.
 *
 *   *Match    the claimed country / province / water equals the region
 *             the coordinates map to
 *   *Closest  the claimed country / province / water equals the nearest
 *             region rather than the containing one
 *   Error     returned on its own when no map record was supplied
 *
 * Declared with typedef so that ELatLonClassify names a type, like the
 * sibling ELatLonAdjust, instead of defining an unused file-scope
 * variable of an anonymous enum type.
 */
typedef enum {
  eLatLonClassify_CountryMatch = 1 ,
  eLatLonClassify_ProvinceMatch = 2 ,
  eLatLonClassify_WaterMatch = 4 ,
  eLatLonClassify_CountryClosest = 8 ,
  eLatLonClassify_ProvinceClosest = 16 ,
  eLatLonClassify_WaterClosest = 32 ,
  eLatLonClassify_Error = 256
} ELatLonClassify;
14806 
14807 
ClassifyLatLonMap(CharPtr fullname,CharPtr country,CharPtr province,LatLonMapPtr lmp)14808 static Uint4 ClassifyLatLonMap (
14809   CharPtr fullname,
14810   CharPtr country,
14811   CharPtr province,
14812   LatLonMapPtr lmp
14813 )
14814 
14815 {
14816   Uint4 rval = 0;
14817 
14818   if (lmp == NULL) return eLatLonClassify_Error;
14819 
14820   /* compare guesses or closest regions to indicated country and province */
14821   if (lmp->guesscountry != NULL) {
14822 
14823     /* if top level countries match */
14824     if (StringICmp (country, lmp->guesscountry) == 0) {
14825       rval |= eLatLonClassify_CountryMatch;
14826       /* if both are null, call it a match */
14827       if (StringICmp (province, lmp->guessprovince) == 0) {
14828         rval |= eLatLonClassify_ProvinceMatch;
14829       }
14830     }
14831     /* if they don't match, do they overlap or are closest? */
14832     if (!(rval & eLatLonClassify_CountryMatch)) {
14833       if (StringICmp (country, lmp->closestcountry) == 0) {
14834         rval |= eLatLonClassify_CountryClosest;
14835         if (StringICmp (province, lmp->closestprovince) == 0) {
14836           rval |= eLatLonClassify_ProvinceClosest;
14837         }
14838       }
14839     } else if (!(rval & eLatLonClassify_ProvinceMatch) && province != NULL) {
14840       if (StringICmp (province, lmp->closestprovince) == 0) {
14841         rval |= eLatLonClassify_ProvinceClosest;
14842       }
14843     }
14844   }
14845   if (lmp->guesswater != NULL) {
14846     /* was the non-approved body of water correctly indicated? */
14847     if (StringICmp (country, lmp->guesswater) == 0) {
14848       rval |= eLatLonClassify_WaterMatch;
14849     } else if (StringICmp (country, lmp->closestwater) == 0) {
14850       rval |= eLatLonClassify_WaterClosest;
14851     }
14852   }
14853   if (lmp->closestcountry != NULL && StringICmp (country, lmp->closestcountry) == 0) {
14854     if (lmp->guesscountry == NULL && lmp->guesswater == NULL) {
14855       /* coastal area */
14856       rval |= eLatLonClassify_CountryMatch;
14857       lmp->guesscountry = lmp->closestcountry;
14858       lmp->fullguess = lmp->closestcountry;
14859       if (lmp->closestprovince != NULL && StringICmp (province, lmp->closestprovince) == 0) {
14860         rval |= eLatLonClassify_ProvinceMatch;
14861         lmp->guessprovince = lmp->closestprovince;
14862         lmp->fullguess = lmp->closestfull;
14863       } else if (lmp->closestprovince != NULL && province == NULL) {
14864         lmp->guessprovince = lmp->closestprovince;
14865         lmp->fullguess = lmp->closestfull;
14866       }
14867     } else {
14868       rval |= eLatLonClassify_CountryClosest;
14869       if (lmp->closestprovince != NULL && StringICmp (province, lmp->closestprovince) == 0) {
14870         rval |= eLatLonClassify_ProvinceClosest;
14871       }
14872     }
14873   }
14874   return rval;
14875 }
14876 
14877 
/*
 * Emits the SEV_INFO ERR_SEQ_DESCR_LatLonWater message for a point that
 * maps to water (lmp->guesswater).
 *
 *   test      classification mask from ClassifyLatLonMap
 *   neardist  distance to the claimed region, used when > 0 and the
 *             claim is not the closest country/province
 *   country   not used by this function
 *   province  claimed province, may be NULL
 *   lat_lon   coordinate text echoed in the message
 *   fullname  claimed "country: province" text echoed in the message
 *   scale     forwarded to AdjustAndRoundDistance
 *
 * When the claimed country or province is the nearest land, the message
 * is suppressed if that land is closer than 22 km or the claimed name
 * contains "Island".
 */
static void LatLonWaterErrors (
  ValidStructPtr vsp,
  LatLonMapPtr lmp,
  Uint4 test,
  FloatHi neardist,
  CharPtr country,
  CharPtr province,
  CharPtr lat_lon,
  CharPtr fullname,
  FloatHi scale
  )
{
  CharPtr  fmt;
  CharPtr  region;
  CharPtr  qualifier = "";
  Boolean  quiet;
  Boolean  with_claim = FALSE;

  if ((test & (eLatLonClassify_CountryClosest | eLatLonClassify_ProvinceClosest)) == 0) {
    /* claimed region is not the nearest land: plain "in water" reports */
    if (neardist > 0) {
      fmt = "Lat_lon '%s' is in water '%s', '%s' is %d km away";
      ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_LatLonWater, fmt, lat_lon, lmp->guesswater, fullname, AdjustAndRoundDistance (neardist, scale));
    } else {
      fmt = "Lat_lon '%s' is in water '%s'";
      ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_LatLonWater, fmt, lat_lon, lmp->guesswater);
    }
    return;
  }

  /* policy decision: near-shore points and island names are not reported for now */
  quiet = (Boolean) (lmp->landdistance < 22 || StringStr (fullname, "Island") != NULL);

  if (test & eLatLonClassify_ProvinceClosest) {
    region = fullname;
    qualifier = "designated subregion ";
  } else {
    /* not the closest province, so the closest country must have matched */
    if (province != NULL && vsp->testLatLonSubregion) {
      qualifier = "another subregion ";
      region = lmp->closestfull;
    } else {
      region = lmp->closestcountry;
    }
    with_claim = (Boolean) (lmp->claimedfull != NULL);
  }

  if (quiet) return;

  if (with_claim) {
    fmt = "Lat_lon '%s' is closest to %s'%s' at distance %d km, but in water '%s' - claimed region '%s' is at distance %d km";
    ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_LatLonWater, fmt, lat_lon,
              qualifier, region,
              lmp->landdistance, lmp->guesswater,
              lmp->claimedfull, lmp->claimeddistance);
  } else {
    fmt = "Lat_lon '%s' is closest to %s'%s' at distance %d km, but in water '%s'";
    ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_LatLonWater,
              fmt, lat_lon,
              qualifier, region,
              lmp->landdistance, lmp->guesswater);
  }
}
14949 
14950 
/*
 * Emits the SEV_INFO message for a point that maps to a land region
 * (lmp->fullguess) other than the claimed one.
 *
 * ERR_SEQ_DESCR_LatLonState is used when the country agrees and only the
 * subregion differs, and is issued only if vsp->testLatLonSubregion is
 * set; ERR_SEQ_DESCR_LatLonCountry is used when the country differs.
 * When lmp->claimedfull is set the message also reports the distance to
 * the claimed region.  A few known renamings (Curacao and Sint Maarten
 * versus Netherlands Antilles, Svalbard versus Norway) are ignored.
 */
static void LatLonLandErrors (
  ValidStructPtr vsp,
  LatLonMapPtr lmp,
  CharPtr country,
  CharPtr province,
  CharPtr lat_lon,
  CharPtr fullname
  )
{
  CharPtr  fmt;
  Boolean  renamed;

  if (lmp->claimedfull == NULL) {
    fmt = "Lat_lon '%s' maps to '%s' instead of '%s'";

    if (StringICmp (lmp->guesscountry, country) == 0) {
      /* right country, wrong subregion */
      if (vsp->testLatLonSubregion) {
        ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_LatLonState, fmt, lat_lon, lmp->fullguess, fullname);
      }
      return;
    }

    /* Svalbard claimed as part of Norway is accepted */
    if (StringNCmp (fullname, "Norway: Svalbard", 16) == 0 && StringCmp (lmp->fullguess, "Svalbard") == 0) {
      return;
    }

    ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_LatLonCountry, fmt, lat_lon, lmp->fullguess, fullname);
    return;
  }

  fmt = "Lat_lon '%s' maps to '%s' instead of '%s' - claimed region '%s' is at distance %d km";

  if (province != NULL) {
    if (StringICmp (lmp->guesscountry, country) != 0) {
      ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_LatLonCountry, fmt, lat_lon, lmp->fullguess, fullname, lmp->claimedfull, lmp->claimeddistance);
    } else if (vsp->testLatLonSubregion) {
      ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_LatLonState, fmt, lat_lon, lmp->fullguess, fullname, lmp->claimedfull, lmp->claimeddistance);
    }
    return;
  }

  /* changed names, e.g., Curacao split off from Netherlands Antilles, are ignored */
  renamed = (Boolean) (
    (StringCmp (fullname, "Netherlands Antilles: Curacao") == 0 && StringCmp (lmp->fullguess, "Curacao") == 0) ||
    (StringCmp (fullname, "Netherlands Antilles: Sint Maarten") == 0 && StringCmp (lmp->fullguess, "Sint Maarten") == 0) ||
    (StringCmp (lmp->claimedfull, "Netherlands Antilles") == 0 && StringCmp (lmp->fullguess, "Curacao") == 0) ||
    (StringCmp (lmp->claimedfull, "Netherlands Antilles") == 0 && StringCmp (lmp->fullguess, "Sint Maarten") == 0));

  if (! renamed) {
    ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_LatLonCountry, fmt, lat_lon, lmp->fullguess, country, lmp->claimedfull, lmp->claimeddistance);
  }
}
14994 
14995 
/*
 * Coordinate correction reported by CheckForFlippedCoordinates:
 *   none        no alternative interpretation classified successfully
 *   flip        latitude and longitude exchanged
 *   negate_lat  latitude sign reversed
 *   negate_lon  longitude sign reversed
 */
typedef enum {
  eLatLonAdjust_none = 0 ,
  eLatLonAdjust_flip = 1 ,
  eLatLonAdjust_negate_lat = 2 ,
  eLatLonAdjust_negate_lon = 4
} ELatLonAdjust;
15002 
15003 
15004 static void
CheckForFlippedCoordinates(FloatHi lat,FloatHi lon,FloatHi scale,CharPtr country,CharPtr province,CharPtr fullname,LatLonMapPtr adjusted,Uint4Ptr adjust_test,Uint4Ptr adjust)15005 CheckForFlippedCoordinates
15006 (FloatHi lat,
15007  FloatHi lon,
15008  FloatHi scale,
15009  CharPtr country,
15010  CharPtr province,
15011  CharPtr fullname,
15012  LatLonMapPtr adjusted,
15013  Uint4Ptr adjust_test,
15014  Uint4Ptr adjust)
15015 {
15016   *adjust_test = 0;
15017   *adjust = eLatLonAdjust_none;
15018 
15019   /* try using lon for lat, lat for lon */
15020   CalculateLatLonMap (lon, lat, country, province, scale, adjusted);
15021   *adjust_test = ClassifyLatLonMap (fullname, country, province, adjusted);
15022   if (*adjust_test) {
15023     *adjust = eLatLonAdjust_flip;
15024   } else {
15025     /* try negative lat */
15026     CalculateLatLonMap (-lat, lon, country, province, scale, adjusted);
15027     *adjust_test = ClassifyLatLonMap (fullname, country, province, adjusted);
15028     if (*adjust_test) {
15029       *adjust = eLatLonAdjust_negate_lat;
15030     } else {
15031       /* try negative lon */
15032       CalculateLatLonMap (lat, -lon, country, province, scale, adjusted);
15033       *adjust_test = ClassifyLatLonMap (fullname, country, province, adjusted);
15034       if (*adjust_test) {
15035         *adjust = eLatLonAdjust_negate_lon;
15036       }
15037     }
15038   }
15039 }
15040 
/* One row of the sea_parents table: a body of water (sea) and the larger
   body of water (ocean) that FindSurroundingOcean returns for it. */
typedef struct waterpair {
  CharPtr  sea;
  CharPtr  ocean;
} WaterPairData, PNTR WaterPairPtr;
15045 
/*
 * Maps a sea, bay, gulf, strait or channel to its parent body of water.
 *
 * FindSurroundingOcean binary-searches this table on the first column
 * using StringICmp, so rows must stay sorted by sea name in
 * case-insensitive order; insert new rows at their sorted position.
 */
static WaterPairData sea_parents [] = {
  {"Adriatic Sea",         "Mediterranean Sea"},
  {"Aegean Sea",           "Mediterranean Sea"},
  {"Alboran Sea",          "Mediterranean Sea"},
  {"Andaman Sea",          "Indian Ocean"},
  {"Arabian Sea",          "Indian Ocean"},
  {"Argentine Sea",        "Atlantic Ocean"},
  {"Ariake Sea",           "Pacific Ocean"},
  {"Baffin Bay",           "Atlantic Ocean"},
  {"Balearic Sea",         "Mediterranean Sea"},
  {"Baltic Sea",           "Atlantic Ocean"},
  {"Barents Sea",          "Arctic Ocean"},
  {"Bay of Bengal",        "Indian Ocean"},
  {"Beaufort Sea",         "Arctic Ocean"},
  {"Bering Sea",           "Pacific Ocean"},
  {"Bismarck Sea",         "Pacific Ocean"},
  {"Black Sea",            "Mediterranean Sea"},
  {"Bohai Sea",            "Pacific Ocean"},
  {"Caribbean Sea",        "Atlantic Ocean"},
  {"Celebes Sea",          "Pacific Ocean"},
  {"Champlain Sea",        "Atlantic Ocean"},
  {"Chilean Sea",          "Pacific Ocean"},
  {"China Seas",           "Pacific Ocean"},
  {"Chukchi Sea",          "Arctic Ocean"},
  {"Coral Sea",            "Pacific Ocean"},
  {"Davis Strait",         "Atlantic Ocean"},
  {"East China Sea",       "Pacific Ocean"},
  {"East Siberian Sea",    "Arctic Ocean"},
  {"English Channel",      "Atlantic Ocean"},
  {"Erythraean Sea",       "Indian Ocean"},
  {"Greenland Sea",        "Arctic Ocean"},
  {"Gulf of Mexico",       "Atlantic Ocean"},
  {"Gulf of Thailand",     "Pacific Ocean"},
  {"Gulf of Tonkin",       "Pacific Ocean"},
  {"Hudson Bay",           "Arctic Ocean"},
  {"Ionian Sea",           "Mediterranean Sea"},
  {"Irish Sea",            "Atlantic Ocean"},
  {"Irminger Sea",         "Atlantic Ocean"},
  {"James Bay",            "Atlantic Ocean"},
  {"Java Sea",             "Indian Ocean"},
  {"Kara Sea",             "Arctic Ocean"},
  {"Koro Sea",             "Pacific Ocean"},
  {"Labrador Sea",         "Atlantic Ocean"},
  {"Laccadive Sea",        "Indian Ocean"},
  {"Laptev Sea",           "Arctic Ocean"},
  {"Ligurian Sea",         "Mediterranean Sea"},
  {"Lincoln Sea",          "Arctic Ocean"},
  {"Myrtoan Sea",          "Mediterranean Sea"},
  {"North Sea",            "Atlantic Ocean"},
  {"Norwegian Sea",        "Atlantic Ocean"},
  {"Pechora Sea",          "Arctic Ocean"},
  {"Persian Gulf",         "Indian Ocean"},
  {"Philippine Sea",       "Pacific Ocean"},
  {"Red Sea",              "Indian Ocean"},
  {"Salish Sea",           "Pacific Ocean"},
  {"Sargasso Sea",         "Atlantic Ocean"},
  {"Scotia Sea",           "Southern Ocean"},
  {"Sea of Azov",          "Black Sea"},
  {"Sea of Chiloe",        "Pacific Ocean"},
  {"Sea of Crete",         "Mediterranean Sea"},
  {"Sea of Japan",         "Pacific Ocean"},
  {"Sea of Okhotsk",       "Pacific Ocean"},
  {"Sea of the Hebrides",  "Atlantic Ocean"},
  {"Sea of Zanj",          "Indian Ocean"},
  {"Seas of Greenland",    "Atlantic Ocean"},
  {"Sethusamudram",        "Indian Ocean"},
  {"Sibutu Passage",       "Pacific Ocean"},
  {"Solomon Sea",          "Pacific Ocean"},
  {"South China Sea",      "Pacific Ocean"},
  {"Sulu Sea",             "Pacific Ocean"},
  {"Tasman Sea",           "Pacific Ocean"},
  {"Thracian Sea",         "Mediterranean Sea"},
  {"Timor Sea",            "Indian Ocean"},
  {"Tyrrhenian Sea",       "Mediterranean Sea"},
  {"Wandel Sea",           "Arctic Ocean"},
  {"White Sea",            "Arctic Ocean"},
  {"Yellow Sea",           "Pacific Ocean"}
};
15124 
/*
 * Looks up country (here the name of a sea) in sea_parents and returns
 * the associated parent body of water, or NULL when the name is empty
 * or not listed.  The lookup is a case-insensitive binary search, so it
 * relies on sea_parents being sorted by its first column.
 */
static CharPtr FindSurroundingOcean (
  CharPtr country
)

{
  Int2  lo, hi, probe;

  if (StringHasNoText (country)) return NULL;

  lo = 0;
  hi = sizeof (sea_parents) / sizeof (WaterPairData) - 1;

  /* narrow [lo, hi] to the first row whose name is not below country */
  while (lo < hi) {
    probe = (lo + hi) / 2;
    if (StringICmp (sea_parents [probe].sea, country) >= 0) {
      hi = probe;
    } else {
      lo = probe + 1;
    }
  }

  if (StringICmp (sea_parents [hi].sea, country) != 0) {
    return NULL;
  }

  return sea_parents [hi].ocean;
}
15152 
/*
 * Normalizes two USA spellings:
 *   "USA: Washington DC" / "USA: Washington, DC"  ->  "USA: District of Columbia"
 *   "USA:..., Puerto Rico" (text from the first comma on equals
 *   ", Puerto Rico", ignoring case)                ->  "USA: Puerto Rico"
 *
 * The replacement is copied into cbuf and cbuf is returned; the caller
 * must supply a buffer large enough for those strings.  Any other name,
 * any name longer than 400 characters, and any name not starting with
 * "USA:" is returned unchanged.  An empty name yields NULL.
 */
static CharPtr RepairCountryName (
  CharPtr countryname,
  CharPtr cbuf
)

{
  CharPtr  replacement = NULL;

  if (StringHasNoText (countryname)) return NULL;

  if (StringLen (countryname) > 400 || StringNCmp (countryname, "USA:", 4) != 0) {
    return countryname;
  }

  if (StringICmp (countryname, "USA: Washington DC") == 0 || StringICmp (countryname, "USA: Washington, DC") == 0) {
    replacement = "USA: District of Columbia";
  } else if (StringICmp (StringChr (countryname, ','), ", Puerto Rico") == 0) {
    replacement = "USA: Puerto Rico";
  }

  if (replacement == NULL) {
    return countryname;
  }

  StringCpy (cbuf, replacement);
  return cbuf;
}
15178 
/*
 * Checks that the lat_lon qualifier text is consistent with the country
 * qualifier text and reports any disagreement through ValidErr.
 *
 *   vsp          validator state; vsp->testLatLonSubregion controls
 *                whether subregion (state/province) messages are issued
 *   gcp          gather context; only checked for NULL here
 *   countryname  "country[: province][, more][; more]" text
 *   lat_lon      coordinate text, optionally followed by ", altitude"
 *
 * Returns silently when an argument is missing, when the coordinate text
 * is malformed or out of range (reported elsewhere), when the country is
 * unrecognized, or when the map data cannot be obtained.
 *
 * Outline:
 *   1. parse the coordinates and split countryname into country/province
 *   2. quick exit when the claimed country or water contains the point
 *   3. build the map record (CalculateLatLonMap) and classify it
 *   4. if nothing matches, try exchanged/negated coordinates
 *   5. report either the coordinate correction or the mismatch
 */
static void NewerValidateCountryLatLon (
  ValidStructPtr vsp,
  GatherContextPtr gcp,
  CharPtr countryname,
  CharPtr lat_lon
)

{
  Char        buf0 [256], buf1 [256], buf2 [256];
  CharPtr     country = NULL, province = NULL, fullname = NULL, parent;
  CtrySetPtr  csp;
  Boolean     format_ok = FALSE, lat_in_range = FALSE, lon_in_range = FALSE, precision_ok = FALSE;
  FloatHi     lat = 0.0;
  FloatHi     lon = 0.0;
  LatLonMap   llm, adjusted;
  CharPtr     ptr;
  FloatHi     scale = 1.0;
  FloatHi     neardist = 0.0;
  Uint4       adjust = eLatLonAdjust_none, adjust_test = 0;
  /* bit mask of eLatLonClassify_* flags as returned by ClassifyLatLonMap;
     it is not an ELatLonAdjust value, so it is held in a Uint4 */
  Uint4       test = 0;
  CharPtr       fmt;

  if (vsp == NULL || gcp == NULL) return;
  if (StringHasNoText (countryname)) return;
  if (StringHasNoText (lat_lon)) return;

  IsCorrectLatLonFormat (lat_lon, &format_ok, &precision_ok, &lat_in_range, &lon_in_range);
  if (! format_ok) {
    /* may have comma and then altitude, so just get lat_lon component */
    StringNCpy_0 (buf0, lat_lon, sizeof (buf0));
    ptr = StringChr (buf0, ',');
    if (ptr != NULL) {
      *ptr = '\0';
      lat_lon = buf0;
      IsCorrectLatLonFormat (lat_lon, &format_ok, &precision_ok, &lat_in_range, &lon_in_range);
    }
  }

  /* reality checks - do not bail if only precision issue */
  if (! format_ok) {
    /* incorrect lat_lon format should be reported elsewhere */
    return;
  }
  if (! lat_in_range) {
    /* incorrect latitude range should be reported elsewhere */
    return;
  }
  if (! lon_in_range) {
    /* incorrect longitude range should be reported elsewhere */
    return;
  }

  if (! ParseLatLon (lat_lon, &lat, &lon)) {
    /* report unable to parse lat_lon */
    return;
  }

  StringNCpy_0 (buf1, countryname, sizeof (buf1));
  /* trim at comma or semicolon, leaving only country/ocean and possibly state/province */
  ptr = StringChr (buf1, ',');
  if (ptr != NULL) {
    *ptr = '\0';
  }
  ptr = StringChr (buf1, ';');
  if (ptr != NULL) {
    *ptr = '\0';
  }
  TrimSpacesAroundString (buf1);
  if (StringDoesHaveText (buf1)) {
    fullname = buf1;
  }

  StringNCpy_0 (buf2, buf1, sizeof (buf2));
  /* separate country from state/province */
  ptr = StringChr (buf2, ':');
  if (ptr != NULL) {
    if (CountryIsInLatLonList (buf2)) {
      /* store province if in data list as subregion of designated country */
      *ptr = '\0';
      ptr++;
      TrimSpacesAroundString (ptr);
      if (StringDoesHaveText (ptr)) {
        province = ptr;
      }
    } else {
      /* otherwise just truncate country at colon, trimming further descriptive information */
      *ptr = '\0';
      ptr++;
    }
  }
  TrimSpacesAroundString (buf2);
  if (StringDoesHaveText (buf2)) {
    country = buf2;
  }

  if (StringHasNoText (country)) {
    /* report leading colon without country */
    return;
  }

  /* known exceptions - don't even bother calculating any further */
  if (StringCmp (country, "Antarctica") == 0 && lat < -60.0) {
    return;
  }

  if (province != NULL) {
    /* do not attempt quick exit */
  } else if (CountryIsInLatLonList (country)) {
    if (CountryContainsLatLon (country, lat, lon)) return;
  } else if (WaterIsInLatLonList (country)) {
    if (WaterContainsLatLon (country, lat, lon)) return;
  } else if (StringICmp (country, "Palestine") == 0 || StringICmp (country, "State of Palestine") == 0) {
    /* not in either list, but accepted: handled by the Gaza Strip / West Bank check below */
  } else {
    /* report unrecognized country */
    return;
  }

  csp = GetLatLonCountryData ();
  if (csp == NULL) {
    /* report unable to find data */
    return;
  }

  /* scale (reciprocal of degree resolution) needed for adjusting offshore distance calculation */
  scale = csp->scale;

  /* calculate assignment or proximity by coordinates */
  CalculateLatLonMap (lat, lon, country, province, scale, &llm);

  /* a point in a sea is accepted when the claimed name is that sea's parent ocean */
  if (llm.guesscountry == NULL && llm.guesswater != NULL) {
    parent = FindSurroundingOcean (llm.guesswater);
    if (parent != NULL) {
      if (StringICmp (parent, country) == 0) return;
    }
  }

  /* compare indicated country/province to guess/proximate country/water */
  test = ClassifyLatLonMap (fullname, country, province, &llm);

  /* point is on land in some other country: see whether exchanged or negated values fit */
  if (!test /* && lat < 5.0 */ && llm.guesscountry != NULL && llm.guesswater == NULL) {
    CheckForFlippedCoordinates (lat, lon, scale, country, province, fullname, &adjusted, &adjust_test, &adjust);
    if (adjust_test && adjusted.guesscountry != NULL && adjusted.guesswater == NULL) {
      test = adjust_test;
      MemCopy (&llm, &adjusted, sizeof (LatLonMap));
    } else {
      adjust = eLatLonAdjust_none;
    }
  }

  /* still no match, but the claimed country is very near: treat the point as inside it */
  if (!test && CountryIsNearLatLon(country, lat, lon, 2.0, &neardist) && neardist < 5.0) {
    llm.guesscountry = country;
    llm.guessprovince = NULL;
    test = ClassifyLatLonMap (fullname, country, province, &llm);
  }

  /* second attempt at coordinate correction, only when the claimed region is nowhere near */
  if (!test
      && !CountryIsNearLatLon(country, lat, lon, 20.0, &neardist)
      && !WaterIsNearLatLon(country, lat, lon, 20.0, &neardist)
      /* && lat >= 5.0 */ && llm.guesscountry != NULL && llm.guesswater == NULL) {
    CheckForFlippedCoordinates (lat, lon, scale, country, province, fullname, &adjusted, &adjust_test, &adjust);
    if (adjust_test && adjusted.guesscountry != NULL && adjusted.guesswater == NULL) {
      test = adjust_test;
      MemCopy (&llm, &adjusted, sizeof (LatLonMap));
    } else {
      adjust = eLatLonAdjust_none;
    }
  }

  if (adjust) {
    /* a corrected reading matched: report the likely data-entry error instead of a mismatch */
    if (adjust == eLatLonAdjust_flip) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_LatLonValue, "Latitude and longitude values appear to be exchanged");
    } else if (adjust == eLatLonAdjust_negate_lat) {
      if (lat < 0.0) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_LatLonValue, "Latitude should be set to N (northern hemisphere)");
      } else {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_LatLonValue, "Latitude should be set to S (southern hemisphere)");
      }
    } else if (adjust == eLatLonAdjust_negate_lon) {
      if (lon < 0.0) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_LatLonValue, "Longitude should be set to E (eastern hemisphere)");
      } else {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_LatLonValue, "Longitude should be set to W (western hemisphere)");
      }
    }
  } else {
    if ((test & eLatLonClassify_CountryMatch) && (test & eLatLonClassify_ProvinceMatch)) {
      /* success!  nothing to report */
    } else if (test & eLatLonClassify_WaterMatch) {
      /* success!  nothing to report */
    } else if (test & eLatLonClassify_CountryMatch && province == NULL) {
      /* country is right; the map knows a more specific subregion than was claimed */
      if (vsp->testLatLonSubregion) {
        fmt = "Lat_lon %s is in %s (more specific than %s)";
        ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_LatLonState, fmt, lat_lon, llm.fullguess, country);
      }
    } else if (llm.guesswater != NULL) {
      LatLonWaterErrors(vsp, &llm, test, neardist, country, province, lat_lon, fullname, scale);
    } else if (llm.guesscountry != NULL) {
      if (StringICmp (llm.guesscountry, "Hong Kong") == 0 && StringICmp (country, "China") == 0) {
        /* Hong Kong okay as China */
      } else if (StringICmp (llm.guesscountry, "Puerto Rico") == 0 && StringICmp (country, "USA") == 0) {
        /* Puerto Rico okay as USA */
      } else if ((StringICmp (llm.guesscountry, "Gaza Strip") == 0 ||
                 StringICmp (llm.guesscountry, "West Bank") == 0) &&
                (StringICmp (country, "Palestine") == 0 ||
                 StringICmp (country, "State of Palestine") == 0)) {
        /* Gaza Strip and West Bank okay as Palestine */
      } else {
        LatLonLandErrors (vsp, &llm, country, province, lat_lon, fullname);
      }
    } else if (llm.closestcountry != NULL) {
      fmt = "Lat_lon '%s' is closest to '%s' instead of '%s'";
      ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_LatLonCountry, fmt, lat_lon, llm.closestcountry, fullname);
    } else if (llm.closestwater != NULL) {
      fmt = "Lat_lon '%s' is closest to '%s' instead of '%s'";
      ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_LatLonWater, fmt, lat_lon, llm.closestwater, fullname);
    } else {
      fmt = "Unable to determine mapping for lat_lon '%s' and country '%s'";
      ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_LatLonCountry, fmt, lat_lon, fullname);
    }
  }
}
15398 
15399 
15400 /* note - special case for sex because it prevents a different message from being displayed, do not list here */
15401 static const Uint1 sUnexpectedViralSubSourceQualifiers[] = {
15402   SUBSRC_cell_line,
15403   SUBSRC_cell_type,
15404   SUBSRC_tissue_type,
15405   SUBSRC_dev_stage
15406 };
15407 
15408 static const Int4 sNumUnexpectedViralSubSourceQualifiers = sizeof (sUnexpectedViralSubSourceQualifiers) / sizeof (Uint1);
15409 
15410 
IsUnexpectedViralSubSourceQualifier(Uint1 subtype)15411 static Boolean IsUnexpectedViralSubSourceQualifier (Uint1 subtype)
15412 {
15413   Int4 i;
15414   Boolean rval = FALSE;
15415 
15416   for (i = 0; i < sNumUnexpectedViralSubSourceQualifiers && !rval; i++) {
15417     if (subtype == sUnexpectedViralSubSourceQualifiers[i]) {
15418       rval = TRUE;
15419     }
15420   }
15421   return rval;
15422 }
15423 
15424 static const Uint1 sUnexpectedViralOrgModQualifiers[] = {
15425   ORGMOD_breed,
15426   ORGMOD_cultivar,
15427   ORGMOD_specimen_voucher
15428 };
15429 
15430 static const Int4 sNumUnexpectedViralOrgModQualifiers = sizeof (sUnexpectedViralOrgModQualifiers) / sizeof (Uint1);
15431 
15432 
IsUnexpectedViralOrgModQualifier(Uint1 subtype)15433 static Boolean IsUnexpectedViralOrgModQualifier (Uint1 subtype)
15434 {
15435   Int4 i;
15436   Boolean rval = FALSE;
15437 
15438   for (i = 0; i < sNumUnexpectedViralOrgModQualifiers && !rval; i++) {
15439     if (subtype == sUnexpectedViralOrgModQualifiers[i]) {
15440       rval = TRUE;
15441     }
15442   }
15443   return rval;
15444 }
15445 
/*
 * ValGetDbtagStr
 *
 * Produces a printable form of the tag held by a Dbtag:
 *   - the tag's own string when it has one (a pointer into dbt, not a copy);
 *   - otherwise the decimal text of a nonzero integer id, written into buf;
 *   - otherwise (NULL dbt, NULL buf, missing tag, or id of 0) the empty string.
 *
 * buf is caller-supplied scratch space.  Its size is not checked here, so it
 * must be large enough for a long printed with "%ld" plus the terminator.
 */
static CharPtr ValGetDbtagStr (DbtagPtr dbt, CharPtr buf)

{
  ObjectIdPtr  tag;

  if (dbt == NULL || buf == NULL) {
    return "";
  }

  tag = dbt->tag;
  if (tag == NULL) {
    return "";
  }

  /* a string tag takes precedence over the integer id */
  if (tag->str != NULL) {
    return tag->str;
  }

  if (tag->id != 0) {
    sprintf (buf, "%ld", (long) tag->id);
    return buf;
  }

  return "";
}
15464 
15465 /**********************************************************/
s_IfContains(CharPtr name,CharPtr pat)15466 static Boolean s_IfContains(CharPtr name, CharPtr pat)
15467 {
15468     CharPtr p;
15469 
15470     p = StringISearch(name, pat);
15471 
15472     if(p && (p == name || *(p - 1) == ' '))
15473         return(TRUE);
15474     return(FALSE);
15475 }
15476 
ValidateBioSource(ValidStructPtr vsp,GatherContextPtr gcp,BioSourcePtr biop,SeqFeatPtr sfp,ValNodePtr sdp)15477 static void ValidateBioSource (ValidStructPtr vsp, GatherContextPtr gcp, BioSourcePtr biop, SeqFeatPtr sfp, ValNodePtr sdp)
15478 {
15479   Char            badch;
15480   Boolean         bad_cap = FALSE;
15481   Boolean         bad_frequency;
15482   BioseqPtr       bsp = NULL;
15483   BioseqSetPtr    bssp;
15484   Char            buf [32];
15485   Char            cbuf [512];
15486   Char            ch;
15487   Boolean         chromconf = FALSE;
15488   Int2            chromcount = 0;
15489   SubSourcePtr    chromosome = NULL;
15490   CharPtr         countryname = NULL;
15491   CtrySetPtr      csp;
15492   ValNodePtr      db;
15493   DbtagPtr        dbt;
15494   CharPtr         gb_synonym = NULL;
15495   CharPtr         good;
15496   Boolean         has_isolate = FALSE;
15497   Boolean         has_strain = FALSE;
15498   Boolean         has_taxon = FALSE;
15499   Boolean         has_fwd_pcr_seq = FALSE;
15500   Boolean         has_rev_pcr_seq = FALSE;
15501   Boolean         has_pcr_name = FALSE;
15502   Boolean         has_metagenome_source = FALSE;
15503   Int4            id;
15504   Boolean         is_iso_source = FALSE;
15505   Boolean         is_mating_type = FALSE;
15506   Boolean         is_sex = FALSE;
15507   Boolean         is_specific_host = FALSE;
15508   Boolean         isAnimal = FALSE;
15509   Boolean         isArchaea = FALSE;
15510   Boolean         isBacteria = FALSE;
15511   Boolean         isBioSample = FALSE;
15512   Boolean         isEukaryote = FALSE;
15513   Boolean         isFungal = FALSE;
15514   Boolean         isMicrosporidia = FALSE;
15515   Boolean         isPlant = FALSE;
15516   Boolean         isViral = FALSE;
15517   Boolean         is_bc;
15518   Boolean         is_rf;
15519   Boolean         is_sc;
15520   CharPtr         last_db = NULL;
15521   CharPtr         lat_lon = NULL;
15522   size_t          len;
15523   Int2            num_altitude = 0;
15524   Int2            num_bio_material = 0;
15525   Int2            num_collection_dates = 0;
15526   Int2            num_culture_collection = 0;
15527   Int2            num_specimen_voucher = 0;
15528   Int2            num_country = 0;
15529   Int2            num_lat_lon = 0;
15530   Int2            num_fwd_primer_seq = 0;
15531   Int2            num_rev_primer_seq = 0;
15532   Int2            num_fwd_primer_name = 0;
15533   Int2            num_rev_primer_name = 0;
15534   Int2            num_plasmid_name = 0;
15535   Int2            num_germline = 0;
15536   Int2            num_rearranged = 0;
15537   Int2            num_transgenic = 0;
15538   Int2            num_metagenomic = 0;
15539   Int2            num_env_sample = 0;
15540   ObjectIdPtr     oip;
15541   Boolean         old_country = FALSE;
15542   OrgNamePtr      onp;
15543   OrgModPtr       omp, nxtomp;
15544   OrgRefPtr       orp;
15545   ObjValNodePtr   ovp;
15546   Int4            primer_len_before;
15547   Int4            primer_len_after;
15548   ValNodePtr      pset;
15549   SeqEntryPtr     sep;
15550   ErrSev          sev;
15551   SubSourcePtr    ssp;
15552   CharPtr         str;
15553   CharPtr         synonym = NULL;
15554   UserFieldPtr    ufp;
15555   UserObjectPtr   uop;
15556   ValNodePtr      vnp;
15557   Boolean         varietyOK;
15558   CharPtr         inst1, inst2, id1, id2, coll1, coll2;
15559   Char            buf1 [512], buf2 [512];
15560   PCRPrimerPtr      ppp;
15561   PCRReactionSetPtr prp;
15562   SeqMgrDescContext dcontext;
15563   CharPtr p;
15564 
15565   if (vsp == NULL) return;
15566   if (vsp->sourceQualTags == NULL) {
15567     InitializeSourceQualTags (vsp);
15568   }
15569   if (biop == NULL) return;
15570 
15571   if (biop->genome == GENOME_transposon || biop->genome == GENOME_insertion_seq) {
15572     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_ObsoleteSourceLocation,
15573               "Transposon and insertion sequence are no longer legal locations");
15574   }
15575 
15576   if (vsp->indexerVersion && biop->genome == GENOME_chromosome) {
15577     ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_ChromosomeLocation, "INDEXER_ONLY - BioSource location is chromosome");
15578   }
15579 
15580   orp = biop->org;
15581   if (orp != NULL) {
15582 
15583     for (db = orp->db; db != NULL; db = db->next) {
15584       dbt = (DbtagPtr) db->data.ptrvalue;
15585       if (dbt == NULL) continue;
15586       if (StringICmp (dbt->db, "taxon") == 0) {
15587         has_taxon = TRUE;
15588       }
15589     }
15590 
15591     onp = orp->orgname;
15592     if (onp != NULL) {
15593       if (StringNICmp (onp->lineage, "Eukaryota; ", 11) == 0) {
15594         isEukaryote = TRUE;
15595         if (StringNICmp (onp->lineage, "Eukaryota; Metazoa; ", 20) == 0) {
15596           isAnimal = TRUE;
15597         } else if (StringNICmp (onp->lineage, "Eukaryota; Viridiplantae; Streptophyta; Embryophyta; ", 53) == 0 ||
15598                    StringNICmp (onp->lineage, "Eukaryota; Rhodophyta; ", 23) == 0 ||
15599                    StringNICmp (onp->lineage, "Eukaryota; stramenopiles; Phaeophyceae; ", 40) == 0) {
15600           isPlant = TRUE;
15601         } else if (StringNICmp (onp->lineage, "Eukaryota; Fungi; ", 18) == 0) {
15602           isFungal = TRUE;
15603           if (StringNICmp (onp->lineage, "Eukaryota; Fungi; Microsporidia; ", 33) == 0) {
15604             isMicrosporidia = TRUE;
15605           }
15606         }
15607       } else if (StringNICmp (onp->lineage, "Bacteria; ", 10) == 0) {
15608         isBacteria = TRUE;
15609       } else if (StringNICmp (onp->lineage, "Archaea; ", 9) == 0) {
15610         isArchaea = TRUE;
15611       } else if (StringNICmp (onp->lineage, "Viruses; ", 9) == 0) {
15612         isViral = TRUE;
15613       }
15614     }
15615   }
15616 
15617   if (isBacteria) {
15618     bsp = NULL;
15619     if (sfp != NULL) {
15620       bsp = BioseqFindFromSeqLoc (sfp->location);
15621     } else if (sdp != NULL && sdp->extended != 0) {
15622       ovp = (ObjValNodePtr) sdp;
15623       if (ovp->idx.parenttype == OBJ_BIOSEQ) {
15624         bsp = (BioseqPtr) ovp->idx.parentptr;
15625       } else if (ovp->idx.parenttype == OBJ_BIOSEQSET) {
15626         bssp = (BioseqSetPtr) ovp->idx.parentptr;
15627         if (bssp != NULL) {
15628           sep = bssp->seqentry;
15629           if (sep != NULL) {
15630             sep = FindNthBioseq (sep, 1);
15631             if (sep != NULL && IS_Bioseq (sep)) {
15632               bsp = (BioseqPtr) sep->data.ptrvalue;
15633             }
15634           }
15635         }
15636       }
15637     }
15638     if (bsp != NULL) {
15639       vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &dcontext);
15640       while (vnp != NULL) {
15641         uop = (UserObjectPtr) vnp->data.ptrvalue;
15642         if (uop != NULL) {
15643           oip = uop->type;
15644           if (oip != NULL && StringICmp (oip->str, "DBLink") == 0) {
15645             for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
15646               oip = ufp->label;
15647               if (oip == NULL || oip->str == NULL) continue;
15648               if (StringICmp (oip->str, "BioSample") == 0 && (ufp->choice == 1 || ufp->choice == 7)) {
15649                 isBioSample = TRUE;
15650               }
15651             }
15652           }
15653         }
15654         vnp = SeqMgrGetNextDescriptor (bsp, vnp, Seq_descr_user, &dcontext);
15655       }
15656     }
15657   }
15658 
15659   ssp = biop->subtype;
15660   while (ssp != NULL) {
15661     str = ssp->name;
15662     if (StringCmp (str, "N/A") == 0 || StringCmp (str, "Missing") == 0) {
15663       ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Subsource name should not be %s", str);
15664     }
15665     if (ssp->subtype == SUBSRC_country) {
15666       num_country++;
15667       countryname = ssp->name;
15668       if (CountryIsValid (countryname, &old_country, &bad_cap)) {
15669         if (bad_cap) {
15670           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadCountryCapitalization, "Bad country capitalization [%s]", countryname);
15671         }
15672         len = StringLen (countryname);
15673         if (len > 0 && countryname [len - 1] == ':') {
15674           //LCOV_EXCL_START
15675           //BasicCleanup strips colon from end of country name
15676           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadCountryCode, "Colon at end of country name [%s]", countryname);
15677           //LCOV_EXCL_STOP
15678         }
15679       } else {
15680         if (StringHasNoText (countryname)) {
15681           countryname = "?";
15682         }
15683         if (old_country) {
15684           ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_ReplacedCountryCode, "Replaced country name [%s]", countryname);
15685         } else {
15686           /*
15687           sev = SEV_ERROR;
15688           if (vsp->is_barcode_sep && vsp->seqSubmitParent) {
15689             sev = SEV_WARNING;
15690           }
15691           */
15692           sev = SEV_ERROR;
15693           ValidErr (vsp, sev, ERR_SEQ_DESCR_BadCountryCode, "Bad country name [%s]", countryname);
15694         }
15695       }
15696     } else if (ssp->subtype == SUBSRC_chromosome) {
15697       chromcount++;
15698       if (chromosome != NULL) {
15699         if (StringICmp (ssp->name, chromosome->name) != 0) {
15700           chromconf = TRUE;
15701         }
15702       } else {
15703         chromosome = ssp;
15704       }
15705     } else if (ssp->subtype == SUBSRC_transposon_name || ssp->subtype == SUBSRC_insertion_seq_name) {
15706       ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_ObsoleteSourceQual,
15707                 "Transposon name and insertion sequence name are no longer legal qualifiers");
15708     } else if (ssp->subtype == 0) {
15709       //LCOV_EXCL_START
15710       //not valid ASN.1
15711       ValidErr (vsp, SEV_REJECT, ERR_SEQ_DESCR_BadSubSource, "Unknown subsource subtype %d", (int) (ssp->subtype));
15712       //LCOV_EXCL_STOP
15713     } else if (ssp->subtype == SUBSRC_other) {
15714       ValidateSourceQualTags (vsp, gcp, biop, ssp->name);
15715     } else if (ssp->subtype == SUBSRC_germline) {
15716       num_germline++;
15717       str = ssp->name;
15718       if (str == NULL || str [0] != '\0') {
15719         //LCOV_EXCL_START
15720         //BasicCleanup removes germline text
15721         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Germline qualifier should not have descriptive text");
15722         //LCOV_EXCL_STOP
15723       }
15724     } else if (ssp->subtype == SUBSRC_rearranged) {
15725       num_rearranged++;
15726       str = ssp->name;
15727       if (str == NULL || str [0] != '\0') {
15728         //LCOV_EXCL_START
15729         //BasicCleanup removes rearranged text
15730         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Rearranged qualifier should not have descriptive text");
15731         //LCOV_EXCL_STOP
15732       }
15733     } else if (ssp->subtype == SUBSRC_transgenic) {
15734       num_transgenic++;
15735       str = ssp->name;
15736       if (str == NULL || str [0] != '\0') {
15737         //LCOV_EXCL_START
15738         //BasicCleanup removes transgenic text
15739         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Transgenic qualifier should not have descriptive text");
15740         //LCOV_EXCL_STOP
15741       }
15742     } else if (ssp->subtype == SUBSRC_environmental_sample) {
15743       num_env_sample++;
15744       str = ssp->name;
15745       if (str == NULL || str [0] != '\0') {
15746         //LCOV_EXCL_START
15747         //BasicCleanup removes environmental-sample text
15748         ValidErr(vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Environmental_sample qualifier should not have descriptive text");
15749         //LCOV_EXCL_STOP
15750       }
15751     } else if (ssp->subtype == SUBSRC_metagenomic) {
15752       num_metagenomic++;
15753       str = ssp->name;
15754       if (str == NULL || str [0] != '\0') {
15755         //LCOV_EXCL_START
15756         //BasicCleanup removes metagenomic text
15757         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Metagenomic qualifier should not have descriptive text");
15758         //LCOV_EXCL_STOP
15759       }
15760     } else if (ssp->subtype == SUBSRC_isolation_source) {
15761       is_iso_source = TRUE;
15762     } else if (ssp->subtype == SUBSRC_sex) {
15763       is_sex = TRUE;
15764       str = ssp->name;
15765       if (StringHasNoText (str)) {
15766         str = "?";
15767       }
15768       sev = SEV_WARNING;
15769       if (IsGenomicPipeline (vsp)) {
15770         sev = SEV_ERROR;
15771       }
15772       if (isAnimal || isPlant) {
15773         /* always allow /sex, but now check values */
15774         if (! IsValidSexValue (str)) {
15775           ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_BioSourceInconsistency, "Invalid value (%s) for /sex qualifier", str);
15776         }
15777       } else if (isViral) {
15778         ValidErr (vsp, sev, ERR_SEQ_DESCR_BioSourceInconsistency, "Virus has unexpected Sex qualifier");
15779       } else if (isBacteria || isArchaea || isFungal) {
15780         ValidErr (vsp, sev, ERR_SEQ_DESCR_BioSourceInconsistency, "Unexpected use of /sex qualifier");
15781       } else {
15782         if (! IsValidSexValue (str)) {
15783           /* otherwise expect male or female, or a few others */
15784           ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_BioSourceInconsistency, "Invalid value (%s) for /sex qualifier", str);
15785         }
15786       }
15787     } else if (ssp->subtype == SUBSRC_mating_type) {
15788       is_mating_type = TRUE;
15789       str = ssp->name;
15790       if (isAnimal || isPlant || isViral) {
15791         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Unexpected use of /mating_type qualifier");
15792       } else if (IsValidSexValue (str)) {
15793         /* complain if one of the values that should go in /sex */
15794         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Unexpected use of /mating_type qualifier");
15795       }
15796     } else if (ssp->subtype == SUBSRC_plasmid_name) {
15797       num_plasmid_name++;
15798       if (biop->genome != GENOME_plasmid) {
15799         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Plasmid subsource but not plasmid location");
15800       }
15801     } else if (ssp->subtype == SUBSRC_plastid_name) {
15802       if (StringCmp (ssp->name, "chloroplast") == 0) {
15803         if (biop->genome != GENOME_chloroplast) {
15804           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Plastid name subsource chloroplast but not chloroplast location");
15805         }
15806       } else if (StringCmp (ssp->name, "chromoplast") == 0) {
15807         if (biop->genome != GENOME_chromoplast) {
15808           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Plastid name subsource chromoplast but not chromoplast location");
15809         }
15810       } else if (StringCmp (ssp->name, "kinetoplast") == 0) {
15811         if (biop->genome != GENOME_kinetoplast) {
15812           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Plastid name subsource kinetoplast but not kinetoplast location");
15813         }
15814       } else if (StringCmp (ssp->name, "plastid") == 0) {
15815         if (biop->genome != GENOME_plastid) {
15816           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Plastid name subsource plastid but not plastid location");
15817         }
15818       } else if (StringCmp (ssp->name, "apicoplast") == 0) {
15819         if (biop->genome != GENOME_apicoplast) {
15820           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Plastid name subsource apicoplast but not apicoplast location");
15821         }
15822       } else if (StringCmp (ssp->name, "leucoplast") == 0) {
15823         if (biop->genome != GENOME_leucoplast) {
15824           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Plastid name subsource leucoplast but not leucoplast location");
15825         }
15826       } else if (StringCmp (ssp->name, "proplastid") == 0) {
15827         if (biop->genome != GENOME_proplastid) {
15828           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Plastid name subsource proplastid but not proplastid location");
15829         }
15830       } else if (StringCmp (ssp->name, "chromatophore") == 0) {
15831         if (biop->genome != GENOME_chromatophore) {
15832           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Plastid name subsource chromatophore but not chromatophore location");
15833         }
15834       } else {
15835         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Plastid name subsource contains unrecognized value");
15836       }
15837     } else if (ssp->subtype == SUBSRC_collection_date) {
15838       num_collection_dates++;
15839       if (! CollectionDateIsValid (ssp->name)) {
15840         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadCollectionDate, "Collection_date format is not in DD-Mmm-YYYY format");
15841       }
15842       else if (CollectionDateIsInTheFuture (ssp->name)) {
15843         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadCollectionDate, "Collection_date is in the future");
15844       }
15845       else if (! CollectionDatesInOrder (ssp->name)) {
15846         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadCollectionDate, "Collection_dates are out of order");
15847       }
15848     } else if (ssp->subtype == SUBSRC_fwd_primer_seq) {
15849       //LCOV_EXCL_START
15850       //BasicCleanup converts to biosource.pcr_primers structure
15851       num_fwd_primer_seq++;
15852       has_fwd_pcr_seq = TRUE;
15853       if (! PrimerSeqIsValid (vsp, ssp->name, &badch)) {
15854         if (badch < ' ' || badch > '~') {
15855           badch = '?';
15856         }
15857         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadPCRPrimerSequence,
15858                   "PCR forward primer sequence format is incorrect, first bad character is '%c'", (char) badch);
15859       }
15860       /*
15861       if (PrimerSeqHasDuplicates (ssp->name)) {
15862         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_DuplicatePCRPrimerSequence,
15863                   "PCR forward primer sequence has duplicates");
15864       }
15865       */
15866       //LCOV_EXCL_STOP
15867     } else if (ssp->subtype == SUBSRC_rev_primer_seq) {
15868       //LCOV_EXCL_START
15869       //BasicCleanup converts to biosource.pcr_primers structure
15870       num_rev_primer_seq++;
15871       has_rev_pcr_seq = TRUE;
15872       if (! PrimerSeqIsValid (vsp, ssp->name, &badch)) {
15873         if (badch < ' ' || badch > '~') {
15874           badch = '?';
15875         }
15876         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadPCRPrimerSequence,
15877                   "PCR reverse primer sequence format is incorrect, first bad character is '%c'", (char) badch);
15878       }
15879       /*
15880       if (PrimerSeqHasDuplicates (ssp->name)) {
15881         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_DuplicatePCRPrimerSequence,
15882                   "PCR reverse primer sequence has duplicates");
15883       }
15884       */
15885       //LCOV_EXCL_STOP
15886     } else if (ssp->subtype == SUBSRC_fwd_primer_name) {
15887       //LCOV_EXCL_START
15888       //BasicCleanup converts to biosource.pcr_primers structure
15889       num_fwd_primer_name++;
15890       if (StringLen (ssp->name) > 10 && PrimerSeqIsValid (vsp, ssp->name, &badch)) {
15891         if (badch < ' ' || badch > '~') {
15892           badch = '?';
15893         }
15894         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadPCRPrimerName, "PCR primer name appears to be a sequence");
15895       }
15896       has_pcr_name = TRUE;
15897       //LCOV_EXCL_STOP
15898     } else if (ssp->subtype == SUBSRC_rev_primer_name) {
15899       //LCOV_EXCL_START
15900       //BasicCleanup converts to biosource.pcr_primers structure
15901       num_rev_primer_name++;
15902       if (StringLen (ssp->name) > 10 && PrimerSeqIsValid (vsp, ssp->name, &badch)) {
15903         if (badch < ' ' || badch > '~') {
15904           badch = '?';
15905         }
15906         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadPCRPrimerName, "PCR primer name appears to be a sequence");
15907       }
15908       has_pcr_name = TRUE;
15909       //LCOV_EXCL_STOP
15910     } else if (ssp->subtype == SUBSRC_lat_lon) {
15911       num_lat_lon++;
15912       if (lat_lon != NULL) {
15913         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_LatLonProblem, "Multiple lat_lon on BioSource");
15914       }
15915       lat_lon = ssp->name;
15916       ValidateLatLon (vsp, lat_lon);
15917     } else if (ssp->subtype == SUBSRC_altitude) {
15918       num_altitude++;
15919       if (! AltitudeIsValid (ssp->name)) {
15920         ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_BadAltitude, "bad altitude qualifier value %s", ssp->name);
15921       }
15922     } else if (ssp->subtype == SUBSRC_frequency) {
15923       str = ssp->name;
15924       if (StringDoesHaveText (str)) {
15925         bad_frequency = FALSE;
15926         if (StringCmp (str, "0") == 0) {
15927           /* ignore */
15928         } else if (StringCmp (str, "1") == 0) {
15929           ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_BioSourceInconsistency, "bad frequency qualifier value %s", ssp->name);
15930         } else {
15931           ch = *str;
15932           if (ch == '0') {
15933             str++;
15934             ch = *str;
15935           }
15936           if (ch == '.') {
15937             str++;
15938             ch = *str;
15939             if (! IS_DIGIT (ch)) {
15940               bad_frequency = TRUE;
15941             } else {
15942               while (ch != '\0') {
15943                 if (! IS_DIGIT (ch)) {
15944                   bad_frequency = TRUE;
15945                 }
15946                 str++;
15947                 ch = *str;
15948               }
15949             }
15950           } else {
15951             bad_frequency = TRUE;
15952           }
15953         }
15954         if (bad_frequency) {
15955           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "bad frequency qualifier value %s", ssp->name);
15956         }
15957       }
15958     }
15959 
15960     if (isViral && IsUnexpectedViralSubSourceQualifier(ssp->subtype)) {
15961       ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Virus has unexpected %s qualifier", GetSubsourceQualName (ssp->subtype));
15962     }
15963     ssp = ssp->next;
15964   }
15965 
15966   if (biop->genome == GENOME_plasmid) {
15967     if (!num_plasmid_name) {
15968       ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency,
15969                 "Plasmid location set but plasmid name missing. Add a plasmid source modifier with the plasmid name. Use unnamed if the name is not known.");
15970     }
15971   }
15972 
15973   if (num_country > 1) {
15974     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceQualifiers, "Multiple country qualifiers present");
15975   }
15976   if (num_lat_lon > 1) {
15977     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceQualifiers, "Multiple lat_lon qualifiers present");
15978   }
15979   if (num_altitude > 1) {
15980     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceQualifiers, "Multiple altitude qualifiers present");
15981   }
15982   //LCOV_EXCL_START
15983   //BasicCleanup converts these to Biosource.pcr_primers structure
15984   if (num_fwd_primer_seq > 1) {
15985     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceQualifiers, "Multiple fwd_primer_seq qualifiers present");
15986   }
15987   if (num_rev_primer_seq > 1) {
15988     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceQualifiers, "Multiple rev_primer_seq qualifiers present");
15989   }
15990   if (num_fwd_primer_name > 1) {
15991     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceQualifiers, "Multiple fwd_primer_name qualifiers present");
15992   }
15993   if (num_rev_primer_name > 1) {
15994     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceQualifiers, "Multiple rev_primer_name qualifiers present");
15995   }
15996   //LCOV_EXCL_STOP
15997   if (num_plasmid_name > 1) {
15998     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceQualifiers, "Multiple plasmid_name qualifiers present");
15999   }
16000   //LCOV_EXCL_START
16001   //BasicCleanup removes duplicates
16002   if (num_germline > 1) {
16003     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceQualifiers, "Multiple germline qualifiers present");
16004   }
16005   if (num_rearranged > 1) {
16006     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceQualifiers, "Multiple rearranged qualifiers present");
16007   }
16008   if (num_transgenic > 1) {
16009     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceQualifiers, "Multiple transgenic qualifiers present");
16010   }
16011   if (num_metagenomic > 1) {
16012     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceQualifiers, "Multiple metagenomic qualifiers present");
16013   }
16014   if (num_env_sample > 1) {
16015     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceQualifiers, "Multiple environtal_sample qualifiers present");
16016   }
16017   //LCOV_EXCL_STOP
16018 
16019   if (countryname != NULL && lat_lon != NULL) {
16020     csp = GetLatLonCountryData ();
16021     if (csp != NULL) {
16022       countryname = RepairCountryName (countryname, cbuf);
16023       NewerValidateCountryLatLon (vsp, gcp, countryname, lat_lon);
16024     }
16025   }
16026 
16027   //LCOV_EXCL_START
16028   //BasicCleanup converts these to Biosource.pcr_primers structure
16029   if (has_pcr_name) {
16030     if ((! has_fwd_pcr_seq) || (! has_rev_pcr_seq)) {
16031       /*
16032       ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadPCRPrimerSequence, "PCR primer has name but not both sequences");
16033       */
16034     }
16035   } else if (has_fwd_pcr_seq || has_rev_pcr_seq) {
16036     if (! (has_fwd_pcr_seq && has_rev_pcr_seq)) {
16037       ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadPCRPrimerSequence, "PCR primer does not have both sequences");
16038     }
16039   }
16040 
16041   pset = ParsePCRSet (biop);
16042   if (pset != NULL) {
16043     pset = ValNodeSort (pset, SortVnpByPCRSetSeq);
16044     primer_len_before = ValNodeLen (pset);
16045     pset = UniqueVnpByPCRSetSeq (pset);
16046     primer_len_after = ValNodeLen (pset);
16047     FreePCRSet (pset);
16048     if (primer_len_before != primer_len_after) {
16049       ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_DuplicatePCRPrimerSequence,
16050                 "PCR primer sequence has duplicates");
16051     }
16052   }
16053   //LCOV_EXCL_STOP
16054 
16055   for (prp = biop->pcr_primers; prp != NULL; prp = prp->next) {
16056 
16057     for (ppp = prp->forward; ppp != NULL; ppp = ppp->next) {
16058       if (StringDoesHaveText (ppp->seq) && (! PrimerSeqIsValid (vsp, ppp->seq, &badch))) {
16059         if (badch < ' ' || badch > '~') {
16060           badch = '?';
16061         }
16062         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadPCRPrimerSequence,
16063                   "PCR forward primer sequence format is incorrect, first bad character is '%c'", (char) badch);
16064       }
16065       if (StringLen (ppp->name) > 10 && PrimerSeqIsValid (vsp, ppp->name, &badch)) {
16066         if (badch < ' ' || badch > '~') {
16067           badch = '?';
16068         }
16069         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadPCRPrimerName, "PCR forward primer name appears to be a sequence");
16070       }
16071     }
16072 
16073     for (ppp = prp->reverse; ppp != NULL; ppp = ppp->next) {
16074       if (StringDoesHaveText (ppp->seq) && (! PrimerSeqIsValid (vsp, ppp->seq, &badch))) {
16075         if (badch < ' ' || badch > '~') {
16076           badch = '?';
16077         }
16078         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadPCRPrimerSequence,
16079                   "PCR reverse primer sequence format is incorrect, first bad character is '%c'", (char) badch);
16080       }
16081       if (StringLen (ppp->name) > 10 && PrimerSeqIsValid (vsp, ppp->name, &badch)) {
16082         if (badch < ' ' || badch > '~') {
16083           badch = '?';
16084         }
16085         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadPCRPrimerName, "PCR reverse primer name appears to be a sequence");
16086       }
16087     }
16088   }
16089 
16090   if (num_germline && num_rearranged) {
16091     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Germline and rearranged should not both be present");
16092   }
16093   if (num_transgenic && num_env_sample) {
16094     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Transgenic and environmental sample should not both be present");
16095   }
16096   if (num_metagenomic && (! num_env_sample)) {
16097     ValidErr (vsp, SEV_REJECT, ERR_SEQ_DESCR_BioSourceInconsistency, "Metagenomic should also have environmental sample annotated");
16098   }
16099   if (is_sex && is_mating_type) {
16100     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Sex and mating type should not both be present");
16101   }
16102 
16103   if (biop->org != NULL
16104       && biop->org->orgname != NULL
16105       && StringISearch (biop->org->orgname->lineage, "metagenomes") != NULL
16106       && !num_metagenomic) {
16107     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "If metagenomes appears in lineage, BioSource should have metagenomic qualifier");
16108   }
16109   if (chromcount > 1) {
16110     if (chromconf) {
16111       ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleChromosomes, "Multiple conflicting chromosome qualifiers");
16112     } else {
16113         //LCOV_EXCL_START
16114         //cleanup removes identical chromosome subsources
16115       ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleChromosomes, "Multiple identical chromosome qualifiers");
16116       //LCOV_EXCL_STOP
16117     }
16118   }
16119   orp = biop->org;
16120   if (orp != NULL) {
16121     /*
16122     if (StringICmp (orp->taxname, "Human immunodeficiency virus") == 0 ||
16123         StringICmp (orp->taxname, "Human immunodeficiency virus 1") == 0 ||
16124         StringICmp (orp->taxname, "Human immunodeficiency virus 2") == 0) {
16125       ValidateLocationForHIV (vsp, biop);
16126     } else */
16127     if (StringICmp (orp->taxname, "uncultured bacterium") == 0) {
16128       bsp = NULL;
16129       if (sfp != NULL) {
16130         bsp = BioseqFindFromSeqLoc (sfp->location);
16131       } else if (sdp != NULL && sdp->extended != 0) {
16132         ovp = (ObjValNodePtr) sdp;
16133         if (ovp->idx.parenttype == OBJ_BIOSEQ) {
16134           bsp = (BioseqPtr) ovp->idx.parentptr;
16135         } else if (ovp->idx.parenttype == OBJ_BIOSEQSET) {
16136           bssp = (BioseqSetPtr) ovp->idx.parentptr;
16137           if (bssp != NULL) {
16138             sep = bssp->seqentry;
16139             if (sep != NULL) {
16140               sep = FindNthBioseq (sep, 1);
16141               if (sep != NULL && IS_Bioseq (sep)) {
16142                 bsp = (BioseqPtr) sep->data.ptrvalue;
16143               }
16144             }
16145           }
16146         }
16147       }
16148       if (bsp != NULL && bsp->length >= 10000 && biop->genome != GENOME_plasmid) {
16149         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Uncultured bacterium sequence length is suspiciously high");
16150       }
16151     }
16152     if (StringNICmp (orp->taxname, "uncultured ", 11) == 0) {
16153       if (! num_env_sample) {
16154         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Uncultured should also have /environmental_sample");
16155       }
16156     }
16157     str = orp->taxname;
16158     if (StringDoesHaveText (str)) {
16159       if (UnbalancedParentheses (str)) {
16160           ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_UnbalancedParentheses,
16161                     "Unbalanced parentheses in taxname '%s'", str);
16162       }
16163       if (StringHasSgml (vsp, str)) {
16164         ValidErr (vsp, SEV_WARNING, ERR_GENERIC_SgmlPresentInText, "taxname %s has SGML", str);
16165       }
16166 
16167       p = StringRChr(str, ' ');
16168       if(p != NULL &&
16169          (StringICmp(p + 1, "sp.") == 0 || StringICmp(p + 1, "sp") == 0) &&
16170          StringNICmp(str, "uncultured ", 11) != 0 &&
16171          StringICmp(str, "Haemoproteus sp.") != 0 &&
16172          s_IfContains(str, "endosymbiont ") == FALSE &&
16173          s_IfContains(str, "symbiont ") == FALSE)
16174         ValidErr (vsp, SEV_INFO,
16175                   ERR_SEQ_DESCR_OrganismIsUndefinedSpecies, "Organism '%s' is undefined species and does not have a specific identifier.",
16176                   str);
16177     }
16178   }
16179 
16180   for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
16181     if (ssp->subtype == SUBSRC_germline ||
16182               ssp->subtype == SUBSRC_rearranged ||
16183               ssp->subtype == SUBSRC_transgenic ||
16184               ssp->subtype == SUBSRC_environmental_sample ||
16185               ssp->subtype == SUBSRC_metagenomic) continue;
16186     str = ssp->name;
16187     if (StringHasNoText (str)) continue;
16188     if (UnbalancedParentheses (str)) {
16189         ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_UnbalancedParentheses,
16190                   "Unbalanced parentheses in subsource '%s'", str);
16191     }
16192     if (StringHasSgml (vsp, str)) {
16193       ValidErr (vsp, SEV_WARNING, ERR_GENERIC_SgmlPresentInText, "subsource %s has SGML", str);
16194     }
16195   }
16196 
16197   if (orp == NULL || (StringHasNoText (orp->taxname) && StringHasNoText (orp->common))) {
16198     ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_NoOrgFound, "No organism name has been applied to this Bioseq.  Other qualifiers may exist.");
16199   }
16200   if (orp == NULL) {
16201     //LCOV_EXCL_START
16202     //Not valid ASN.1
16203     if (num_env_sample && (! is_iso_source) && (! is_specific_host)) {
16204       ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Environmental sample should also have isolation source or specific host annotated");
16205     }
16206     return;
16207     //LCOV_EXCL_STOP
16208   }
16209   onp = orp->orgname;
16210   if (onp == NULL || StringHasNoText (onp->lineage)) {
16211     if (! vsp->seqSubmitParent && vsp->indexerVersion) { /* suppress when validator run from tbl2asn or when not indexer version */
16212       sev = SEV_ERROR;
16213       if (vsp->is_refseq_in_sep) {
16214         if (has_taxon) {
16215           sev = SEV_REJECT;
16216         }
16217       }
16218       if (vsp->is_embl_ddbj_in_sep) {
16219         sev = SEV_WARNING;
16220       }
16221       if (! vsp->is_wp_in_sep) {
16222         ValidErr (vsp, sev, ERR_SEQ_DESCR_MissingLineage, "No lineage for this BioSource.");
16223       }
16224     }
16225   } else {
16226     if (biop->genome == GENOME_kinetoplast) {
16227       if (StringStr (onp->lineage, "Kinetoplastida") == 0) {
16228         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadOrganelle, "Only Kinetoplastida have kinetoplasts");
16229       }
16230     } else if (biop->genome == GENOME_nucleomorph) {
16231       if (StringStr (onp->lineage, "Chlorarachniophyceae") == 0 && StringStr (onp->lineage, "Cryptophyta") == 0) {
16232         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadOrganelle, "Only Chlorarachniophyceae and Cryptophyta have nucleomorphs");
16233       }
16234     } else if (biop->genome == GENOME_macronuclear) {
16235       if (StringStr (onp->lineage, "Ciliophora") == 0) {
16236         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadOrganelle, "Only Ciliophora have macronuclear locations");
16237       }
16238     }
16239 
16240     /* warn if bacteria has organelle location */
16241     if (StringCmp (onp->div, "BCT") == 0 || StringCmp (onp->div, "VRL") == 0) {
16242       if (biop->genome == GENOME_unknown
16243           || biop->genome == GENOME_genomic
16244           || biop->genome == GENOME_plasmid
16245           || biop->genome == GENOME_chromosome
16246           || (biop->genome == GENOME_extrachrom && StringCmp (onp->div, "BCT") == 0)
16247           || (biop->genome == GENOME_proviral && StringCmp (onp->div, "VRL") == 0)) {
16248         /* it's ok */
16249       } else {
16250         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Bacterial or viral source should not have organelle location");
16251       }
16252     }
16253 
16254     if (StringCmp (onp->div, "ENV") == 0 && (! num_env_sample)) {
16255       ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_BioSourceInconsistency, "BioSource with ENV division is missing environmental sample subsource");
16256     }
16257   }
16258   for (db = orp->db; db != NULL; db = db->next) {
16259     dbt = (DbtagPtr) db->data.ptrvalue;
16260     if (dbt != NULL) {
16261       if (last_db != NULL) {
16262         if (StringICmp (dbt->db, last_db) == 0) {
16263           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceDbTagConflict, "BioSource uses db %s multiple times", last_db);
16264         }
16265       }
16266       last_db = dbt->db;
16267     }
16268   }
16269   if (onp != NULL) {
16270     omp = onp->mod;
16271     varietyOK = FALSE;
16272     while (omp != NULL) {
16273       str = omp->subname;
16274       if (StringCmp (str, "N/A") == 0 || StringCmp (str, "Missing") == 0) {
16275         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Orgmod name should not be %s", str);
16276       }
16277       if (omp->subtype == 0 || omp->subtype == 1) {
16278         //LCOV_EXCL_START
16279         //Not valid ASN.1
16280         ValidErr (vsp, SEV_REJECT, ERR_SEQ_DESCR_BadOrgMod, "Unknown orgmod subtype %d", (int) (omp->subtype));
16281         //LCOV_EXCL_STOP
16282       } else if (omp->subtype == ORGMOD_strain) {
16283         str = omp->subname;
16284         if (StringNCmp (str, "subsp. ", 7) == 0) {
16285           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Orgmod.strain should not start with subsp.");
16286         } else if (StringNCmp (str, "serovar ", 8) == 0) {
16287           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Orgmod.strain should not start with serovar");
16288         }
16289         if (has_strain) {
16290           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadOrgMod, "Multiple strain qualifiers on the same BioSource");
16291         }
16292         has_strain = TRUE;
16293       } else if (omp->subtype == ORGMOD_isolate) {
16294         has_isolate = TRUE;
16295       } else if (omp->subtype == ORGMOD_serovar) {
16296         str = omp->subname;
16297         if (StringNCmp (str, "subsp. ", 7) == 0) {
16298           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Orgmod.serovar should not start with subsp.");
16299         } else if (StringNCmp (str, "strain ", 7) == 0) {
16300           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Orgmod.serovar should not start with strain");
16301         }
16302       } else if (omp->subtype == ORGMOD_variety) {
16303         if ((StringHasNoText (onp->div) || StringICmp (onp->div, "PLN") != 0) &&
16304             StringStr (onp->lineage, "Cyanobacteria") == 0 &&
16305             StringStr (onp->lineage, "Myxogastria") == 0 &&
16306             StringStr (onp->lineage, "Oomycetes") == 0) {
16307           if (! has_taxon) {
16308             ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadOrgMod, "Orgmod variety should only be in plants, fungi, or cyanobacteria");
16309           }
16310         }
16311         varietyOK = ValidateOrgModInTaxName (vsp, omp, orp->taxname, varietyOK);
16312       } else if (omp->subtype == ORGMOD_nat_host) {
16313         is_specific_host = TRUE;
16314         if (StringICmp (omp->subname, orp->taxname) == 0) {
16315           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadOrgMod, "Specific host is identical to taxname");
16316         }
16317       } else if (omp->subtype == ORGMOD_other) {
16318         ValidateSourceQualTags (vsp, gcp, biop, omp->subname);
16319       } else if (omp->subtype == ORGMOD_biovar ||
16320                  omp->subtype == ORGMOD_forma ||
16321                  omp->subtype == ORGMOD_forma_specialis ||
16322                  omp->subtype == ORGMOD_pathovar) {
16323         ValidateOrgModInTaxName (vsp, omp, orp->taxname, varietyOK);
16324       } else if (omp->subtype == ORGMOD_sub_species) {
16325         str = omp->subname;
16326         if (StringStr (str, "subsp. ") != NULL) {
16327           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Orgmod.sub-species should not contain subsp.");
16328         }
16329         ValidateOrgModInTaxName (vsp, omp, orp->taxname, varietyOK);
16330       } else if (omp->subtype == ORGMOD_specimen_voucher) {
16331         num_specimen_voucher++;
16332         ValidateOrgModVoucher (vsp, omp);
16333       } else if (omp->subtype == ORGMOD_culture_collection) {
16334         num_culture_collection++;
16335         ValidateOrgModVoucher (vsp, omp);
16336       } else if (omp->subtype == ORGMOD_bio_material) {
16337         num_bio_material++;
16338         ValidateOrgModVoucher (vsp, omp);
16339       } else if (omp->subtype == ORGMOD_metagenome_source) {
16340         has_metagenome_source = TRUE;
16341       } else if (omp->subtype == ORGMOD_type_material) {
16342         if (! TypeMaterialIsValid (omp->subname)) {
16343           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadOrgMod, "Bad value for type_material");
16344         }
16345       } else if (omp->subtype == ORGMOD_common) {
16346         //LCOV_EXCL_START
16347         //don't care
16348         if (StringICmp (omp->subname, orp->common) == 0) {
16349           /*
16350           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadOrgMod, "OrgMod common is identical to Org-ref common");
16351           */
16352         }
16353         //LCOV_EXCL_STOP
16354       } else if (omp->subtype == ORGMOD_synonym) {
16355         synonym = omp->subname;
16356       } else if (omp->subtype == ORGMOD_gb_synonym) {
16357         gb_synonym = omp->subname;
16358       }
16359 
16360       if (isViral && IsUnexpectedViralOrgModQualifier(omp->subtype)) {
16361         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Virus has unexpected %s qualifier", GetOrgModQualName (omp->subtype));
16362       }
16363 
16364       omp = omp->next;
16365     }
16366 
16367     for (omp = onp->mod; omp != NULL; omp = omp->next) {
16368       if (omp->subtype != ORGMOD_specimen_voucher &&
16369           omp->subtype != ORGMOD_culture_collection &&
16370           omp->subtype != ORGMOD_bio_material) continue;
16371       nxtomp = omp->next;
16372       if (nxtomp == NULL) continue;
16373       inst1 = NULL;
16374       inst2 = NULL;
16375       id1 = NULL;
16376       id2 = NULL;
16377       coll1 = NULL;
16378       coll2 = NULL;
16379       StringNCpy_0 (buf1, omp->subname, sizeof (buf1));
16380       StringNCpy_0 (buf2, nxtomp->subname, sizeof (buf2));
16381       if (StringChr (buf1, ':') == NULL || StringChr (buf2, ':') == NULL) continue;
16382       if (! ParseStructuredVoucher (buf1, &inst1, &id1)) continue;
16383       if (! ParseStructuredVoucher (buf2, &inst2, &id2)) continue;
16384       if (inst1 == NULL || inst2 == NULL) continue;
16385       if (StringNICmp (inst1, "personal", 8) == 0) continue;
16386       if (StringNICmp (inst2, "personal", 8) == 0) continue;
16387       coll1 = StringChr (inst1, ':');
16388       if (coll1 != NULL) {
16389         *coll1 = '\0';
16390         coll1++;
16391       }
16392       coll2 = StringChr (inst2, ':');
16393       if (coll2 != NULL) {
16394         *coll2 = '\0';
16395         coll2++;
16396       }
16397       if (StringICmp (inst1, inst2) != 0) continue;
16398       if (omp->subtype != nxtomp->subtype) continue;
16399       if (StringCmp (coll1, "DNA") == 0 || StringCmp (coll2, "DNA") == 0) continue;
16400       if (coll1 != NULL && coll2 != NULL && StringICmp (coll1, coll2) == 0) {
16401         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceVouchers, "Multiple vouchers with same institution:collection");
16402       } else {
16403         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceVouchers, "Multiple vouchers with same institution");
16404       }
16405     }
16406   }
16407 
16408   if (onp != NULL) {
16409     for (omp = onp->mod; omp != NULL; omp = omp->next) {
16410       str = omp->subname;
16411       if (StringHasNoText (str)) continue;
16412       if (UnbalancedParentheses (str)) {
16413         if (omp->subtype == ORGMOD_old_name) continue;
16414         ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_UnbalancedParentheses,
16415                   "Unbalanced parentheses in orgmod '%s'", str);
16416       }
16417       if (StringHasSgml (vsp, str)) {
16418         ValidErr (vsp, SEV_WARNING, ERR_GENERIC_SgmlPresentInText, "orgmod %s has SGML", str);
16419       }
16420     }
16421   }
16422 
16423   if (vsp->indexerVersion) {
16424     if (isBacteria && isBioSample) {
16425       if ( ! has_strain && ! has_isolate && ! num_env_sample ) {
16426         ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_BioSourceInconsistency, "Bacteria should have strain or isolate or environmental sample");
16427       }
16428     }
16429   }
16430 
16431   /*
16432   if (num_bio_material > 1) {
16433     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceQualifiers, "Multiple bio_material qualifiers present");
16434   }
16435   if (num_culture_collection > 1) {
16436     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceQualifiers, "Multiple culture_collection qualifiers present");
16437   }
16438   if (num_specimen_voucher > 1) {
16439     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceQualifiers, "Multiple specimen_voucher qualifiers present");
16440   }
16441   */
16442   if (num_collection_dates > 1) {
16443     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleSourceQualifiers, "Multiple collection_date qualifiers present");
16444   }
16445   if (num_env_sample && has_strain) {
16446     ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_BioSourceInconsistency, "Strain should not be present in an environmental sample");
16447   }
16448   if (num_env_sample && (! is_iso_source) && (! is_specific_host)) {
16449     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Environmental sample should also have isolation source or specific host annotated");
16450   }
16451   if (has_metagenome_source && (! num_metagenomic)) {
16452     ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_BioSourceInconsistency, "Metagenome source should also have metagenomic qualifier");
16453   }
16454   if (StringDoesHaveText (synonym) && StringDoesHaveText (gb_synonym)) {
16455     if (StringICmp (synonym, gb_synonym) == 0) {
16456       ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "OrgMod synonym is identical to OrgMod gb_synonym");
16457     }
16458   }
16459 
16460   for (db = orp->db; db != NULL; db = db->next) {
16461     id = -1;
16462     dbt = (DbtagPtr) db->data.ptrvalue;
16463     if (dbt != NULL && dbt->db != NULL) {
16464 
16465       if (DbxrefIsValid (dbt->db, &is_rf, &is_sc, &is_bc, &good)) {
16466         if (is_bc) {
16467           if (StringHasNoText (good)) {
16468             good = "?";
16469           }
16470           if (is_sc) {
16471             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref,
16472                       "Illegal db_xref type %s (%s), legal capitalization is %s", dbt->db, ValGetDbtagStr (dbt, buf), good);
16473           } else {
16474             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref,
16475                       "Illegal db_xref type %s (%s), legal capitalization is %s, but should not be used on an OrgRef",
16476                       dbt->db, ValGetDbtagStr (dbt, buf), good);
16477           }
16478         } else if (is_rf) {
16479           if (vsp->is_refseq_in_sep || vsp->is_gps_in_sep) {
16480             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref,
16481                       "RefSeq-specific db_xref type %s (%s) should not be used on an OrgRef", dbt->db, ValGetDbtagStr (dbt, buf));
16482           } else {
16483             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref,
16484                       "RefSeq-specific db_xref type %s (%s) should not be used on a non-RefSeq OrgRef", dbt->db, ValGetDbtagStr (dbt, buf));
16485           }
16486         } else if (is_sc) {
16487         } else {
16488           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref,
16489                     "db_xref type %s (%s) should not be used on an OrgRef", dbt->db, ValGetDbtagStr (dbt, buf));
16490         }
16491       } else {
16492         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref, "Illegal db_xref type %s (%s)", dbt->db, ValGetDbtagStr (dbt, buf));
16493       }
16494 
16495       if (StringDoesHaveText (dbt->db)) {
16496         if (StringHasSgml (vsp, dbt->db)) {
16497           ValidErr (vsp, SEV_WARNING, ERR_GENERIC_SgmlPresentInText, "dbxref database %s has SGML", dbt->db);
16498         }
16499       }
16500 
16501       oip = dbt->tag;
16502       if (oip != NULL && StringDoesHaveText (oip->str)) {
16503         if (StringHasSgml (vsp, oip->str)) {
16504           ValidErr (vsp, SEV_WARNING, ERR_GENERIC_SgmlPresentInText, "dbxref value %s has SGML", oip->str);
16505         }
16506       }
16507 
16508       /*
16509       dbxerr = NULL;
16510       dbvalid = IsDbxrefValid (dbt->db, NULL, orp, FALSE, &dbxerr);
16511       if (dbxerr != NULL) {
16512         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref, dbxerr);
16513         dbxerr = MemFree (dbxerr);
16514       }
16515       */
16516     }
16517   }
16518 
16519   if (! vsp->indexerVersion) return;
16520 
16521   if (has_taxon) return;
16522 
16523   if (! vsp->seqSubmitParent) { /* suppress when validator run from tbl2asn */
16524     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_NoTaxonID, "BioSource is missing taxon ID");
16525   }
16526 }
16527 
IsXr(ValNodePtr sdp)16528 static Boolean IsXr (ValNodePtr sdp)
16529 
16530 {
16531   BioseqPtr      bsp;
16532   ObjValNodePtr  ovp;
16533   SeqIdPtr       sip;
16534   TextSeqIdPtr   tsip;
16535 
16536   if (sdp->extended == 0) return FALSE;
16537   ovp = (ObjValNodePtr) sdp;
16538   if (ovp->idx.parenttype != OBJ_BIOSEQ) return FALSE;
16539   bsp = (BioseqPtr) ovp->idx.parentptr;
16540   if (bsp == NULL) return FALSE;
16541   for (sip = bsp->id; sip != NULL; sip = sip->next) {
16542     if (sip->choice != SEQID_OTHER) continue;
16543     tsip = (TextSeqIdPtr) sip->data.ptrvalue;
16544     if (tsip == NULL) continue;
16545     if (StringNICmp (tsip->accession, "XR_", 3) == 0) return TRUE;
16546   }
16547   return FALSE;
16548 }
16549 
IsSynthetic(BioseqPtr bsp)16550 static Boolean IsSynthetic (BioseqPtr bsp)
16551 
16552 {
16553   BioSourcePtr       biop;
16554   SeqMgrDescContext  dcontext;
16555   OrgNamePtr         onp;
16556   OrgRefPtr          orp;
16557   SeqDescrPtr        sdp;
16558 
16559   sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
16560   if (sdp == NULL) return FALSE;
16561   biop = (BioSourcePtr) sdp->data.ptrvalue;
16562   if (biop == NULL) return FALSE;
16563   if (biop->origin == 5) return TRUE;
16564   orp = biop->org;
16565   if (orp == NULL) return FALSE;
16566   onp = orp->orgname;
16567   if (onp == NULL) return FALSE;
16568   if (StringICmp (onp->div, "SYN") == 0) return TRUE;
16569   return FALSE;
16570 }
16571 
IsMicroRNA(BioseqPtr bsp)16572 static Boolean IsMicroRNA (BioseqPtr bsp)
16573 
16574 {
16575   SeqMgrFeatContext  fcontext;
16576   RnaRefPtr          rrp;
16577   SeqFeatPtr         sfp;
16578   CharPtr            str;
16579 
16580   sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_otherRNA, &fcontext);
16581   while (sfp != NULL) {
16582     if (sfp->data.choice == SEQFEAT_RNA) {
16583       rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
16584       if (rrp != NULL && rrp->ext.choice == 1) {
16585         str = (CharPtr) rrp->ext.value.ptrvalue;
16586         if (StringStr (str, "microRNA") != NULL) return TRUE;
16587       }
16588     }
16589     sfp = SeqMgrGetNextFeature (bsp, sfp, 0, FEATDEF_otherRNA, &fcontext);
16590   }
16591   return FALSE;
16592 }
16593 
IsOtherDNA(BioseqPtr bsp)16594 static Boolean IsOtherDNA (BioseqPtr bsp)
16595 
16596 {
16597   SeqMgrDescContext  dcontext;
16598   MolInfoPtr         mip;
16599   SeqDescrPtr        sdp;
16600 
16601   sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
16602   if (sdp == NULL) return FALSE;
16603   mip = (MolInfoPtr) sdp->data.ptrvalue;
16604   if (mip == NULL) return FALSE;
16605   if (mip->biomol == 255) return TRUE;
16606   return FALSE;
16607 }
16608 
StringHasPMID(CharPtr str)16609 static Boolean StringHasPMID (CharPtr str)
16610 
16611 {
16612   Char     ch;
16613   Int2     numdigits = 0;
16614   CharPtr  ptr;
16615 
16616   if (StringHasNoText (str)) return FALSE;
16617 
16618   ptr = StringStr (str, "(PMID ");
16619   if (ptr == NULL) return FALSE;
16620 
16621   ptr += 6;
16622   ch = *ptr;
16623   while (ch != '\0') {
16624     if (ch == ')') {
16625       if (numdigits > 0) return TRUE;
16626       return FALSE;
16627     } else if (IS_DIGIT (ch)) {
16628       numdigits++;
16629     }
16630     ptr++;
16631     ch = *ptr;
16632   }
16633 
16634   return FALSE;
16635 }
16636 
16637 
HasStructuredCommentPrefix(UserObjectPtr uop)16638 static Boolean HasStructuredCommentPrefix (UserObjectPtr uop)
16639 {
16640   UserFieldPtr ufp;
16641 
16642   if (uop == NULL) {
16643     return FALSE;
16644   }
16645   for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
16646     if (ufp->label != NULL && StringCmp (ufp->label->str, "StructuredCommentPrefix") == 0) {
16647       return TRUE;
16648     }
16649   }
16650   return FALSE;
16651 }
16652 
BadBSFormat(CharPtr str)16653 static Boolean BadBSFormat (CharPtr str)
16654 
16655 {
16656   Char     ch;
16657   CharPtr  ptr;
16658 
16659   if (StringLen (str) < 5) return TRUE;
16660 
16661   ch = str [0];
16662   if (ch != 'S') return TRUE;
16663   ch = str [1];
16664   if (ch != 'A') return TRUE;
16665   ch = str [2];
16666   if (ch != 'M') return TRUE;
16667   ch = str [3];
16668   if (ch != 'E' && ch != 'N' && ch != 'D') return TRUE;
16669 
16670   ptr = str + 4;
16671 
16672   /* EBI alternative format */
16673   if (ch == 'E') {
16674     ch = *ptr;
16675     if (IS_ALPHA (ch)) {
16676       ptr++;
16677     }
16678   }
16679 
16680   ch = *ptr;
16681   while (ch != '\0') {
16682     if (! IS_DIGIT (ch)) return TRUE;
16683     ptr++;
16684     ch = *ptr;
16685   }
16686 
16687   return FALSE;
16688 }
16689 
BadAltBSFormat(CharPtr str)16690 static Boolean BadAltBSFormat (CharPtr str)
16691 
16692 {
16693   Char     ch;
16694   CharPtr  ptr;
16695 
16696   if (StringLen (str) < 9) return TRUE;
16697 
16698   ch = str [0];
16699   if (ch != 'S') return TRUE;
16700   ch = str [1];
16701   if (ch != 'R') return TRUE;
16702   ch = str [2];
16703   if (ch != 'S') return TRUE;
16704 
16705   ptr = str + 3;
16706   ch = *ptr;
16707   while (ch != '\0') {
16708     if (! IS_DIGIT (ch)) return TRUE;
16709     ptr++;
16710     ch = *ptr;
16711   }
16712 
16713   return FALSE;
16714 }
16715 
BadSRAFormat(CharPtr str)16716 static Boolean BadSRAFormat (CharPtr str)
16717 
16718 {
16719   Char     ch;
16720   CharPtr  ptr;
16721 
16722   if (StringLen (str) < 9) return TRUE;
16723 
16724   ch = str [0];
16725   if (ch != 'S' && ch != 'D' && ch != 'E') return TRUE;
16726   ch = str [1];
16727   if (! IS_UPPER (ch)) return TRUE;
16728   ch = str [2];
16729   if (! IS_UPPER (ch)) return TRUE;
16730 
16731   ptr = str + 3;
16732   ch = *ptr;
16733   while (ch != '\0') {
16734     if (! IS_DIGIT (ch)) return TRUE;
16735     ptr++;
16736     ch = *ptr;
16737   }
16738 
16739   return FALSE;
16740 }
16741 
BadBPFormat(CharPtr str)16742 static Boolean BadBPFormat (CharPtr str)
16743 
16744 {
16745   Char     ch;
16746   CharPtr  ptr;
16747 
16748   if (StringLen (str) < 6) return TRUE;
16749 
16750   ch = str [0];
16751   if (ch != 'P') return TRUE;
16752   ch = str [1];
16753   if (ch != 'R') return TRUE;
16754   ch = str [2];
16755   if (ch != 'J') return TRUE;
16756   ch = str [3];
16757   if (ch != 'E' && ch != 'N' && ch != 'D') return TRUE;
16758   ch = str [4];
16759   if (! IS_UPPER (ch)) return TRUE;
16760 
16761   ptr = str + 5;
16762   ch = *ptr;
16763   while (ch != '\0') {
16764     if (! IS_DIGIT (ch)) return TRUE;
16765     ptr++;
16766     ch = *ptr;
16767   }
16768 
16769   return FALSE;
16770 }
16771 
16772 static CharPtr dblink_names [] = {
16773   "Trace Assembly Archive",
16774   "ProbeDB",
16775   "Assembly",
16776   "BioSample",
16777   "Sequence Read Archive",
16778   "BioProject",
16779   NULL
16780 };
16781 
ValidateDblink(ValidStructPtr vsp,UserObjectPtr uop)16782 static void ValidateDblink (ValidStructPtr vsp, UserObjectPtr uop)
16783 
16784 {
16785   CharPtr PNTR      cpp;
16786   GatherContextPtr  gcp;
16787   Int4              i;
16788   ObjectIdPtr       oip;
16789   CharPtr           str;
16790   UserFieldPtr      ufp;
16791 
16792   if (vsp == NULL || uop == NULL) return;
16793   gcp = vsp->gcp;
16794 
16795   for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
16796     oip = ufp->label;
16797     if (oip == NULL || oip->str == NULL) continue;
16798     if (StringICmp (oip->str, "Trace Assembly Archive") == 0) {
16799     } else if (StringICmp (oip->str, "ProbeDB") == 0) {
16800     } else if (StringICmp (oip->str, "Assembly") == 0) {
16801     } else if (StringICmp (oip->str, "BioSample") == 0) {
16802       if (ufp->choice == 1) {
16803         str = (CharPtr) ufp->data.ptrvalue;
16804         if (StringDoesHaveText (str)) {
16805           if (BadBSFormat (str) && BadAltBSFormat (str)) {
16806             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_DBLinkProblem, "Bad BioSample format - %s", str);
16807           }
16808         }
16809       } else if (ufp->choice == 7) {
16810         cpp = (CharPtr PNTR) ufp->data.ptrvalue;
16811         if (ufp->num < 1 || cpp == NULL) continue;
16812         for (i = 0; i < ufp->num; i++) {
16813           str = cpp [i];
16814           if (StringHasNoText (str)) continue;
16815           if (BadBSFormat (str) && BadAltBSFormat (str)) {
16816             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_DBLinkProblem, "Bad BioSample format - %s", str);
16817           }
16818         }
16819       }
16820     } else if (StringICmp (oip->str, "Sequence Read Archive") == 0 && ufp->choice == 7) {
16821       if (ufp->choice == 1) {
16822         //LCOV_EXCL_START
16823         //DUH. choice is required to be 7, will not be 1
16824         str = (CharPtr) ufp->data.ptrvalue;
16825         if (StringDoesHaveText (str)) {
16826           if (BadSRAFormat (str)) {
16827             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_DBLinkProblem, "Bad Sequence Read Archive format - %s", str);
16828           }
16829         }
16830         //LCOV_EXCL_STOP
16831       } else if (ufp->choice == 7) {
16832         cpp = (CharPtr PNTR) ufp->data.ptrvalue;
16833         if (ufp->num < 1 || cpp == NULL) continue;
16834         for (i = 0; i < ufp->num; i++) {
16835           str = cpp [i];
16836           if (StringHasNoText (str)) continue;
16837           if (BadSRAFormat (str)) {
16838             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_DBLinkProblem, "Bad Sequence Read Archive format - %s", str);
16839           }
16840         }
16841       }
16842     } else if (StringICmp (oip->str, "BioProject") == 0 && ufp->choice == 7) {
16843       if (ufp->choice == 1) {
16844         //LCOV_EXCL_START
16845         //DUH. choice is required to be 7, will not be 1
16846         str = (CharPtr) ufp->data.ptrvalue;
16847         if (StringDoesHaveText (str)) {
16848           if (BadBPFormat (str)) {
16849             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_DBLinkProblem, "Bad BioProject format - %s", str);
16850           }
16851         }
16852         //LCOV_EXCL_STOP
16853       } else if (ufp->choice == 7) {
16854         cpp = (CharPtr PNTR) ufp->data.ptrvalue;
16855         if (ufp->num < 1 || cpp == NULL) continue;
16856         for (i = 0; i < ufp->num; i++) {
16857           str = cpp [i];
16858           if (StringHasNoText (str)) continue;
16859           if (BadBPFormat (str)) {
16860             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_DBLinkProblem, "Bad BioProject format - %s", str);
16861           }
16862         }
16863       }
16864     }
16865     for (i = 0; dblink_names [i] != NULL; i++) {
16866       if (StringICmp (oip->str, dblink_names [i]) == 0 && StringCmp (oip->str, dblink_names [i]) != 0) {
16867         ValidErr (vsp, SEV_REJECT, ERR_SEQ_DESCR_DBLinkProblem, "Bad DBLink capitalization - %s", oip->str);
16868       }
16869     }
16870   }
16871 }
16872 
16873 
ValidateSeqDescrCommon(ValNodePtr sdp,BioseqValidStrPtr bvsp,ValidStructPtr vsp,Uint4 descitemid)16874 static Boolean ValidateSeqDescrCommon (ValNodePtr sdp, BioseqValidStrPtr bvsp, ValidStructPtr vsp, Uint4 descitemid)
16875 {
16876   ValNodePtr      vnp, vnp2;
16877   OrgRefPtr       this_org = NULL, that_org = NULL;
16878   int             tmpval;
16879   Char            buf1[20], buf2[20], ch;
16880   EMBLBlockPtr    ebp;
16881   GBBlockPtr      gbp;
16882   ValNodePtr      keywords = NULL;
16883   PubdescPtr      pdp;
16884   MolInfoPtr      mip;
16885   ObjectIdPtr     oip;
16886   Uint2           olditemtype = 0;
16887   Uint4           olditemid = 0;
16888   BioSourcePtr    biop;
16889   GatherContextPtr gcp = NULL;
16890   CharPtr         str, ptr;
16891   SeqFeatPtr      sfp;
16892   Boolean         tpa_exp;
16893   Boolean         tpa_inf;
16894   UserObjectPtr   uop;
16895   BioseqPtr       bsp;
16896   DatePtr         dp;
16897   size_t          len;
16898   SeqMgrFeatContext  fcontext;
16899   Int2            baddate;
16900   static char    *badmod = "Inconsistent GIBB-mod [%d] and [%d]";
16901   CharPtr         p;
16902 
16903   vsp->sfp = NULL;
16904   vnp = sdp;
16905   vsp->descr = vnp;
16906 
16907   if (descitemid > 0) {
16908     gcp = vsp->gcp;
16909     if (gcp != NULL) {
16910       olditemid = gcp->itemID;
16911       olditemtype = gcp->thistype;
16912       gcp->itemID = descitemid;
16913       gcp->thistype = OBJ_SEQDESC;
16914     }
16915   }
16916 
16917   switch (vnp->choice) {
16918   case Seq_descr_mol_type:
16919     tmpval = (int) (vnp->data.intvalue);
16920     switch (tmpval) {
16921     case 8:                    /* peptide */
16922       if (!bvsp->is_aa)
16923         ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Nucleic acid with GIBB-mol = peptide");
16924       break;
16925     case 0:                    /* unknown */
16926     case 255:                  /* other */
16927       ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "GIBB-mol unknown or other used");
16928       break;
16929     default:                   /* the rest are nucleic acid */
16930       if (bvsp->is_aa) {
16931         ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "GIBB-mol [%d] used on protein", tmpval);
16932       } else {
16933         if (bvsp->last_na_mol) {
16934           if (bvsp->last_na_mol != (int) vnp->data.intvalue) {
16935             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Inconsistent GIBB-mol [%d] and [%d]", bvsp->last_na_mol, tmpval);
16936           }
16937         } else
16938           bvsp->last_na_mol = tmpval;
16939       }
16940       break;
16941     }
16942     break;
16943   case Seq_descr_modif:
16944     for (vnp2 = (ValNodePtr) (vnp->data.ptrvalue); vnp2 != NULL; vnp2 = vnp2->next) {
16945       tmpval = (int) (vnp2->data.intvalue);
16946       switch (tmpval) {
16947       case 0:                  /* dna */
16948       case 1:                  /* rna */
16949         if (bvsp->is_aa) {      /* only temporarily on 0 */
16950           ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Nucleic acid GIBB-mod [%d] on protein", tmpval);
16951         } else if (bvsp->last_na_mod) {
16952           if (tmpval != bvsp->last_na_mod) {
16953             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, badmod, bvsp->last_na_mod, tmpval);
16954           }
16955         } else
16956           bvsp->last_na_mod = tmpval;
16957         break;
16958       case 4:                  /* mitochondria */
16959       case 5:                  /* chloroplast */
16960       case 6:                  /* kinetoplast */
16961       case 7:                  /* cyanelle */
16962       case 18:                 /* macronuclear */
16963         if (bvsp->last_organelle) {
16964           if (tmpval != bvsp->last_organelle) {
16965             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, badmod, bvsp->last_organelle, tmpval);
16966           }
16967         } else
16968           bvsp->last_organelle = tmpval;
16969         break;
16970       case 10:                 /* partial */
16971       case 11:                 /* complete */
16972         if (bvsp->last_partialness) {
16973           if (tmpval != bvsp->last_partialness) {
16974             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, badmod, bvsp->last_partialness, tmpval);
16975           }
16976         } else
16977           bvsp->last_partialness = tmpval;
16978         if ((bvsp->last_left_right) && (tmpval == 11)) {
16979           ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, badmod, bvsp->last_left_right, tmpval);
16980         }
16981         break;
16982       case 16:                 /* no left */
16983       case 17:                 /* no right */
16984         if (bvsp->last_partialness == 11) {     /* complete */
16985           ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, badmod, bvsp->last_partialness, tmpval);
16986         }
16987         bvsp->last_left_right = tmpval;
16988         break;
16989       case 255:                /* other */
16990         ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Unknown, "GIBB-mod = other used");
16991         break;
16992       default:
16993         break;
16994 
16995       }
16996     }
16997     break;
16998   case Seq_descr_method:
16999     if (!bvsp->is_aa) {
17000       ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Nucleic acid with protein sequence method");
17001     }
17002     break;
17003   /*
17004   case Seq_descr_comment:
17005     str = (CharPtr) vnp->data.ptrvalue;
17006     if (StringHasNoText (str)) {
17007       ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_MissingText, "Comment descriptor needs text");
17008     }
17009     if (SerialNumberInString (str)) {
17010       ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_SerialInComment,
17011                 "Comment may refer to reference by serial number - attach reference specific comments to the reference REMARK instead.");
17012     }
17013     if (StringLooksLikeFakeStructuredComment (str)) {
17014       ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_FakeStructuredComment,
17015                 "Comment may be formatted to look like a structured comment.");
17016     }
17017     for (vnp2 = vnp->next; vnp2 != NULL; vnp2 = vnp2->next) {
17018       if (vnp2->choice == Seq_descr_comment) {
17019         ptr = (CharPtr) vnp2->data.ptrvalue;
17020         if (StringDoesHaveText (ptr) && StringICmp (str, ptr) == 0) {
17021           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleComments, "Undesired multiple comment descriptors, identical text");
17022         }
17023       }
17024     }
17025     break;
17026   */
17027   case Seq_descr_genbank:
17028     if (bvsp->last_gb != NULL)
17029       ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Multiple GenBank blocks");
17030     else
17031       bvsp->last_gb = vnp;
17032     if (vnp != NULL) {
17033       gbp = (GBBlockPtr) vnp->data.ptrvalue;
17034       if (gbp != NULL) {
17035         keywords = gbp->keywords;
17036       }
17037     }
17038     break;
17039   case Seq_descr_embl:
17040     if (bvsp->last_embl != NULL)
17041       ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Multiple EMBL blocks");
17042     else
17043       bvsp->last_embl = vnp;
17044     if (vnp != NULL) {
17045       ebp = (EMBLBlockPtr) vnp->data.ptrvalue;
17046       if (ebp != NULL) {
17047         keywords = ebp->keywords;
17048       }
17049     }
17050     break;
17051   case Seq_descr_pir:
17052     if (bvsp->last_pir != NULL)
17053       ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Multiple PIR blocks");
17054     else
17055       bvsp->last_pir = vnp;
17056     break;
17057   case Seq_descr_sp:
17058     if (bvsp->last_sp != NULL)
17059       ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Multiple SWISS-PROT blocks");
17060     else
17061       bvsp->last_sp = vnp;
17062     break;
17063   case Seq_descr_pdb:
17064     if (bvsp->last_pdb != NULL)
17065       ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Multiple PDB blocks");
17066     else
17067       bvsp->last_pdb = vnp;
17068     break;
17069   case Seq_descr_prf:
17070     if (bvsp->last_prf != NULL)
17071       ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Multiple PRF blocks");
17072     else
17073       bvsp->last_prf = vnp;
17074     break;
17075   case Seq_descr_create_date:
17076     dp = (DatePtr) vnp->data.ptrvalue;
17077     if (DateIsBad (dp, TRUE, &baddate)) {
17078       PrintBadDateError (vsp, baddate, SEV_ERROR, ERR_GENERIC_BadDate, "Create date has error");
17079     }
17080     if (bvsp->last_create != NULL) {
17081       tmpval = (int) DateMatch ((DatePtr) vnp->data.ptrvalue, (DatePtr) (bvsp->last_create->data.ptrvalue), FALSE);
17082       if (tmpval && vsp->has_gi_or_accn_ver) {
17083         DatePrint ((DatePtr) (vnp->data.ptrvalue), buf1);
17084         DatePrint ((DatePtr) (bvsp->last_create->data.ptrvalue), buf2);
17085         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_InconsistentDates, "Inconsistent create_dates [%s] and [%s]", buf1, buf2);
17086       }
17087     } else
17088       bvsp->last_create = vnp;
17089     if (bvsp->last_update != NULL) {
17090       tmpval = (int) DateMatch ((DatePtr) vnp->data.ptrvalue, (DatePtr) (bvsp->last_update->data.ptrvalue), FALSE);
17091       if (tmpval == 1 && vsp->has_gi_or_accn_ver) {
17092         DatePrint ((DatePtr) (vnp->data.ptrvalue), buf1);
17093         DatePrint ((DatePtr) (bvsp->last_update->data.ptrvalue), buf2);
17094         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_InconsistentDates, "Inconsistent create_date [%s] and update_date [%s]", buf1, buf2);
17095       }
17096     }
17097     break;
17098   case Seq_descr_update_date:
17099     dp = (DatePtr) vnp->data.ptrvalue;
17100     if (DateIsBad (dp, TRUE, &baddate)) {
17101       PrintBadDateError (vsp, baddate, SEV_ERROR, ERR_GENERIC_BadDate, "Update date has error");
17102     }
17103     if (bvsp->last_create != NULL) {
17104       tmpval = (int) DateMatch ((DatePtr) bvsp->last_create->data.ptrvalue, (DatePtr) (vnp->data.ptrvalue), FALSE);
17105       if (tmpval == 1 && vsp->has_gi_or_accn_ver) {
17106         DatePrint ((DatePtr) (bvsp->last_create->data.ptrvalue), buf1);
17107         DatePrint ((DatePtr) (vnp->data.ptrvalue), buf2);
17108         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_InconsistentDates, "Inconsistent create_date [%s] and update_date [%s]", buf1, buf2);
17109       }
17110     }
17111     if (bvsp->last_update == NULL)
17112       bvsp->last_update = vnp;
17113     break;
17114   case Seq_descr_source:
17115     biop = (BioSourcePtr) vnp->data.ptrvalue;
17116     bsp = bvsp->bsp;
17117     if (biop != NULL && biop->is_focus && bsp != NULL) {
17118       if (ISA_aa (bsp->mol) || bsp->repr == Seq_repr_seg || SeqMgrGetParentOfPart (bsp, NULL) != NULL) {
17119         /* skip proteins, segmented bioseqs, or segmented parts */
17120       } else {
17121         sfp = SeqMgrGetNextFeature (bvsp->bsp, NULL, SEQFEAT_BIOSRC, 0, &fcontext);
17122         if (sfp == NULL) {
17123           ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_UnnecessaryBioSourceFocus, "BioSource descriptor has focus, but no BioSource feature");
17124         }
17125       }
17126     }
17127     if (biop != NULL && biop->origin == 5) {
17128       bsp = bvsp->bsp;
17129       if (bsp != NULL && ! IsOtherDNA (bsp) && !ISA_aa (bsp->mol)) {
17130         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_InvalidForType, "Molinfo-biomol other should be used if Biosource-location is synthetic");
17131       }
17132     }
17133     /* ValidateBioSource (vsp, gcp, biop, NULL, vnp); */
17134     if (biop != NULL) {
17135       this_org = biop->org;
17136     }
17137     /* fall into Seq_descr_org */
17138   case Seq_descr_org:
17139     if (this_org == NULL)
17140       this_org = (OrgRefPtr) (vnp->data.ptrvalue);
17141     if (bvsp->last_org != NULL) {
17142       if ((this_org->taxname != NULL) && (bvsp->last_org->taxname != NULL)) {
17143         if (StringCmp (this_org->taxname, bvsp->last_org->taxname)) {
17144           if (! vsp->is_wp_in_sep) {
17145             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Inconsistent taxnames [%s] and [%s]", this_org->taxname, bvsp->last_org->taxname);
17146           }
17147         }
17148       }
17149     } else
17150       bvsp->last_org = this_org;
17151 
17152     for (vnp2 = vnp->next; vnp2 != NULL; vnp2 = vnp2->next) {
17153       if (vnp2->choice == Seq_descr_source || vnp2->choice == Seq_descr_org) {
17154         that_org = NULL;
17155         if (vnp2->choice == Seq_descr_source) {
17156           that_org = ((BioSourcePtr) (vnp2->data.ptrvalue))->org;
17157         }
17158         if (that_org == NULL) {
17159           that_org = (OrgRefPtr) (vnp2->data.ptrvalue);
17160         }
17161         if (that_org != NULL) {
17162           if ((this_org->taxname != NULL) && (that_org->taxname != NULL) && StringCmp (this_org->taxname, that_org->taxname) == 0) {
17163             ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_MultipleBioSources, "Undesired multiple source descriptors");
17164           }
17165         }
17166       }
17167     }
17168     break;
17169   case Seq_descr_title:
17170     str = (CharPtr) vnp->data.ptrvalue;
17171     if (StringHasNoText (str)) {
17172       ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_MissingText, "Title descriptor needs text");
17173     }
17174     for (vnp2 = vnp->next; vnp2 != NULL; vnp2 = vnp2->next) {
17175       if (vnp2->choice == Seq_descr_title) {
17176         ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_MultipleTitles, "Undesired multiple title descriptors");
17177         break;
17178       }
17179     }
17180     len = StringLen (str);
17181     if (len > 4) {
17182       ch = str [len - 1];
17183       while (ch == ' ' && len > 4) {
17184         len--;
17185         ch = str [len - 1];
17186       }
17187       if (ch == '.' && len > 4) {
17188         len--;
17189         ch = str [len - 1];
17190       }
17191       if (ch == '.' || ch == ',' || ch == ';' || ch == ':') {
17192         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadPunctuation, "Title descriptor ends in bad punctuation");
17193       }
17194     }
17195     if (StringHasPMID (str)) {
17196       ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_TitleHasPMID, "Title descriptor has internal PMID");
17197     }
17198     break;
17199   case Seq_descr_name:
17200     str = (CharPtr) vnp->data.ptrvalue;
17201     if (StringHasNoText (str)) {
17202       ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_MissingText, "Name descriptor needs text");
17203     }
17204     for (vnp2 = vnp->next; vnp2 != NULL; vnp2 = vnp2->next) {
17205       if (vnp2->choice == Seq_descr_name) {
17206         ptr = (CharPtr) vnp2->data.ptrvalue;
17207         if (StringDoesHaveText (ptr) && StringICmp (str, ptr) == 0) {
17208           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleNames, "Undesired multiple name descriptors, identical text");
17209         } else {
17210           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MultipleNames, "Undesired multiple name descriptors, different text");
17211         }
17212       }
17213     }
17214     break;
17215   case Seq_descr_region:
17216     str = (CharPtr) vnp->data.ptrvalue;
17217     if (StringHasNoText (str)) {
17218       ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_MissingText, "Region descriptor needs text");
17219     }
17220     break;
17221   case Seq_descr_user:
17222     uop = (UserObjectPtr) vnp->data.ptrvalue;
17223     if (uop != NULL) {
17224       oip = uop->type;
17225       if (oip != NULL) {
17226         if (StringCmp (oip->str, "StructuredComment") == 0) {
17227           if (uop->data == NULL) {
17228             ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_UserObjectProblem, "Structured Comment user object descriptor is empty");
17229           }
17230           if (!HasStructuredCommentPrefix (uop)) {
17231             ValidErr (vsp, SEV_INFO, ERR_SEQ_DESCR_StructuredCommentPrefixOrSuffixMissing, "Structured Comment lacks prefix");
17232           }
17233         } else if (StringICmp (oip->str, "DBLink") == 0) {
17234           ValidateDblink (vsp, uop);
17235         }
17236       }
17237     }
17238     break;
17239   case Seq_descr_pub:
17240     bvsp->got_a_pub = TRUE;
17241     pdp = (PubdescPtr) vnp->data.ptrvalue;
17242     /*
17243        ValidatePubdesc (vsp, pdp);
17244      */
17245     break;
17246   case Seq_descr_molinfo:
17247     mip = (MolInfoPtr) vnp->data.ptrvalue;
17248     if (mip != NULL) {
17249       switch (mip->biomol) {
17250       case MOLECULE_TYPE_PEPTIDE:      /* peptide */
17251         if (!bvsp->is_aa) {
17252           ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Nucleic acid with Molinfo-biomol = peptide");
17253         }
17254         break;
17255       case MOLECULE_TYPE_OTHER_GENETIC_MATERIAL:
17256         if (! bvsp->is_artificial) {
17257           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_InvalidForType, "Molinfo-biomol = other genetic");
17258         }
17259         break;
17260       case 0:                  /* unknown */
17261         ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Molinfo-biomol unknown used");
17262         break;
17263       case 255:                /* other */
17264         if (! IsXr (vnp)) {
17265           bsp = bvsp->bsp;
17266           if (! IsSynthetic (bsp)) {
17267             if (! IsMicroRNA (bsp)) {
17268               ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_InvalidForType, "Molinfo-biomol other used");
17269             }
17270           }
17271         }
17272         break;
17273       default:                 /* the rest are nucleic acid */
17274         if (bvsp->is_aa) {
17275           ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Molinfo-biomol [%d] used on protein", (int) mip->biomol);
17276         } else {
17277           if (bvsp->last_biomol) {
17278             if (bvsp->last_biomol != (int) mip->biomol) {
17279               ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Inconsistent Molinfo-biomol [%d] and [%d]", bvsp->last_biomol, (int) mip->biomol);
17280             }
17281           } else {
17282             bvsp->last_biomol = (int) mip->biomol;
17283           }
17284         }
17285         break;
17286       }
17287 
17288       if (bvsp->is_syn_constr) {
17289         if (mip->biomol != MOLECULE_TYPE_OTHER_GENETIC_MATERIAL && !bvsp->is_aa) {
17290           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_InvalidForType, "synthetic construct should have other-genetic");
17291         }
17292         if (! bvsp->is_artificial) {
17293           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_InvalidForType, "synthetic construct should have artificial origin");
17294         }
17295       } else if (bvsp->is_artificial) {
17296         ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_InvalidForType, "artificial origin should have other-genetic and synthetic construct");
17297       }
17298       if (bvsp->is_artificial) {
17299         if (mip->biomol != MOLECULE_TYPE_OTHER_GENETIC_MATERIAL && mip->biomol != MOLECULE_TYPE_PEPTIDE) {
17300           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_InvalidForType, "artificial origin should have other-genetic");
17301         }
17302       }
17303       if (!bvsp->is_aa) {
17304         switch (mip->tech) {
17305         case MI_TECH_concept_trans:
17306         case MI_TECH_seq_pept:
17307         case MI_TECH_both:
17308         case MI_TECH_seq_pept_overlap:
17309         case MI_TECH_seq_pept_homol:
17310         case MI_TECH_concept_trans_a:
17311           ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Nucleic acid with protein sequence method");
17312           break;
17313         case MI_TECH_tsa:
17314             if(mip->biomol == 0)
17315                 p = "unknown";
17316             else if(mip->biomol == 1)
17317                 p = "genomic";
17318             else if(mip->biomol == 2)
17319                 p = "pre-RNA";
17320             else if(mip->biomol == 5)
17321                 p = "tRNA";
17322             else if(mip->biomol == 6)
17323                 p = "snRNA";
17324             else if(mip->biomol == 7)
17325                 p = "scRNA";
17326             else if(mip->biomol == 8)
17327                 p = "peptide";
17328             else if(mip->biomol == 9)
17329                 p = "other-genetic";
17330             else if(mip->biomol == 10)
17331                 p = "genomic-mRNA";
17332             else if(mip->biomol == 11)
17333                 p = "cRNA";
17334             else if(mip->biomol == 12)
17335                 p = "snoRNA";
17336             else if(mip->biomol == 15)
17337                 p = "tmRNA";
17338             else if(mip->biomol == 255)
17339                 p = "other";
17340             else
17341                 p = NULL;
17342             if(p != NULL)
17343                 ValidErr(vsp, SEV_ERROR, ERR_SEQ_DESCR_WrongBiomolForTechnique,
17344                         "Biomol \"%s\" is not appropriate for sequences that use the TSA technique.",
17345                         p);
17346             break;
17347         default:
17348           break;
17349         }
17350       } else {
17351         switch (mip->tech) {
17352         case MI_TECH_est:
17353         case MI_TECH_sts:
17354         case MI_TECH_genemap:
17355         case MI_TECH_physmap:
17356         case MI_TECH_htgs_1:
17357         case MI_TECH_htgs_2:
17358         case MI_TECH_htgs_3:
17359         case MI_TECH_fli_cdna:
17360         case MI_TECH_htgs_0:
17361         case MI_TECH_htc:
17362         case MI_TECH_wgs:
17363         case MI_TECH_barcode:
17364         case MI_TECH_composite_wgs_htgs:
17365           ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_InvalidForType, "Protein with nucleic acid sequence method");
17366           break;
17367         default:
17368           break;
17369         }
17370       }
17371       if (bvsp->last_tech) {
17372         if (bvsp->last_tech != (int) mip->tech) {
17373           ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Inconsistent Molinfo-tech [%d] and [%d]", bvsp->last_tech, (int) mip->tech);
17374         }
17375       } else {
17376         bvsp->last_tech = (int) mip->tech;
17377       }
17378       if (bvsp->last_completeness) {
17379         if (bvsp->last_completeness != (int) mip->completeness) {
17380           ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "Inconsistent Molinfo-completeness [%d] and [%d]",
17381                     bvsp->last_completeness, (int) mip->completeness);
17382         }
17383       } else {
17384         bvsp->last_completeness = (int) mip->completeness;
17385       }
17386     }
17387     break;
17388   default:
17389     break;
17390   }
17391 
17392   if (keywords != NULL) {
17393     tpa_exp = FALSE;
17394     tpa_inf = FALSE;
17395     for (vnp = keywords; vnp != NULL; vnp = vnp->next) {
17396       if (StringICmp ((CharPtr) vnp->data.ptrvalue, "TPA:experimental") == 0) {
17397         tpa_exp = TRUE;
17398       } else if (StringICmp ((CharPtr) vnp->data.ptrvalue, "TPA:inferential") == 0) {
17399         tpa_inf = TRUE;
17400       }
17401     }
17402     if (tpa_exp && tpa_inf) {
17403       ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "TPA:experimental and TPA:inferential should not both be in the same set of keywords");
17404     }
17405   }
17406 
17407   if (gcp != NULL) {
17408     gcp->itemID = olditemid;
17409     gcp->thistype = olditemtype;
17410   }
17411 
17412   return TRUE;
17413 }
17414 
ValidateSeqDescrIndexed(ValNodePtr sdp,SeqMgrDescContextPtr context)17415 static Boolean LIBCALLBACK ValidateSeqDescrIndexed (ValNodePtr sdp, SeqMgrDescContextPtr context)
17416 {
17417   ValidStructPtr  vsp;
17418   BioseqValidStrPtr bvsp;
17419 
17420   bvsp = (BioseqValidStrPtr) context->userdata;
17421   vsp = bvsp->vsp;
17422 
17423   return ValidateSeqDescrCommon (sdp, bvsp, vsp, context->itemID);
17424 }
17425 
17426 //LCOV_EXCL_START
17427 //only used when indexing not available
/* Gather-based counterpart of the indexed callback: takes the validation
   state from gcp->userdata and the descriptor from gcp->thisitem, and passes
   an item id of 0 so ValidateSeqDescrCommon leaves the gather context alone. */
static void ValidateSeqDescrContext (GatherContextPtr gcp)
{
  BioseqValidStrPtr state = (BioseqValidStrPtr) (gcp->userdata);
  ValidStructPtr    validator = state->vsp;
  ValNodePtr        descriptor = (ValNodePtr) (gcp->thisitem);

  ValidateSeqDescrCommon (descriptor, state, validator, 0);
}
17440 //LCOV_EXCL_STOP
17441 
17442 /*****************************************************************************
17443 *
17444 *   ValidateBioseqContextGather(gcp)
17445 *      Gather callback for validating context on a Bioseq
17446 *
17447 *****************************************************************************/
DifferentDbxrefs(ValNodePtr dbxref1,ValNodePtr dbxref2)17448 static Boolean DifferentDbxrefs (ValNodePtr dbxref1, ValNodePtr dbxref2)
17449 {
17450   DbtagPtr        dbt1, dbt2;
17451   ObjectIdPtr     oip1, oip2;
17452 
17453   if (dbxref1 == NULL || dbxref2 == NULL)
17454     return FALSE;
17455   dbt1 = (DbtagPtr) dbxref1->data.ptrvalue;
17456   dbt2 = (DbtagPtr) dbxref2->data.ptrvalue;
17457   if (dbt1 == NULL || dbt2 == NULL)
17458     return FALSE;
17459   if (StringICmp (dbt1->db, dbt2->db) != 0)
17460     return TRUE;
17461   oip1 = dbt1->tag;
17462   oip2 = dbt2->tag;
17463   if (oip1 == NULL || oip2 == NULL)
17464     return FALSE;
17465   if (oip1->str == NULL && oip2->str == NULL) {
17466     if (oip1->id != oip2->id)
17467       return TRUE;
17468   } else {
17469     if (StringICmp (oip1->str, oip2->str) != 0)
17470       return TRUE;
17471   }
17472   return FALSE;
17473 }
17474 
FlybaseDbxrefs(ValNodePtr vnp)17475 static Boolean FlybaseDbxrefs (ValNodePtr vnp)
17476 
17477 {
17478   DbtagPtr  dbt;
17479 
17480   while (vnp != NULL) {
17481     dbt = (DbtagPtr) vnp->data.ptrvalue;
17482     if (dbt != NULL) {
17483       if (StringCmp (dbt->db, "FLYBASE") == 0 || StringCmp (dbt->db, "FlyBase") == 0) {
17484         return TRUE;
17485       }
17486     }
17487     vnp = vnp->next;
17488   }
17489   return FALSE;
17490 }
17491 
GPSorNTorNCorNGorNW(SeqEntryPtr sep,SeqLocPtr location)17492 static Boolean GPSorNTorNCorNGorNW (SeqEntryPtr sep, SeqLocPtr location)
17493 {
17494   BioseqPtr       bsp;
17495   BioseqSetPtr    bssp;
17496   SeqIdPtr        sip;
17497   TextSeqIdPtr    tsip;
17498 
17499   if (sep != NULL && IS_Bioseq_set (sep)) {
17500     bssp = (BioseqSetPtr) sep->data.ptrvalue;
17501     if (bssp != NULL && bssp->_class == BioseqseqSet_class_gen_prod_set) {
17502       return TRUE;
17503     }
17504   }
17505   bsp = BioseqFindFromSeqLoc (location);
17506   if (bsp != NULL) {
17507     for (sip = bsp->id; sip != NULL; sip = sip->next) {
17508       if (sip->choice == SEQID_OTHER) {
17509         tsip = (TextSeqIdPtr) sip->data.ptrvalue;
17510         if (tsip != NULL && tsip->accession != NULL) {
17511           if (StringNICmp (tsip->accession, "NT_", 3) == 0) {
17512             return TRUE;
17513           } else if (StringNICmp (tsip->accession, "NC_", 3) == 0) {
17514             return TRUE;
17515           } else if (StringNICmp (tsip->accession, "NG_", 3) == 0) {
17516             return TRUE;
17517           } else if (StringNICmp (tsip->accession, "NW_", 3) == 0) {
17518             return TRUE;
17519           }
17520         }
17521       }
17522     }
17523   }
17524   return FALSE;
17525 }
17526 
IsGenBankAccn(SeqEntryPtr sep,SeqLocPtr location)17527 static Boolean IsGenBankAccn (SeqEntryPtr sep, SeqLocPtr location)
17528 {
17529   BioseqPtr  bsp;
17530   SeqIdPtr   sip;
17531 
17532   bsp = BioseqFindFromSeqLoc (location);
17533   if (bsp != NULL) {
17534     for (sip = bsp->id; sip != NULL; sip = sip->next) {
17535       if (sip->choice == SEQID_GENBANK) return TRUE;
17536     }
17537   }
17538   return FALSE;
17539 }
17540 
IsEMBLAccn(SeqEntryPtr sep,SeqLocPtr location)17541 static Boolean IsEMBLAccn (SeqEntryPtr sep, SeqLocPtr location)
17542 {
17543   BioseqPtr  bsp;
17544   SeqIdPtr   sip;
17545 
17546   bsp = BioseqFindFromSeqLoc (location);
17547   if (bsp != NULL) {
17548     for (sip = bsp->id; sip != NULL; sip = sip->next) {
17549       if (sip->choice == SEQID_EMBL) return TRUE;
17550     }
17551   }
17552   return FALSE;
17553 }
17554 
IsGeneralAccn(SeqEntryPtr sep,SeqLocPtr location)17555 static Boolean IsGeneralAccn (SeqEntryPtr sep, SeqLocPtr location)
17556 {
17557   BioseqPtr  bsp;
17558   DbtagPtr   dbt;
17559   SeqIdPtr   sip;
17560 
17561   bsp = BioseqFindFromSeqLoc (location);
17562   if (bsp != NULL) {
17563     for (sip = bsp->id; sip != NULL; sip = sip->next) {
17564       if (sip->choice != SEQID_GENERAL) continue;
17565       dbt = (DbtagPtr) sip->data.ptrvalue;
17566       if (dbt == NULL) continue;
17567       if (IsSkippableDbtag(dbt)) continue;
17568       return TRUE;
17569     }
17570   }
17571   return FALSE;
17572 }
17573 
NGorNT(SeqEntryPtr sep,SeqLocPtr location,BoolPtr is_nc)17574 static Boolean NGorNT (SeqEntryPtr sep, SeqLocPtr location, BoolPtr is_nc)
17575 {
17576   BioseqPtr       bsp;
17577   SeqIdPtr        sip;
17578   TextSeqIdPtr    tsip;
17579 
17580   if (is_nc != NULL) {
17581     *is_nc = FALSE;
17582   }
17583   bsp = BioseqFindFromSeqLoc (location);
17584   if (bsp != NULL) {
17585     for (sip = bsp->id; sip != NULL; sip = sip->next) {
17586       if (sip->choice == SEQID_OTHER) {
17587         tsip = (TextSeqIdPtr) sip->data.ptrvalue;
17588         if (tsip != NULL && tsip->accession != NULL) {
17589           if (StringNICmp (tsip->accession, "NT_", 3) == 0) {
17590             return TRUE;
17591           } else if (StringNICmp (tsip->accession, "NG_", 3) == 0) {
17592             return TRUE;
17593           } else if (StringNICmp (tsip->accession, "NW_", 3) == 0) {
17594             return TRUE;
17595           } else if (StringNICmp (tsip->accession, "NC_", 3) == 0 && is_nc != NULL) {
17596             *is_nc = TRUE;
17597           }
17598         }
17599       }
17600     }
17601   }
17602   return FALSE;
17603 }
17604 
GPSorRefSeq(SeqEntryPtr sep,SeqLocPtr location)17605 static Boolean GPSorRefSeq (SeqEntryPtr sep, SeqLocPtr location)
17606 {
17607   BioseqPtr     bsp;
17608   BioseqSetPtr  bssp;
17609   SeqIdPtr      sip;
17610 
17611   if (sep != NULL && IS_Bioseq_set (sep)) {
17612     bssp = (BioseqSetPtr) sep->data.ptrvalue;
17613     if (bssp != NULL && bssp->_class == BioseqseqSet_class_gen_prod_set) {
17614       return TRUE;
17615     }
17616   }
17617   bsp = BioseqFindFromSeqLoc (location);
17618   if (bsp != NULL) {
17619     for (sip = bsp->id; sip != NULL; sip = sip->next) {
17620       if (sip->choice == SEQID_OTHER) {
17621         return TRUE;
17622       }
17623     }
17624   }
17625   return FALSE;
17626 }
17627 
IsNCorNT(SeqEntryPtr sep,SeqLocPtr location)17628 static Boolean IsNCorNT (SeqEntryPtr sep, SeqLocPtr location)
17629 {
17630   BioseqPtr       bsp;
17631   SeqIdPtr        sip;
17632   TextSeqIdPtr    tsip;
17633 
17634   bsp = BioseqFindFromSeqLoc (location);
17635   if (bsp != NULL) {
17636     for (sip = bsp->id; sip != NULL; sip = sip->next) {
17637       if (sip->choice == SEQID_OTHER) {
17638         tsip = (TextSeqIdPtr) sip->data.ptrvalue;
17639         if (tsip != NULL && tsip->accession != NULL) {
17640           if (StringNICmp (tsip->accession, "NC_", 3) == 0) {
17641             return TRUE;
17642           } else if (StringNICmp (tsip->accession, "NT_", 3) == 0) {
17643             return TRUE;
17644           }
17645         }
17646       }
17647     }
17648   }
17649   return FALSE;
17650 }
17651 
IsNCorNTorNW(SeqEntryPtr sep,SeqLocPtr location)17652 static Boolean IsNCorNTorNW (SeqEntryPtr sep, SeqLocPtr location)
17653 {
17654   BioseqPtr       bsp;
17655   SeqIdPtr        sip;
17656   TextSeqIdPtr    tsip;
17657 
17658   bsp = BioseqFindFromSeqLoc (location);
17659   if (bsp != NULL) {
17660     for (sip = bsp->id; sip != NULL; sip = sip->next) {
17661       if (sip->choice == SEQID_OTHER) {
17662         tsip = (TextSeqIdPtr) sip->data.ptrvalue;
17663         if (tsip != NULL && tsip->accession != NULL) {
17664           if (StringNICmp (tsip->accession, "NC_", 3) == 0) {
17665             return TRUE;
17666           } else if (StringNICmp (tsip->accession, "NT_", 3) == 0) {
17667             return TRUE;
17668           } else if (StringNICmp (tsip->accession, "NW_", 3) == 0) {
17669             return TRUE;
17670           }
17671         }
17672       }
17673     }
17674   }
17675   return FALSE;
17676 }
17677 
NotPeptideException(SeqFeatPtr sfp,SeqFeatPtr last)17678 static Boolean NotPeptideException (SeqFeatPtr sfp, SeqFeatPtr last)
17679 {
17680   if (sfp != NULL && sfp->excpt) {
17681     if (StringISearch (sfp->except_text, "alternative processing") != NULL)
17682       return FALSE;
17683   }
17684   if (last != NULL && last->excpt) {
17685     if (StringISearch (last->except_text, "alternative processing") != NULL)
17686       return FALSE;
17687   }
17688   return TRUE;
17689 }
17690 
DescsSame(AnnotDescrPtr adp1,AnnotDescrPtr adp2)17691 static Boolean DescsSame (AnnotDescrPtr adp1, AnnotDescrPtr adp2)
17692 
17693 {
17694   if (adp1 == NULL || adp2 == NULL) return TRUE;
17695   if (adp1->choice != adp2->choice) return FALSE;
17696   if (adp1->choice == Annot_descr_name || adp1->choice == Annot_descr_title) {
17697     if (StringICmp ((CharPtr) adp1->data.ptrvalue, (CharPtr) adp2->data.ptrvalue) == 0) return TRUE;
17698   }
17699   return FALSE;
17700 }
17701 
/*
 * GmcData - one (gene, feature) pairing.
 *
 *   ValidateCDSmRNAmatch fills an array of these, one per CDS or mRNA
 *   feature, and sorts it with SortGmcByGenePtr so that entries sharing
 *   the same gene pointer become adjacent.
 */
typedef struct gmcdata {
  SeqFeatPtr  gene;  /* gene found for feat (by xref or overlap); may be NULL */
  SeqFeatPtr  feat;  /* the CDS or mRNA feature itself */
} GmcData, PNTR GmcDataPtr;
17706 
SortGmcByGenePtr(VoidPtr vp1,VoidPtr vp2)17707 static int LIBCALLBACK SortGmcByGenePtr (
17708   VoidPtr vp1,
17709   VoidPtr vp2
17710 )
17711 
17712 {
17713   GmcDataPtr gdp1, gdp2;
17714 
17715   if (vp1 == NULL || vp2 == NULL) return 0;
17716   gdp1 = (GmcDataPtr) vp1;
17717   gdp2 = (GmcDataPtr) vp2;
17718   if (gdp1 == NULL || gdp2 == NULL) return 0;
17719 
17720   if (gdp1->gene > gdp2->gene) return -1;
17721   if (gdp1->gene < gdp2->gene) return 1;
17722 
17723   if (gdp1->feat > gdp2->feat) return -1;
17724   if (gdp1->feat < gdp2->feat) return 1;
17725 
17726   return 0;
17727 }
17728 
/*
 * ValidateLocusTagGeneral
 *
 *   For every CDS and mRNA feature indexed on a nucleotide Bioseq, finds
 *   the governing gene reference (explicit gene xref, else the overlapping
 *   gene) and, if it has a locus_tag, compares that tag against each
 *   SEQID_GENERAL id on the feature's product Bioseq.  The general id's
 *   string tag is truncated at the first '-' before the case-insensitive
 *   comparison.  A mismatch is reported on the feature as
 *   ERR_SEQ_FEAT_LocusTagProductMismatch.
 *
 *   Features whose gene xref is suppressed, general ids for which
 *   IsSkippableDbtag returns true, and ids without a string tag are
 *   skipped.  The gather context's itemID/thistype are saved on entry and
 *   restored on exit.
 */
static void ValidateLocusTagGeneral (ValidStructPtr vsp, BioseqPtr bsp)

{
  CharPtr            dash;
  DbtagPtr           dbt;
  SeqMgrFeatContext  fcontext;
  GatherContextPtr   gcp;
  GeneRefPtr         grp;
  ObjectIdPtr        oip;
  SeqFeatPtr         overlap;
  Char               prefix [64];
  BioseqPtr          prod;
  Uint4              saved_itemid = 0;
  Uint2              saved_itemtype = 0;
  SeqFeatPtr         sfp;
  SeqIdPtr           sip;

  if (vsp == NULL || bsp == NULL) return;
  if (! ISA_na (bsp->mol)) return;

  gcp = vsp->gcp;
  if (gcp != NULL) {
    saved_itemid = gcp->itemID;
    saved_itemtype = gcp->thistype;
  }

  for (sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
       sfp != NULL;
       sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext)) {

    if (sfp->idx.subtype != FEATDEF_CDS && sfp->idx.subtype != FEATDEF_mRNA) continue;

    grp = SeqMgrGetGeneXref (sfp);
    if (SeqMgrGeneIsSuppressed (grp)) continue;

    /* no explicit xref: fall back to the overlapping gene feature */
    if (grp == NULL) {
      overlap = SeqMgrGetOverlappingGene (sfp->location, NULL);
      if (overlap != NULL) {
        grp = (GeneRefPtr) overlap->data.value.ptrvalue;
      }
    }
    if (grp == NULL || ! StringDoesHaveText (grp->locus_tag)) continue;

    prod = BioseqFindFromSeqLoc (sfp->product);
    if (prod == NULL) continue;

    for (sip = prod->id; sip != NULL; sip = sip->next) {
      if (sip->choice != SEQID_GENERAL) continue;
      dbt = (DbtagPtr) sip->data.ptrvalue;
      if (dbt == NULL) continue;
      if (IsSkippableDbtag (dbt)) continue;
      oip = dbt->tag;
      if (oip == NULL) continue;
      if (StringHasNoText (oip->str)) continue;

      /* compare only the part of the general id before the first dash */
      StringNCpy_0 (prefix, oip->str, sizeof (prefix));
      dash = StringChr (prefix, '-');
      if (dash != NULL) {
        *dash = '\0';
      }

      if (StringICmp (grp->locus_tag, prefix) == 0) continue;

      if (gcp != NULL) {
        gcp->itemID = sfp->idx.itemID;
        gcp->thistype = OBJ_SEQFEAT;
      }
      vsp->descr = NULL;
      vsp->sfp = sfp;
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_LocusTagProductMismatch, "Gene locus_tag does not match general ID of product");
    }
  }

  if (gcp != NULL) {
    gcp->itemID = saved_itemid;
    gcp->thistype = saved_itemtype;
  }
}
17804 
ReplaceQualsDiffer(GBQualPtr sfpqual,GBQualPtr lastqual)17805 static Boolean ReplaceQualsDiffer (GBQualPtr sfpqual, GBQualPtr lastqual)
17806 
17807 {
17808   if (sfpqual == NULL || lastqual == NULL) return FALSE;
17809 
17810   while (sfpqual != NULL && StringICmp (sfpqual->qual, "replace") != 0) {
17811     sfpqual = sfpqual->next;
17812   }
17813   while (lastqual != NULL && StringICmp (lastqual->qual, "replace") != 0) {
17814     lastqual = lastqual->next;
17815   }
17816   if (sfpqual == NULL || lastqual == NULL) return FALSE;
17817 
17818   if (StringICmp (sfpqual->val, lastqual->val) != 0) return TRUE;
17819 
17820   return FALSE;
17821 }
17822 
GBQualsDiffer(GBQualPtr sfpqual,GBQualPtr lastqual)17823 static Boolean GBQualsDiffer (GBQualPtr sfpqual, GBQualPtr lastqual)
17824 
17825 {
17826   if (sfpqual == NULL || lastqual == NULL) return FALSE;
17827 
17828   /* depends upon sorted order of gbquals imposed by BasicSeqEntryCleanup */
17829 
17830   while (sfpqual != NULL && lastqual != NULL) {
17831     if (StringICmp (sfpqual->qual, lastqual->qual) != 0) return TRUE;
17832     if (StringICmp (sfpqual->val, lastqual->val) != 0) return TRUE;
17833     sfpqual = sfpqual->next;
17834     lastqual = lastqual->next;
17835   }
17836 
17837   if (sfpqual != NULL || lastqual != NULL) return TRUE;
17838 
17839   return FALSE;
17840 }
17841 
MakePubLabelString(PubdescPtr pdp)17842 static CharPtr MakePubLabelString (PubdescPtr pdp)
17843 
17844 {
17845   Char        buf [521];
17846   CitGenPtr   cgp;
17847   ValNodePtr  vnp;
17848 
17849   if (pdp == NULL) return NULL;
17850 
17851   vnp = pdp->pub;
17852 
17853   /* skip over just serial number */
17854 
17855   if (vnp != NULL && vnp->choice == PUB_Gen && vnp->next != NULL) {
17856     cgp = (CitGenPtr) vnp->data.ptrvalue;
17857     if (cgp != NULL) {
17858       if (StringNICmp ("BackBone id_pub", cgp->cit, 15) != 0) {
17859         if (cgp->cit == NULL && cgp->journal == NULL && cgp->date == NULL && cgp->serial_number) {
17860           vnp = vnp->next;
17861         }
17862       }
17863     }
17864   }
17865 
17866   if (PubLabelUnique (vnp, buf, sizeof (buf) - 1, OM_LABEL_CONTENT, TRUE) > 0) {
17867     return StringSaveNoNull (buf);
17868   }
17869 
17870   return NULL;
17871 }
17872 
ValGetAuthorsPlusConsortium(AuthListPtr alp)17873 static CharPtr ValGetAuthorsPlusConsortium (
17874   AuthListPtr alp
17875 )
17876 
17877 {
17878   CharPtr  consortium;
17879   CharPtr  str;
17880   CharPtr  tmp;
17881 
17882   consortium = NULL;
17883   str = GetAuthorsString (GENBANK_FMT, alp, &consortium, NULL, NULL);
17884   if (str == NULL) return consortium;
17885   if (consortium == NULL) return str;
17886   tmp = MemNew (StringLen (str) + StringLen (consortium) + 5);
17887   if (tmp == NULL) return NULL;
17888   StringCpy (tmp, str);
17889   StringCat (tmp, "; ");
17890   StringCat (tmp, consortium);
17891   MemFree (str);
17892   MemFree (consortium);
17893   return tmp;
17894 }
17895 
IsIdenticalPublication(PubdescPtr pdp1,PubdescPtr pdp2)17896 NLM_EXTERN Boolean IsIdenticalPublication (PubdescPtr pdp1, PubdescPtr pdp2)
17897 
17898 {
17899   AuthListPtr  alp1, alp2;
17900   Boolean      rsult = TRUE;
17901   CharPtr      str1, str2;
17902 
17903   if (pdp1 == NULL || pdp2 == NULL) return FALSE;
17904 
17905   str1 = MakePubLabelString (pdp1);
17906   str2 = MakePubLabelString (pdp2);
17907   if (StringDoesHaveText (str1) && StringDoesHaveText (str2)) {
17908     if (StringICmp (str1, str2) != 0) {
17909        rsult = FALSE;
17910     }
17911   }
17912   MemFree (str1);
17913   MemFree (str2);
17914   if (! rsult) return rsult;
17915 
17916   alp1 = GetAuthListPtr (pdp1, NULL);
17917   alp2 = GetAuthListPtr (pdp2, NULL);
17918   if (alp1 != NULL && alp2 != NULL) {
17919     str1 = ValGetAuthorsPlusConsortium (alp1);
17920     str2 = ValGetAuthorsPlusConsortium (alp2);
17921     if (StringDoesHaveText (str1) && StringDoesHaveText (str2)) {
17922       if (StringICmp (str1, str2) != 0) {
17923          rsult = FALSE;
17924       }
17925     }
17926     MemFree (str1);
17927     MemFree (str2);
17928   }
17929 
17930   return rsult;
17931 }
17932 
IsIdenticalBioSource(BioSourcePtr biop1,BioSourcePtr biop2)17933 static Boolean IsIdenticalBioSource (BioSourcePtr biop1, BioSourcePtr biop2)
17934 
17935 {
17936   DbtagPtr      dbt1, dbt2;
17937   ObjectIdPtr   oip1, oip2;
17938   OrgModPtr     omp1, omp2;
17939   OrgNamePtr    onp1, onp2;
17940   OrgRefPtr     orp1, orp2;
17941   SubSourcePtr  ssp1, ssp2;
17942   ValNodePtr    vnp1, vnp2;
17943 
17944   if (biop1 == NULL || biop2 == NULL) return FALSE;
17945 
17946   if (biop1->is_focus != biop2->is_focus) return FALSE;
17947 
17948   orp1 = biop1->org;
17949   orp2 = biop2->org;
17950   if (orp1 == NULL || orp2 == NULL) return FALSE;
17951   if (StringICmp (orp1->taxname, orp2->taxname) != 0) return FALSE;
17952 
17953   onp1 = orp1->orgname;
17954   onp2 = orp2->orgname;
17955   if (onp1 == NULL || onp2 == NULL) return FALSE;
17956 
17957   omp1 = onp1->mod;
17958   omp2 = onp2->mod;
17959   while (omp1 != NULL && omp2 != NULL) {
17960     if (omp1->subtype != omp2->subtype) return FALSE;
17961     if (StringICmp (omp1->subname, omp2->subname) != 0) return FALSE;
17962     omp1 = omp1->next;
17963     omp2 = omp2->next;
17964   }
17965   if (omp1 != NULL || omp2 != NULL) return FALSE;
17966 
17967   ssp1 = biop1->subtype;
17968   ssp2 = biop2->subtype;
17969   while (ssp1 != NULL && ssp2 != NULL) {
17970     if (ssp1->subtype != ssp2->subtype) return FALSE;
17971     if (StringICmp(ssp1->name, ssp2->name) != 0) return FALSE;
17972     ssp1 = ssp1->next;
17973     ssp2 = ssp2->next;
17974   }
17975   if (ssp1 != NULL || ssp2 != NULL) return FALSE;
17976 
17977   vnp1 = orp1->db;
17978   vnp2 = orp2->db;
17979   while (vnp1 != NULL && vnp2 != NULL) {
17980     dbt1 = (DbtagPtr) vnp1->data.ptrvalue;
17981     dbt2 = (DbtagPtr) vnp2->data.ptrvalue;
17982 
17983     if ((dbt1 != NULL) && (dbt2 != NULL)) {
17984       if (StringCmp (dbt1->db, dbt2->db) != 0) return FALSE;
17985 
17986       oip1 = dbt1->tag;
17987       oip2 = dbt2->tag;
17988       if ((oip1 != NULL) && (oip2 != NULL)) {
17989         if (oip1->str != NULL) {
17990           if (StringICmp(oip1->str, oip2->str) != 0) return FALSE;
17991         } else  {
17992           if (oip1->id != oip2->id) return FALSE;
17993         }
17994       }
17995       else if (oip1 != NULL)
17996         return FALSE;
17997       else if (oip2 != NULL)
17998         return FALSE;
17999     }
18000     else if (dbt1 != NULL)
18001       return FALSE;
18002     else if (dbt2 != NULL)
18003       return FALSE;
18004 
18005     vnp1 = vnp1->next;
18006     vnp2 = vnp2->next;
18007   }
18008   if (vnp1 != NULL || vnp2 != NULL) return FALSE;
18009 
18010   return TRUE;
18011 }
18012 
/*
 * LpData - accumulator passed as userdata to FindSingleMrnaProc while the
 * mRNA partners of a single CDS are being collected.
 */
typedef struct lpdata {
  Int2        count;            /* mRNAs seen whose scratch data is not yet flagged accounted_for */
  SeqFeatPtr  cds;              /* CDS being examined; the callback does nothing when this is NULL */
  SeqFeatPtr  mrna;             /* most recent mRNA that was counted */
  Char        firstid [64];     /* product id of the first mRNA with a product, written with PRINTID_FASTA_LONG */
  Boolean     products_unique;  /* cleared when a later mRNA product id equals firstid */
  Boolean     featid_matched;   /* set when IdXrefsAreReciprocal succeeds for cds and a counted mRNA */
} LpData, PNTR LpDataPtr;
18021 
IdXrefsAreReciprocal(SeqFeatPtr cds,SeqFeatPtr mrna)18022 static Boolean IdXrefsAreReciprocal (
18023   SeqFeatPtr cds,
18024   SeqFeatPtr mrna
18025 )
18026 
18027 {
18028   SeqFeatXrefPtr  xref;
18029   Boolean         match1 = FALSE, match2 = FALSE;
18030   SeqFeatPtr      matchsfp;
18031 
18032   if (cds == NULL || mrna == NULL) return FALSE;
18033   if (cds->id.choice != 3 || mrna->id.choice != 3) return FALSE;
18034 
18035   for (xref = cds->xref; xref != NULL; xref = xref->next) {
18036     if (xref->id.choice != 0) {
18037       matchsfp = SeqMgrGetFeatureByFeatID (cds->idx.entityID, NULL, NULL, xref, NULL);
18038       if (matchsfp == mrna) {
18039         match1 = TRUE;
18040       }
18041     }
18042   }
18043 
18044   for (xref = mrna->xref; xref != NULL; xref = xref->next) {
18045     if (xref->id.choice != 0) {
18046       matchsfp = SeqMgrGetFeatureByFeatID (mrna->idx.entityID, NULL, NULL, xref, NULL);
18047       if (matchsfp == cds) {
18048         match2 = TRUE;
18049       }
18050     }
18051   }
18052 
18053   if (match1 && match2) return TRUE;
18054   return FALSE;
18055 }
18056 
IdXrefsNotReciprocal(SeqFeatPtr cds,SeqFeatPtr mrna)18057 static Int2 IdXrefsNotReciprocal (
18058   SeqFeatPtr cds,
18059   SeqFeatPtr mrna
18060 )
18061 
18062 {
18063   BIG_ID          giu = 0, gip = 0;
18064   SeqFeatPtr      matchsfp;
18065   ObjectIdPtr     oip;
18066   SeqIdPtr        sip;
18067   CharPtr         tmp;
18068   UserFieldPtr    ufp;
18069   UserObjectPtr   uop;
18070   SeqFeatXrefPtr  xref;
18071 
18072   if (cds == NULL || mrna == NULL) return 0;
18073   if (cds->id.choice != 3 || mrna->id.choice != 3) return 0;
18074 
18075   for (xref = cds->xref; xref != NULL; xref = xref->next) {
18076     if (xref->id.choice != 0) {
18077       matchsfp = SeqMgrGetFeatureByFeatID (cds->idx.entityID, NULL, NULL, xref, NULL);
18078       if (matchsfp != NULL && matchsfp->idx.subtype == FEATDEF_mRNA && matchsfp != mrna) {
18079         return 1;
18080       }
18081     }
18082   }
18083 
18084   for (xref = mrna->xref; xref != NULL; xref = xref->next) {
18085     if (xref->id.choice != 0) {
18086       matchsfp = SeqMgrGetFeatureByFeatID (mrna->idx.entityID, NULL, NULL, xref, NULL);
18087       if (matchsfp != NULL && matchsfp->idx.subtype == FEATDEF_CDS && matchsfp != cds) {
18088         return 1;
18089       }
18090     }
18091   }
18092 
18093   if (cds->product == NULL) return 0;
18094   if (mrna->ext == NULL) return 0;
18095   uop = FindUopByTag (mrna->ext, "MrnaProteinLink");
18096   if (uop == NULL) return 0;
18097   sip = SeqLocId (cds->product);
18098   if (sip == NULL) return 0;
18099   if (sip->choice == SEQID_GI) {
18100     gip = (BIG_ID) sip->data.intvalue;
18101   } else {
18102     gip = GetGIForSeqId (sip);
18103   }
18104   if (gip == 0) return 0;
18105   ufp = uop->data;
18106   if (ufp == NULL || ufp->choice != 1) return 0;
18107   oip = ufp->label;
18108   if (oip == NULL || StringICmp (oip->str, "protein seqID") != 0) return 0;
18109   tmp = (CharPtr) ufp->data.ptrvalue;
18110   if (StringHasNoText (tmp)) return 0;
18111   sip = MakeSeqID (tmp);
18112   if (sip == NULL) return 0;
18113   if (sip->choice == SEQID_GI) {
18114     giu = (BIG_ID) sip->data.intvalue;
18115   } else {
18116     giu = GetGIForSeqId (sip);
18117   }
18118   SeqIdFree (sip);
18119   if (giu == 0) return 0;
18120   if (gip != giu) return 2;
18121 
18122   return 0;
18123 }
18124 
FindSingleMrnaProc(SeqFeatPtr sfp,SeqMgrFeatContextPtr context)18125 static Boolean LIBCALLBACK FindSingleMrnaProc (
18126   SeqFeatPtr sfp,
18127   SeqMgrFeatContextPtr context
18128 )
18129 
18130 {
18131   Char        buf [64];
18132   SeqFeatPtr  cds;
18133   LpDataPtr   ldp;
18134   SeqIdPtr    sip;
18135   VvmDataPtr  vdp;
18136 
18137   ldp = (LpDataPtr) context->userdata;
18138   if (ldp == NULL) return TRUE;
18139   cds = ldp->cds;
18140   if (cds == NULL) return TRUE;
18141 
18142   if (sfp->product) {
18143     if (StringHasNoText (ldp->firstid)) {
18144       sip = SeqLocId (sfp->product);
18145       SeqIdWrite (sip, ldp->firstid, PRINTID_FASTA_LONG, sizeof (ldp->firstid) - 1);
18146     } else {
18147       sip = SeqLocId (sfp->product);
18148       SeqIdWrite (sip, buf, PRINTID_FASTA_LONG, sizeof (buf) - 1);
18149       if (StringCmp (ldp->firstid, buf) == 0) {
18150         ldp->products_unique = FALSE;
18151       }
18152     }
18153   }
18154 
18155   vdp = (VvmDataPtr) sfp->idx.scratch;
18156   if (vdp != NULL && vdp->accounted_for) return TRUE;
18157 
18158   (ldp->count)++;
18159   ldp->mrna = sfp;
18160 
18161   if (IdXrefsAreReciprocal (cds, sfp)) {
18162     ldp->featid_matched = TRUE;
18163   }
18164 
18165   return TRUE;
18166 }
18167 
18168 
MarkMrnasFromCDSXrefs(SeqFeatPtr cds,LpDataPtr lp)18169 static Boolean MarkMrnasFromCDSXrefs (SeqFeatPtr cds, LpDataPtr lp)
18170 {
18171   SeqFeatXrefPtr     xref;
18172   Boolean            has_xref = FALSE;
18173   BioseqExtraPtr     bspextra;
18174   SMFidItemPtr PNTR  array;
18175   Char               buf [32];
18176   CharPtr            featid = NULL;
18177   SMFeatItemPtr      feat;
18178   SMFidItemPtr       item;
18179   Int4               L;
18180   Int4               mid;
18181   Int4               num;
18182   ObjectIdPtr        oip;
18183   ObjMgrDataPtr      omdp;
18184   Int4               R;
18185   SeqFeatPtr         sfp;
18186   SeqMgrFeatContext  context;
18187 
18188   if (cds == NULL) {
18189     return FALSE;
18190   }
18191 
18192   omdp = ObjMgrGetData (cds->idx.entityID);
18193   if (omdp == NULL) {
18194     return FALSE;
18195   }
18196 
18197   bspextra = (BioseqExtraPtr) omdp->extradata;
18198   if (bspextra == NULL) return FALSE;
18199   array = bspextra->featsByFeatID;
18200   num = bspextra->numfids;
18201   if (array == NULL || num < 1) return FALSE;
18202 
18203   context.userdata = lp;
18204 
18205   for (xref = cds->xref; xref != NULL; xref = xref->next) {
18206     if (xref != NULL && xref->id.choice == 3) {
18207       featid = NULL;
18208       oip = (ObjectIdPtr) xref->id.value.ptrvalue;
18209       if (oip != NULL) {
18210         if (StringDoesHaveText (oip->str)) {
18211           featid = oip->str;
18212         } else {
18213           sprintf (buf, "%ld", (long) oip->id);
18214           featid = buf;
18215         }
18216       }
18217       if (StringHasNoText (featid)) continue;
18218 
18219       L = 0;
18220       R = num - 1;
18221       while (L < R) {
18222         mid = (L + R) / 2;
18223         item = array [mid];
18224         if (item != NULL && StringICmp (item->fid, featid) < 0) {
18225           L = mid + 1;
18226         } else {
18227           R = mid;
18228         }
18229       }
18230 
18231       while (R > 0 && StringICmp (array[R - 1]->fid, featid) == 0) {
18232         R--;
18233       }
18234 
18235       while (R < num && StringICmp (array[R]->fid, featid) == 0) {
18236         item = array [R];
18237         feat = item->feat;
18238         if (feat != NULL
18239             && !feat->ignore
18240             && feat->sfp != NULL
18241             && feat->sfp->idx.subtype == FEATDEF_mRNA
18242             && IdXrefsAreReciprocal(cds, feat->sfp)) {
18243           has_xref = TRUE;
18244           sfp = feat->sfp;
18245           context.entityID = sfp->idx.entityID;
18246           context.itemID = feat->itemID;
18247           context.sfp = sfp;
18248           context.sap = feat->sap;
18249           context.bsp = feat->bsp;
18250           context.label = feat->label;
18251           context.left = feat->left;
18252           context.right = feat->right;
18253           context.dnaStop = feat->dnaStop;
18254           context.partialL = feat->partialL;
18255           context.partialR = feat->partialR;
18256           context.farloc = feat->farloc;
18257           context.bad_order = feat->bad_order;
18258           context.mixed_strand = feat->mixed_strand;
18259           context.strand = feat->strand;
18260           context.seqfeattype = sfp->data.choice;
18261           context.featdeftype = feat->subtype;
18262           context.numivals = feat->numivals;
18263           context.ivals = feat->ivals;
18264           context.omdp = (Pointer) omdp;
18265           context.index = R + 1;
18266           FindSingleMrnaProc (sfp, &context);
18267         }
18268         R++;
18269       }
18270     }
18271   }
18272   return has_xref;
18273 }
18274 
18275 
18276 /*
18277 static Boolean LIBCALLBACK DummyCM121Proc (
18278   SeqFeatPtr sfp,
18279   SeqMgrFeatContextPtr context
18280 )
18281 
18282 {
18283   return TRUE;
18284 }
18285 */
18286 
IsTransposonOrRetro(SeqFeatPtr mbl_element)18287 static Boolean IsTransposonOrRetro (
18288   SeqFeatPtr mbl_element
18289 )
18290 
18291 {
18292   GBQualPtr  gbq;
18293 
18294   if (mbl_element == NULL) return FALSE;
18295 
18296   for (gbq = mbl_element->qual; gbq != NULL; gbq = gbq->next) {
18297     if (StringICmp (gbq->qual, "mobile_element_type") != 0) continue;
18298     if (StringNICmp (gbq->val, "transposon", 10) == 0) return TRUE;
18299     if (StringNICmp (gbq->val, "retrotransposon", 15) == 0) return TRUE;
18300   }
18301 
18302   return FALSE;
18303 }
18304 
IsSGDTransposonOrRetro(SeqFeatPtr cds)18305 static Boolean IsSGDTransposonOrRetro (
18306   SeqFeatPtr cds
18307 )
18308 
18309 {
18310   DbtagPtr    dbt;
18311   ValNodePtr  vnp;
18312 
18313   if (cds == NULL) return FALSE;
18314   if (StringHasNoText (cds->comment)) return FALSE;
18315 
18316   for (vnp = cds->dbxref; vnp != NULL; vnp = vnp->next) {
18317     dbt = (DbtagPtr) vnp->data.ptrvalue;
18318     if (dbt == NULL) continue;
18319     if (StringCmp (dbt->db, "SGD") != 0) continue;
18320     if (StringISearch (cds->comment, "transposon") != NULL) return TRUE;
18321   }
18322 
18323   return FALSE;
18324 }
18325 
ValidateCDSmRNAmatch(ValidStructPtr vsp,BioseqPtr bsp,Int2 numgene,Int2 numcds,Int2 nummrna)18326 static void ValidateCDSmRNAmatch (
18327   ValidStructPtr vsp,
18328   BioseqPtr bsp,
18329   Int2 numgene,
18330   Int2 numcds,
18331   Int2 nummrna
18332 )
18333 
18334 {
18335   BioSourcePtr       biop;
18336   ValNodePtr         cdshead = NULL;
18337   ValNodePtr         cdstail = NULL;
18338   SeqMgrDescContext  dcontext;
18339   SeqMgrFeatContext  fcontext, rcontext;
18340   GatherContextPtr   gcp;
18341   GmcDataPtr         gdp, head;
18342   SeqFeatPtr         gene;
18343   Boolean            goOn, pseudo, suppressed;
18344   GeneRefPtr         grp;
18345   Int2               i, j, k, numfeats, tmpnumcds, tmpnummrna, count;
18346   Boolean            is_genbank = FALSE;
18347   LpData             ld;
18348   SeqFeatPtr         mbl_element, rpt_region;
18349   VoidPtr            mobile_element_array, repeat_region_array;
18350   Int4               num_mobile_elements, num_repeat_regions;
18351   Int2               num_no_mrna = 0;
18352   Uint2              olditemtype = 0;
18353   Uint4              olditemid = 0;
18354   OrgNamePtr         onp;
18355   OrgRefPtr          orp;
18356   Int2               recip;
18357   SeqDescrPtr        sdp;
18358   ErrSev             sev = /* SEV_INFO */ SEV_WARNING;
18359   SeqFeatPtr         sfp;
18360   SeqIdPtr           sip;
18361   VvmDataPtr         vdp;
18362   ValNodePtr         vnp;
18363 
18364   if (vsp == NULL || bsp == NULL) return;
18365   if (! ISA_na (bsp->mol)) return;
18366 
18367   gcp = vsp->gcp;
18368   if (gcp != NULL) {
18369     olditemid = gcp->itemID;
18370     olditemtype = gcp->thistype;
18371   }
18372 
18373   /*
18374   if (GetAppProperty ("ValidateCDSmRNAoneToOne") != NULL) {
18375     cdsMrnaOneToOne = TRUE;
18376   }
18377   */
18378 
18379   for (sip = bsp->id; sip != NULL; sip = sip->next) {
18380     if (sip->choice == SEQID_OTHER) {
18381       sev = SEV_WARNING;
18382     } else if (sip->choice == SEQID_GENBANK) {
18383       is_genbank = TRUE;
18384     }
18385   }
18386 
18387   if (is_genbank) {
18388     sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
18389     if (sdp != NULL) {
18390       biop = (BioSourcePtr) sdp->data.ptrvalue;
18391       if (biop != NULL) {
18392         orp = biop->org;
18393         if (orp != NULL) {
18394           onp = orp->orgname;
18395           if (onp != NULL) {
18396             if (StringDoesHaveText (onp->div) &&
18397                 StringCmp (onp->div, "BCT") != 0 &&
18398                 StringCmp (onp->div, "VRL") != 0) {
18399               is_genbank = FALSE;
18400             }
18401           }
18402         }
18403       }
18404     }
18405   }
18406 
18407   repeat_region_array = SeqMgrBuildFeatureIndex (bsp, &num_repeat_regions, 0, FEATDEF_repeat_region);
18408   mobile_element_array = SeqMgrBuildFeatureIndex (bsp, &num_mobile_elements, 0, FEATDEF_mobile_element);
18409 
18410   if (numgene > 0 && numcds > 0 && nummrna > 0) {
18411     numfeats = numcds + nummrna;
18412     head = (GmcDataPtr) MemNew (sizeof (GmcData) * (size_t) (numfeats + 1));
18413     if (head != NULL) {
18414       gdp = head;
18415       sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
18416       while (sfp != NULL) {
18417         if (sfp->idx.subtype == FEATDEF_CDS || sfp->idx.subtype == FEATDEF_mRNA) {
18418           gdp->feat = sfp;
18419           grp = SeqMgrGetGeneXref (sfp);
18420           if (grp == NULL) {
18421             gdp->gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
18422           } else if (! SeqMgrGeneIsSuppressed (grp)) {
18423             if (StringDoesHaveText (grp->locus_tag)) {
18424               gdp->gene = SeqMgrGetGeneByLocusTag (bsp, grp->locus_tag, NULL);
18425             } else if (StringDoesHaveText (grp->locus)) {
18426               gdp->gene = SeqMgrGetFeatureByLabel (bsp, grp->locus, SEQFEAT_GENE, 0, NULL);
18427             }
18428           }
18429           gdp++;
18430         }
18431         sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
18432       }
18433       HeapSort (head, (size_t) numfeats, sizeof (GmcData), SortGmcByGenePtr);
18434       for (i = 0; i < numfeats; i += j) {
18435         gene = head [i].gene;
18436         for (j = 1; i + j < numfeats && gene == head [i + j].gene; j++) continue;
18437         if (j > 1 && gene != NULL) {
18438           /* is alt splicing */
18439           tmpnumcds = 0;
18440           tmpnummrna = 0;
18441           for (k = 0; k < j; k++) {
18442             sfp = head [i + k].feat;
18443             if (sfp == NULL) continue;
18444             if (sfp->idx.subtype == FEATDEF_CDS) {
18445               tmpnumcds++;
18446             }
18447             if (sfp->idx.subtype == FEATDEF_mRNA) {
18448               tmpnummrna++;
18449             }
18450           }
18451           if (tmpnumcds > 0 && tmpnummrna > 1 && tmpnumcds != tmpnummrna && (! is_genbank)) {
18452 
18453             if (gcp != NULL) {
18454               gcp->itemID = gene->idx.itemID;
18455               gcp->thistype = OBJ_SEQFEAT;
18456             }
18457             vsp->descr = NULL;
18458             vsp->sfp = gene;
18459             ValidErr (vsp, sev, ERR_SEQ_FEAT_CDSmRNAmismatch, "mRNA count (%d) does not match CDS (%d) count for gene",
18460                       (int) tmpnummrna, (int) tmpnumcds);
18461           }
18462         }
18463       }
18464     }
18465     MemFree (head);
18466   }
18467 
18468   /* loop through CDS features, finding single unused mRNA partner */
18469 
18470   goOn = TRUE;
18471   while (goOn) {
18472     goOn = FALSE;
18473     sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &fcontext);
18474     while (sfp != NULL) {
18475       vdp = (VvmDataPtr) sfp->idx.scratch;
18476       if (vdp != NULL && (! vdp->accounted_for)) {
18477         vdp->num_mrnas = 0;
18478         ld.count = 0;
18479         ld.cds = sfp;
18480         ld.mrna = NULL;
18481         ld.firstid [0] = '\0';
18482         ld.products_unique = TRUE;
18483         ld.featid_matched = FALSE;
18484 
18485         if (sfp->excpt &&
18486           (StringISearch (sfp->except_text, "ribosomal slippage") != NULL ||
18487             StringISearch (sfp->except_text, "trans-splicing") != NULL)) {
18488           count = SeqMgrGetAllOverlappingFeatures (sfp->location, FEATDEF_mRNA, NULL, 0,
18489                                                   LOCATION_SUBSET, (Pointer) &ld, FindSingleMrnaProc);
18490         } else {
18491           count = SeqMgrGetAllOverlappingFeatures (sfp->location, FEATDEF_mRNA, NULL, 0,
18492                                                   CHECK_INTERVALS, (Pointer) &ld, FindSingleMrnaProc);
18493         }
18494 
18495         if (!ld.featid_matched) {
18496           MarkMrnasFromCDSXrefs (sfp, &ld);
18497         }
18498 
18499         if (ld.count == 1 && ld.mrna != NULL) {
18500           vdp->accounted_for = TRUE;
18501           vdp->num_mrnas = ld.count;
18502           vdp->featid_matched = ld.featid_matched;
18503           vdp = (VvmDataPtr) ld.mrna->idx.scratch;
18504           if (vdp != NULL) {
18505             vdp->accounted_for = TRUE;
18506             goOn = TRUE;
18507             recip = IdXrefsNotReciprocal (sfp, ld.mrna);
18508             if (recip == 1) {
18509               if (gcp != NULL) {
18510                 gcp->itemID = sfp->idx.itemID;
18511                 gcp->thistype = OBJ_SEQFEAT;
18512               }
18513               vsp->descr = NULL;
18514               vsp->sfp = sfp;
18515               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_SeqFeatXrefNotReciprocal, "CDS/mRNA unambiguous pair have erroneous cross-references");
18516             } else if (recip == 2) {
18517               if (gcp != NULL) {
18518                 gcp->itemID = ld.mrna->idx.itemID;
18519                 gcp->thistype = OBJ_SEQFEAT;
18520               }
18521               vsp->descr = NULL;
18522               vsp->sfp = ld.mrna;
18523               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_SeqFeatXrefProblem, "MrnaProteinLink inconsistent with feature ID cross-references");
18524             }
18525           }
18526           if (SeqLocAinB (sfp->location, ld.mrna->location) < 0) {
18527             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_CDSmRNAXrefLocationProblem, "CDS not contained within cross-referenced mRNA");
18528           }
18529         } else {
18530           vdp->num_mrnas = ld.count;
18531           vdp->products_unique = ld.products_unique;
18532           vdp->featid_matched = ld.featid_matched;
18533         }
18534       }
18535       sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_CDREGION, 0, &fcontext);
18536     }
18537   }
18538 
18539   sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &fcontext);
18540   while (sfp != NULL && (! is_genbank)) {
18541     vdp = (VvmDataPtr) sfp->idx.scratch;
18542     if (vdp != NULL) {
18543       count = vdp->num_mrnas;
18544       /*
18545       count = SeqMgrGetAllOverlappingFeatures (sfp->location, FEATDEF_mRNA, NULL, 0,
18546                                                  CHECK_INTERVALS, NULL, DummyCM121Proc);
18547       */
18548       if (count > 1) {
18549         if (gcp != NULL) {
18550           gcp->itemID = sfp->idx.itemID;
18551           gcp->thistype = OBJ_SEQFEAT;
18552         }
18553         vsp->descr = NULL;
18554         vsp->sfp = sfp;
18555         if (vdp->featid_matched) {
18556           /* presence of reciprocal link suppresses warnings */
18557         } else if (vdp->products_unique) {
18558           ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_CDSwithMultipleMRNAs,
18559                     "CDS overlapped by %d mRNAs, but product locations are unique", (int) count);
18560         } else {
18561           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_CDSwithMultipleMRNAs, "CDS overlapped by %d mRNAs", (int) count);
18562         }
18563       } else if (count == 0 && numgene > 0 && numcds > 0 && nummrna > 0) {
18564         grp = GetGeneByFeat (sfp, &pseudo, &suppressed);
18565         if (! pseudo) {
18566           rpt_region = SeqMgrGetOverlappingFeature (sfp->location, 0, repeat_region_array, num_repeat_regions,
18567                                                     NULL, CONTAINED_WITHIN, &rcontext);
18568           mbl_element = SeqMgrGetOverlappingFeature (sfp->location, 0, mobile_element_array, num_mobile_elements,
18569                                                     NULL, CONTAINED_WITHIN, &rcontext);
18570           if (rpt_region == NULL && (! IsTransposonOrRetro (mbl_element)) && (! IsSGDTransposonOrRetro (sfp))) {
18571             if (StringStr (sfp->except_text, "rearrangement required for product") == NULL) {
18572               /*
18573               if (gcp != NULL) {
18574                 gcp->itemID = sfp->idx.itemID;
18575                 gcp->thistype = OBJ_SEQFEAT;
18576               }
18577               vsp->descr = NULL;
18578               vsp->sfp = sfp;
18579               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_CDSwithNoMRNAOverlap, "CDS overlapped by 0 mRNAs");
18580               */
18581               vnp = ValNodeAddPointer (&cdstail, 0, (Pointer) sfp);
18582               if (cdshead == NULL) {
18583                 cdshead = vnp;
18584               }
18585               cdstail = vnp;
18586               num_no_mrna++;
18587             }
18588           }
18589         }
18590       }
18591     }
18592     sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_CDREGION, 0, &fcontext);
18593   }
18594 
18595   MemFree (repeat_region_array);
18596   MemFree (mobile_element_array);
18597 
18598   if (num_no_mrna > 0) {
18599     if (num_no_mrna >= 10) {
18600       if (gcp != NULL) {
18601         gcp->itemID = olditemid;
18602         gcp->thistype = olditemtype;
18603       }
18604       vsp->descr = NULL;
18605       vsp->sfp = NULL;
18606       vsp->bsp = bsp;
18607       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_CDSwithNoMRNAOverlap,
18608                 "%d out of %d CDSs overlapped by 0 mRNAs", (int) num_no_mrna, (int) numcds);
18609     } else {
18610       for (vnp = cdshead; vnp != NULL; vnp = vnp->next) {
18611         sfp = (SeqFeatPtr) vnp->data.ptrvalue;
18612         if (sfp == NULL) continue;
18613         if (gcp != NULL) {
18614           gcp->itemID = sfp->idx.itemID;
18615           gcp->thistype = OBJ_SEQFEAT;
18616         }
18617         vsp->descr = NULL;
18618         vsp->sfp = sfp;
18619         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_CDSwithNoMRNAOverlap, "CDS overlapped by 0 mRNAs");
18620       }
18621     }
18622   }
18623 
18624   ValNodeFree (cdshead);
18625 
18626   if (gcp != NULL) {
18627     gcp->itemID = olditemid;
18628     gcp->thistype = olditemtype;
18629   }
18630 }
18631 
HaveUniqueFeatIDXrefs(SeqFeatXrefPtr xref1,SeqFeatXrefPtr xref2)18632 static Boolean HaveUniqueFeatIDXrefs (SeqFeatXrefPtr xref1, SeqFeatXrefPtr xref2)
18633 
18634 {
18635   ObjectIdPtr  oip1 = NULL, oip2 = NULL;
18636 
18637   while (xref1 != NULL) {
18638     if (xref1->id.choice == 3) {
18639       oip1 = (ObjectIdPtr) xref1->id.value.ptrvalue;
18640     }
18641     xref1 = xref1->next;
18642   }
18643 
18644   while (xref2 != NULL) {
18645     if (xref2->id.choice == 3) {
18646       oip2 = (ObjectIdPtr) xref2->id.value.ptrvalue;
18647     }
18648     xref2 = xref2->next;
18649   }
18650 
18651   if (oip1 == NULL || oip2 == NULL) return FALSE;
18652   if (oip1->str == NULL && oip2->str == NULL) {
18653     if (oip1->id != oip2->id && oip1->id > 0 && oip2->id > 0) return TRUE;
18654   }
18655 
18656   return FALSE;
18657 }
18658 
/*
 * Region codes returned by WhichRNA for rRNA and misc_RNA features.
 * WhichRNA returns 0 when a feature does not map to any of these.
 * INTERNAL_SPACER_X is used for spacer names that do not say whether
 * they are spacer 1 or spacer 2.
 */
#define LEFT_RIBOSOMAL_SUBUNIT  1
#define INTERNAL_SPACER_1        2
#define MIDDLE_RIBOSOMAL_SUBUNIT 3
#define INTERNAL_SPACER_2        4
#define RIGHT_RIBOSOMAL_SUBUNIT  5
#define INTERNAL_SPACER_X        6
18665 
WhichRNA(SeqFeatPtr sfp)18666 static Int2 WhichRNA (SeqFeatPtr sfp)
18667 
18668 {
18669   RnaRefPtr  rrp;
18670   CharPtr    str;
18671 
18672   if (sfp == NULL || sfp->data.choice != SEQFEAT_RNA) return 0;
18673   rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
18674   if (rrp == NULL) return 0;
18675   str = GetRNARefProductString (rrp, NULL);
18676   if (StringHasNoText (str)) return 0;
18677   if (rrp->type == RNA_TYPE_rRNA) {
18678     if (StringNICmp (str, "small ", 6) == 0) return LEFT_RIBOSOMAL_SUBUNIT;
18679     if (StringNICmp (str, "18S ", 4) == 0) return LEFT_RIBOSOMAL_SUBUNIT;
18680     if (StringNICmp (str, "16S ", 4) == 0) return LEFT_RIBOSOMAL_SUBUNIT;
18681     if (StringNICmp (str, "5.8S ", 5) == 0) return MIDDLE_RIBOSOMAL_SUBUNIT;
18682     if (StringNICmp (str, "large ", 6) == 0) return RIGHT_RIBOSOMAL_SUBUNIT;
18683     if (StringNICmp (str, "26S ", 4) == 0) return RIGHT_RIBOSOMAL_SUBUNIT;
18684     if (StringNICmp (str, "28S ", 4) == 0) return RIGHT_RIBOSOMAL_SUBUNIT;
18685     if (StringNICmp (str, "23S ", 4) == 0) return RIGHT_RIBOSOMAL_SUBUNIT;
18686     /* variant spellings */
18687     if (StringNICmp (str, "18 ", 3) == 0) return LEFT_RIBOSOMAL_SUBUNIT;
18688     if (StringNICmp (str, "16 ", 3) == 0) return LEFT_RIBOSOMAL_SUBUNIT;
18689     if (StringNICmp (str, "5.8 ", 4) == 0) return MIDDLE_RIBOSOMAL_SUBUNIT;
18690     if (StringNICmp (str, "26 ", 3) == 0) return RIGHT_RIBOSOMAL_SUBUNIT;
18691     if (StringNICmp (str, "28 ", 3) == 0) return RIGHT_RIBOSOMAL_SUBUNIT;
18692     if (StringNICmp (str, "23 ", 3) == 0) return RIGHT_RIBOSOMAL_SUBUNIT;
18693   }
18694   if (rrp->type == RNA_TYPE_misc_RNA) {
18695     if (StringICmp (str, "internal transcribed spacer 1") == 0) return INTERNAL_SPACER_1;
18696     if (StringICmp (str, "internal transcribed spacer 2") == 0) return INTERNAL_SPACER_2;
18697     /* variant spellings */
18698     if (StringICmp (str, "internal transcribed spacer1") == 0) return INTERNAL_SPACER_1;
18699     if (StringICmp (str, "internal transcribed spacer2") == 0) return INTERNAL_SPACER_2;
18700     if (StringICmp (str, "internal transcribed spacer") == 0) return INTERNAL_SPACER_X;
18701     if (StringICmp (str, "ITS") == 0) return INTERNAL_SPACER_X;
18702     if (StringICmp (str, "16S-23S ribosomal RNA intergenic spacer") == 0) return INTERNAL_SPACER_X;
18703     if (StringICmp (str, "16S-23S intergenic spacer") == 0) return INTERNAL_SPACER_X;
18704     if (StringICmp (str, "intergenic spacer") == 0) return INTERNAL_SPACER_X;
18705   }
18706   return 0;
18707 }
18708 
CDSsLinkedToDifferentMRNAs(SeqFeatPtr sfp,SeqFeatPtr last)18709 static Boolean CDSsLinkedToDifferentMRNAs (SeqFeatPtr sfp, SeqFeatPtr last)
18710 
18711 {
18712   SeqFeatPtr      mrna1 = NULL, mrna2 = NULL;
18713   SeqFeatXrefPtr  xref;
18714 
18715   if (sfp == NULL || last == NULL) return FALSE;
18716   if (sfp->idx.subtype != FEATDEF_CDS || last->idx.subtype != FEATDEF_CDS) return FALSE;
18717 
18718   for (xref = sfp->xref; xref != NULL && mrna1 == NULL; xref = xref->next) {
18719     if (xref->id.choice != 0) {
18720       mrna1 = SeqMgrGetFeatureByFeatID (sfp->idx.entityID, NULL, NULL, xref, NULL);
18721       if (mrna1 != NULL && mrna1->idx.subtype != FEATDEF_mRNA) {
18722         mrna1 = NULL;
18723       }
18724     }
18725   }
18726 
18727   for (xref = last->xref; xref != NULL && mrna2 == NULL; xref = xref->next) {
18728     if (xref->id.choice != 0) {
18729       mrna2 = SeqMgrGetFeatureByFeatID (last->idx.entityID, NULL, NULL, xref, NULL);
18730       if (mrna2 != NULL && mrna2->idx.subtype != FEATDEF_mRNA) {
18731         mrna2 = NULL;
18732       }
18733     }
18734   }
18735 
18736   if (mrna1 != NULL && mrna2 != NULL && mrna1 != mrna2) return TRUE;
18737 
18738   return FALSE;
18739 }
18740 
MRNAsLinkedToDifferentCDSs(SeqFeatPtr sfp,SeqFeatPtr last)18741 static Boolean MRNAsLinkedToDifferentCDSs (SeqFeatPtr sfp, SeqFeatPtr last)
18742 
18743 {
18744   SeqFeatPtr      cds1 = NULL, cds2 = NULL;
18745   CdRegionPtr     crp1, crp2;
18746   SeqFeatXrefPtr  xref;
18747 
18748   if (sfp == NULL || last == NULL) return FALSE;
18749   if (sfp->idx.subtype != FEATDEF_mRNA || last->idx.subtype != FEATDEF_mRNA) return FALSE;
18750 
18751   for (xref = sfp->xref; xref != NULL && cds1 == NULL; xref = xref->next) {
18752     if (xref->id.choice != 0) {
18753       cds1 = SeqMgrGetFeatureByFeatID (sfp->idx.entityID, NULL, NULL, xref, NULL);
18754       if (cds1 != NULL && cds1->idx.subtype != FEATDEF_CDS) {
18755         cds1 = NULL;
18756       }
18757     }
18758   }
18759 
18760   for (xref = last->xref; xref != NULL && cds2 == NULL; xref = xref->next) {
18761     if (xref->id.choice != 0) {
18762       cds2 = SeqMgrGetFeatureByFeatID (last->idx.entityID, NULL, NULL, xref, NULL);
18763       if (cds2 != NULL && cds2->idx.subtype != FEATDEF_CDS) {
18764         cds2 = NULL;
18765       }
18766     }
18767   }
18768 
18769   if (cds1 == NULL || cds2 == NULL || cds1 == cds2) return FALSE;
18770 
18771   crp1 = (CdRegionPtr) cds1->data.value.ptrvalue;
18772   crp2 = (CdRegionPtr) cds2->data.value.ptrvalue;
18773   if (crp1 == NULL || crp2 == NULL) return FALSE;
18774 
18775   if (SeqLocCompare (cds1->location, cds2->location) != SLC_A_EQ_B) return TRUE;
18776 
18777   if (crp1->frame < 2 && crp2->frame < 2) return FALSE;
18778   if (crp1->frame != crp2->frame) return TRUE;
18779 
18780   return FALSE;
18781 }
18782 
BaseRangeIsVirtual(BioseqPtr bsp,Int4 left,Int4 right)18783 static Boolean BaseRangeIsVirtual (BioseqPtr bsp, Int4 left, Int4 right)
18784 
18785 {
18786   Uint1        res;
18787   StreamCache  sc;
18788 
18789   if (! StreamCacheSetup (bsp, NULL, EXPAND_GAPS_TO_DASHES, &sc)) return FALSE;
18790 
18791   StreamCacheSetPosition (&sc, left - 1);
18792   res = StreamCacheGetResidue (&sc);
18793   if (res == '-') return FALSE;
18794   res = StreamCacheGetResidue (&sc);
18795   if (res != '-') return FALSE;
18796 
18797   StreamCacheSetPosition (&sc, right - 1);
18798   res = StreamCacheGetResidue (&sc);
18799   if (res != '-') return FALSE;
18800   res = StreamCacheGetResidue (&sc);
18801   if (res == '-') return FALSE;
18802 
18803   return TRUE;
18804 }
18805 
18806 
IsAllNs(SeqLocPtr slp)18807 static Boolean IsAllNs (SeqLocPtr slp)
18808 {
18809   Boolean rval = TRUE;
18810   ErrSev            logsev;
18811   ErrSev            msgsev;
18812 
18813   msgsev = ErrSetMessageLevel (SEV_MAX);
18814   logsev = ErrSetLogLevel (SEV_MAX);
18815   SeqPortStreamLoc (slp, STREAM_EXPAND_GAPS, (Pointer) &rval, IsAllNsProc);
18816   ErrSetLogLevel (logsev);
18817   ErrSetMessageLevel (msgsev);
18818   return rval;
18819 }
18820 
18821 
18822 /* Pass in DeltaSeqPtr for start of loc and offset */
DoesLocIntersectGapOfUnknownLength(SeqLocPtr slp,DeltaSeqPtr dsp,Int4 dsp_start)18823 static Boolean DoesLocIntersectGapOfUnknownLength (SeqLocPtr slp, DeltaSeqPtr dsp, Int4 dsp_start)
18824 {
18825   Int4 stop;
18826   Int4 offset = dsp_start;
18827 
18828   if (dsp == NULL) {
18829     return FALSE;
18830   }
18831 
18832   stop = SeqLocStop (slp);
18833   while (dsp != NULL && offset < stop) {
18834     if (IsDeltaSeqUnknownGap(dsp)) {
18835       return TRUE;
18836     }
18837     offset += GetDeltaSeqLen(dsp);
18838     dsp = dsp->next;
18839   }
18840   return FALSE;
18841 }
18842 
18843 
GetFeatsInGaps(SeqFeatPtr sfp,SeqMgrFeatContextPtr fcontext)18844 static Boolean LIBCALLBACK GetFeatsInGaps (
18845   SeqFeatPtr sfp,
18846   SeqMgrFeatContextPtr fcontext
18847 )
18848 
18849 {
18850   BioseqPtr         bsp;
18851   Int4              dashes;
18852   GatherContextPtr  gcp;
18853   SeqLocPtr         loc;
18854   Int4              Ns;
18855   Uint2             olditemtype = 0;
18856   Uint4             olditemid = 0;
18857   Int4              plusses;
18858   Int4              realBases;
18859   SeqLocPtr         slp;
18860   Boolean           startsOrEndsInGap = FALSE;
18861   ValidStructPtr    vsp;
18862   DeltaSeqPtr       dsp;
18863   Int4              dsp_start;
18864   Int4              start, stop;
18865 
18866   if (sfp == NULL || fcontext == NULL) return FALSE;
18867   vsp = (ValidStructPtr) fcontext->userdata;
18868   if (vsp == NULL) return FALSE;
18869   gcp = vsp->gcp;
18870   if (gcp == NULL) return FALSE;
18871 
18872   if (sfp->idx.subtype == FEATDEF_gap) return TRUE;
18873   loc = sfp->location;
18874   if (loc == NULL) return TRUE;
18875 
18876   olditemid = gcp->itemID;
18877   olditemtype = gcp->thistype;
18878 
18879   gcp->itemID = fcontext->itemID;
18880   gcp->thistype = OBJ_SEQFEAT;
18881   vsp->sfp = sfp;
18882 
18883 
18884   dashes = 0;
18885   plusses = 0;
18886   Ns = 0;
18887   realBases = 0;
18888 
18889   bsp = BioseqFindFromSeqLoc (loc);
18890 
18891   start = SeqLocStart(loc);
18892   stop = SeqLocStop(loc);
18893   dsp = GetDeltaSeqForPosition(start, bsp, &dsp_start);
18894   /* special check for single interval misc_features that may exactly cover a gap */
18895   if (loc->choice == SEQLOC_INT && sfp->idx.subtype == FEATDEF_misc_feature
18896       && dsp != NULL
18897       && dsp_start == start && SeqLocLen(loc) == GetDeltaSeqLen(dsp)) {
18898     /* single interval misc_feature covers a delta segment exactly, can ignore it */
18899     return TRUE;
18900   }
18901 
18902   if (SeqLocLen (loc) >= 50 && IsAllNs(loc)) {
18903     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureInsideGap, "Feature inside gap of Ns");
18904     return TRUE;
18905   }
18906 
18907   if (dsp == NULL) {
18908     /* not a delta sequence, no other errors possible */
18909     return TRUE;
18910   }
18911 
18912   if (IsDeltaSeqGap(dsp)) {
18913     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureBeginsOrEndsInGap, "Feature begins or ends in gap starting at %d", dsp_start + 1);
18914     return TRUE;
18915   }
18916   dsp = GetDeltaSeqForPosition(stop, bsp, &dsp_start);
18917   if (IsDeltaSeqGap(dsp)) {
18918     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureBeginsOrEndsInGap, "Feature begins or ends in gap starting at %d", dsp_start + 1);
18919     return TRUE;
18920   }
18921 
18922   for (slp = SeqLocFindNext (loc, NULL); slp != NULL; slp = SeqLocFindNext (loc, slp)) {
18923     start = SeqLocStart(slp);
18924     dsp = GetDeltaSeqForPosition(start, bsp, &dsp_start);
18925     if (DoesLocIntersectGapOfUnknownLength(slp, dsp, dsp_start)) {
18926       if (sfp->data.choice != SEQFEAT_GENE) {
18927         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureCrossesGap, "Feature crosses gap of unknown length");
18928         return TRUE;
18929       }
18930     } else if (IsDeltaSeqGap(dsp)) {
18931       startsOrEndsInGap = TRUE;
18932     }
18933     stop = SeqLocStop (slp);
18934     dsp = GetDeltaSeqForPosition(stop, bsp, &dsp_start);
18935     if (IsDeltaSeqGap(dsp)) {
18936       startsOrEndsInGap = TRUE;
18937     }
18938   }
18939 
18940   if (startsOrEndsInGap) {
18941     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IntervalBeginsOrEndsInGap, "Internal interval begins or ends in gap");
18942   }
18943 
18944   gcp->itemID = olditemid;
18945   gcp->thistype = olditemtype;
18946   vsp->sfp = NULL;
18947 
18948   return TRUE;
18949 }
18950 
/*
 * Runs GetFeatsInGaps over every indexed feature on a non-protein bioseq.
 * Nothing is done when bsp is NULL, is an amino acid sequence, or has no
 * usable Seq-id.  vsp is handed to the per-feature check through
 * fcontext.userdata.
 */
static void CheckBioseqForFeatsInGap (
  BioseqPtr bsp,
  ValidStructPtr vsp
)

{
  SeqMgrFeatContext  fcontext;
  SeqFeatPtr         sfp;

  if (bsp == NULL) return;
  if (ISA_aa (bsp->mol)) return;
  if (SeqIdFindBest (bsp->id, 0) == NULL) return;

  sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
  while (sfp != NULL) {
    fcontext.userdata = (Pointer) vsp;
    GetFeatsInGaps (sfp, &fcontext);
    sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
  }
}
18972 
GetFeatsInNs(SeqFeatPtr sfp,SeqMgrFeatContextPtr fcontext)18973 static Boolean LIBCALLBACK GetFeatsInNs (
18974   SeqFeatPtr sfp,
18975   SeqMgrFeatContextPtr fcontext
18976 )
18977 
18978 {
18979   Char              ch;
18980   GatherContextPtr  gcp;
18981   int               i;
18982   size_t            len;
18983   Int4              Ns = 0;
18984   Uint2             olditemtype = 0;
18985   Uint4             olditemid = 0;
18986   Int4              realBases = 0;
18987   CharPtr           str;
18988   ValidStructPtr    vsp;
18989 
18990   if (sfp == NULL || fcontext == NULL) return FALSE;
18991   vsp = (ValidStructPtr) fcontext->userdata;
18992   if (vsp == NULL) return FALSE;
18993   gcp = vsp->gcp;
18994   if (gcp == NULL) return FALSE;
18995 
18996   if (sfp->idx.subtype == FEATDEF_gap || sfp->idx.subtype == FEATDEF_misc_feature) return TRUE;
18997 
18998   str = GetSequenceByFeatureEx (sfp, STREAM_EXPAND_GAPS | SEQ_GAP_AS_TILDE);
18999   if (str == NULL) return TRUE;
19000 
19001   olditemid = gcp->itemID;
19002   olditemtype = gcp->thistype;
19003 
19004   gcp->itemID = fcontext->itemID;
19005   gcp->thistype = OBJ_SEQFEAT;
19006   vsp->sfp = sfp;
19007 
19008   len = StringLen (str);
19009   if (len > 0) {
19010     /*
19011     if (str [0] == 'N') {
19012       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureBeginsOrEndsWithN, "Feature begins with an N");
19013     }
19014     if (len > 1 && str [len - 1] == 'N') {
19015       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureBeginsOrEndsWithN, "Feature ends with an N");
19016     }
19017     */
19018     for (i = 0; i < len; i++) {
19019       ch = str [i];
19020       if (ch == 'N') {
19021         Ns++;
19022       } else if (IS_ALPHA (ch)) {
19023         realBases++;
19024       }
19025     }
19026     if (Ns > realBases) {
19027       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureIsMostlyNs, "Feature contains more than 50%s Ns", "%");
19028     }
19029   }
19030 
19031   MemFree (str);
19032 
19033   gcp->itemID = olditemid;
19034   gcp->thistype = olditemtype;
19035   vsp->sfp = NULL;
19036 
19037   return TRUE;
19038 }
19039 
/*
 * Runs GetFeatsInNs over every indexed feature on a non-protein bioseq.
 * Nothing is done when bsp is NULL, is an amino acid sequence, or has no
 * usable Seq-id.  vsp is handed to the per-feature check through
 * fcontext.userdata.
 */
static void CheckBioseqForFeatsInNs (
  BioseqPtr bsp,
  ValidStructPtr vsp
)

{
  SeqMgrFeatContext  fcontext;
  SeqFeatPtr         sfp;

  if (bsp == NULL) return;
  if (ISA_aa (bsp->mol)) return;
  if (SeqIdFindBest (bsp->id, 0) == NULL) return;

  sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
  while (sfp != NULL) {
    fcontext.userdata = (Pointer) vsp;
    GetFeatsInNs (sfp, &fcontext);
    sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
  }
}
19061 
ReportGeneCollision(GeneRefPtr grp,GeneRefPtr lastgrp)19062 static Boolean ReportGeneCollision (GeneRefPtr grp, GeneRefPtr lastgrp)
19063 
19064 {
19065   if (grp == NULL || lastgrp == NULL) return TRUE;
19066 
19067   if (StringDoesHaveText (grp->locus) && StringDoesHaveText (lastgrp->locus)) {
19068     if (StringICmp (grp->locus, lastgrp->locus) == 0) return TRUE;
19069   }
19070 
19071   if (StringDoesHaveText (grp->locus_tag) && StringDoesHaveText (lastgrp->locus_tag)) {
19072     if (StringICmp (grp->locus_tag, lastgrp->locus_tag) == 0) return TRUE;
19073   }
19074 
19075   if (StringDoesHaveText (grp->desc) && StringDoesHaveText (lastgrp->desc)) {
19076     if (StringICmp (grp->desc, lastgrp->desc) == 0) return FALSE;
19077   }
19078 
19079   return TRUE;
19080 }
19081 
FeatureSequencesIdentical(SeqFeatPtr sfp,SeqFeatPtr lastsfp)19082 static Boolean FeatureSequencesIdentical (SeqFeatPtr sfp, SeqFeatPtr lastsfp)
19083 
19084 {
19085   Boolean  rsult = FALSE;
19086   CharPtr  tmp1, tmp2;
19087 
19088   if (sfp == NULL || lastsfp == NULL) return rsult;
19089 
19090   tmp1 = GetSequenceByFeature (sfp);
19091   tmp2 = GetSequenceByFeature (lastsfp);
19092 
19093   if (tmp1 != NULL && tmp2 != NULL) {
19094     if (StringCmp (tmp1, tmp2) == 0) {
19095       rsult = TRUE;
19096     }
19097   }
19098 
19099   MemFree (tmp1);
19100   MemFree (tmp2);
19101 
19102   return rsult;
19103 }
19104 
GeneXrefsDifferent(SeqFeatPtr sfp,SeqFeatPtr lastsfp)19105 static Boolean GeneXrefsDifferent (SeqFeatPtr sfp, SeqFeatPtr lastsfp)
19106 
19107 {
19108   SeqFeatPtr  gene, lastgene;
19109 
19110   if (sfp == NULL || lastsfp == NULL) return FALSE;
19111 
19112   gene = GetGeneForFeature (sfp);
19113   lastgene = GetGeneForFeature (lastsfp);
19114   if (gene == NULL || lastgene == NULL) return FALSE;
19115 
19116   if (gene != lastgene) return TRUE;
19117 
19118   return FALSE;
19119 }
19120 
19121 
/*
 * If bsp is the product of an mRNA feature, compares the first gene feature
 * found on bsp with the gene associated with that mRNA and issues
 * ERR_SEQ_FEAT_GenesInconsistent when they disagree.
 *
 * The gene associated with the mRNA is taken from the mRNA's gene xref, or
 * failing that from the gene overlapping the mRNA location.  The two gene
 * references disagree when locus, allele, desc or locus_tag differ (exact,
 * case-sensitive comparison).
 *
 * A disagreement is forgiven in one situation: the reference came from a
 * gene xref (no overlapping gene feature was looked up) and it lacks a desc
 * or allele that the mRNA bioseq's gene has.  In that case the actual gene
 * feature is looked up on the bioseq holding the mRNA location, and if that
 * feature matches in all four fields no warning is issued.
 */
static void CheckForGenesInconsistent (BioseqPtr bsp, ValidStructPtr vsp)
{
  SeqMgrFeatContext  fcontext;
  SeqFeatPtr mrna, genomic_gene = NULL, mrna_gene = NULL;
  GeneRefPtr genomicgrp, grp, found_grp;
  BioseqPtr  genomic_bsp;
  Boolean    is_error = FALSE;

  mrna = SeqMgrGetRNAgivenProduct (bsp, &fcontext);
  if (mrna != NULL) {
    /* prefer the gene xref on the mRNA; otherwise use the overlapping gene */
    genomicgrp = SeqMgrGetGeneXref (mrna);
    if (genomicgrp == NULL) {
      genomic_gene = SeqMgrGetOverlappingGene (mrna->location, NULL);
      if (genomic_gene != NULL) {
        genomicgrp = (GeneRefPtr) genomic_gene->data.value.ptrvalue;
      }
    }
    /* mrna_gene is the first gene feature on bsp, grp its gene reference */
    if (genomicgrp != NULL
        && (mrna_gene = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, 0, &fcontext)) != NULL
        && (grp = (GeneRefPtr) mrna_gene->data.value.ptrvalue) != NULL) {
      if (StringCmp (grp->locus, genomicgrp->locus) != 0 ||
          StringCmp (grp->allele, genomicgrp->allele) != 0 ||
          StringCmp (grp->desc, genomicgrp->desc) != 0 ||
          StringCmp (grp->locus_tag, genomicgrp->locus_tag) != 0) {
        is_error = TRUE;
        /* genomic_gene == NULL here means genomicgrp came from the xref;
           retry against the real gene feature when the xref is missing a
           desc or allele that the gene on bsp carries */
        if (genomic_gene == NULL
            && ((StringHasNoText (genomicgrp->desc) && StringDoesHaveText(grp->desc))
                || (StringHasNoText (genomicgrp->allele) && StringDoesHaveText(grp->allele)))) {
          genomic_bsp = BioseqFindFromSeqLoc (mrna->location);
          if (StringDoesHaveText (genomicgrp->locus_tag)) {
            /* look up by locus_tag; discard the hit if its locus differs from the xref's */
            genomic_gene = SeqMgrGetGeneByLocusTag (genomic_bsp, genomicgrp->locus_tag, &fcontext);
            if (genomic_gene != NULL && (found_grp = (GeneRefPtr)genomic_gene->data.value.ptrvalue)
              && StringCmp (found_grp->locus, genomicgrp->locus) != 0) {
              genomic_gene = NULL;
            }
          } else if (StringDoesHaveText (genomicgrp->locus)) {
            /* look up by locus label; discard the hit if it has a locus_tag, since the xref has none */
            genomic_gene = SeqMgrGetFeatureByLabel (genomic_bsp, genomicgrp->locus, SEQFEAT_GENE, 0, &fcontext);
            if (genomic_gene != NULL
                && (found_grp = (GeneRefPtr)genomic_gene->data.value.ptrvalue) != NULL
                && StringDoesHaveText (found_grp->locus_tag)) {
              genomic_gene = NULL;
            }
          }
          /* note: genomicgrp is replaced by the looked-up gene's reference inside this condition */
          if (genomic_gene != NULL && (genomicgrp = (GeneRefPtr) genomic_gene->data.value.ptrvalue) != NULL
              && StringCmp (grp->locus, genomicgrp->locus) == 0
              && StringCmp (grp->allele, genomicgrp->allele) == 0
              && StringCmp (grp->desc, genomicgrp->desc) == 0
              && StringCmp (grp->locus_tag, genomicgrp->locus_tag) == 0) {
            is_error = FALSE;
          }
        }
        if (is_error) {
          /* the warning is attached to the gene feature on the mRNA bioseq */
          vsp->descr = NULL;
          vsp->sfp = mrna_gene;
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_GenesInconsistent, "Gene on mRNA bioseq does not match gene on genomic bioseq");
        }
      }
    }
  }
}
19182 
/*
 * Warns (ERR_SEQ_DESCR_BioSourceNeedsChromosome) about a bioseq that looks
 * like a non-viral complete genome but is not labeled as a chromosome.
 *
 * The warning is issued only when all of the following hold:
 *   - MolInfo says genomic biomol with completeness 1
 *   - the title contains "complete genome"
 *   - the BioSource genome is GENOME_genomic or GENOME_unknown
 *   - the lineage does not start with "Viruses; " or "Viroids; ", and is
 *     not exactly "Viruses" with division "PHG"
 *   - no subsource of type SUBSRC_endogenous_virus_name is present
 *
 * When gcp is given, its item is pointed at the source descriptor (if that
 * descriptor is an extended node) for the report and restored afterwards.
 */
static void CheckForNonViralComplete (BioseqPtr bsp, ValidStructPtr vsp, GatherContextPtr gcp)

{
  BioSourcePtr       biop;
  SeqMgrDescContext  dcontext;
  MolInfoPtr         mip;
  OrgNamePtr         onp;
  OrgRefPtr          orp;
  ObjValNodePtr      ovp;
  Uint2              savedType = 0;
  Uint4              savedID = 0;
  SeqDescrPtr        sdp;
  SubSourcePtr       ssp;
  CharPtr            title;

  if (bsp == NULL || vsp == NULL) return;

  /* molecule must be complete genomic */
  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
  if (sdp == NULL) return;
  mip = (MolInfoPtr) sdp->data.ptrvalue;
  if (mip == NULL) return;
  if (mip->biomol != MOLECULE_TYPE_GENOMIC) return;
  if (mip->completeness != 1) return;

  /* title must call it a complete genome */
  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_title, &dcontext);
  if (sdp == NULL) return;
  title = (CharPtr) sdp->data.ptrvalue;
  if (title == NULL) return;
  if (StringStr (title, "complete genome") == NULL) return;

  /* source must be genomic or unspecified */
  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
  if (sdp == NULL) return;
  biop = (BioSourcePtr) sdp->data.ptrvalue;
  if (biop == NULL) return;
  if (biop->genome != GENOME_genomic && biop->genome != GENOME_unknown) return;

  orp = biop->org;
  if (orp == NULL) return;
  onp = orp->orgname;
  if (onp == NULL) return;

  /* viral and viroid records are exempt */
  if (StringNICmp (onp->lineage, "Viruses; ", 9) == 0) return;
  if (StringNICmp (onp->lineage, "Viroids; ", 9) == 0) return;
  if (StringICmp (onp->lineage, "Viruses") == 0 && StringICmp (onp->div, "PHG") == 0) return;

  ssp = biop->subtype;
  while (ssp != NULL) {
    if (ssp->subtype == SUBSRC_endogenous_virus_name) return;
    ssp = ssp->next;
  }

  if (gcp != NULL) {
    savedID = gcp->itemID;
    savedType = gcp->thistype;
    /* sdp is the source descriptor here and is known to be non-NULL */
    if (sdp->extended != 0) {
      ovp = (ObjValNodePtr) sdp;
      gcp->itemID = ovp->idx.itemID;
      gcp->thistype = ovp->idx.itemtype;
    }
  }

  ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceNeedsChromosome, "Non-viral complete genome not labeled as chromosome");

  if (gcp != NULL) {
    gcp->itemID = savedID;
    gcp->thistype = savedType;
  }
}
19255 
19256 
/*
 * For a nucleotide bioseq whose source lineage starts with "Viruses; ",
 * compares the nucleic acid type named in the lineage with bsp->mol and
 * warns (ERR_SEQ_DESCR_MolInfoConflictsWithBioSource) on a mismatch:
 * ssRNA and dsRNA lineages expect Seq_mol_rna, ssDNA and dsDNA lineages
 * expect Seq_mol_dna.
 *
 * Does nothing when bsp or vsp is NULL, bsp is a protein, or no viral
 * lineage is available.  Sets vsp->bsp and vsp->descr to the bioseq and its
 * source descriptor.  When gcp is given, its item is pointed at the source
 * descriptor (if that descriptor is an extended node) for the report and
 * restored afterwards.
 */
static void LookForViralMolInfoInLineage (BioseqPtr bsp, ValidStructPtr vsp, GatherContextPtr gcp)
{
  BioSourcePtr       biop;
  SeqMgrDescContext  context;
  CharPtr            lineage;
  ObjValNodePtr      ovp;
  Uint2              savedType = 0;
  Uint4              savedID = 0;
  SeqDescPtr         sdp;

  if (bsp == NULL || vsp == NULL) return;
  if (ISA_aa (bsp->mol)) return;

  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
  if (sdp == NULL) return;
  biop = (BioSourcePtr) sdp->data.ptrvalue;
  if (biop == NULL) return;
  if (biop->org == NULL || biop->org->orgname == NULL) return;
  lineage = biop->org->orgname->lineage;
  if (lineage == NULL) return;
  if (StringNICmp (lineage, "Viruses; ", 9) != 0) return;

  if (gcp != NULL) {
    savedID = gcp->itemID;
    savedType = gcp->thistype;
    if (sdp->extended != 0) {
      ovp = (ObjValNodePtr) sdp;
      gcp->itemID = ovp->idx.itemID;
      gcp->thistype = ovp->idx.itemtype;
    }
  }

  vsp->bsp = bsp;
  vsp->descr = sdp;

  /* RNA virus lineages on a sequence that is not RNA */
  if (bsp->mol != Seq_mol_rna) {
    if (StringSearch (lineage, " ssRNA viruses; ") != NULL
        || StringSearch (lineage, " ssRNA negative-strand viruses; ") != NULL
        || StringSearch (lineage, " ssRNA positive-strand viruses, no DNA stage; ") != NULL
        || StringSearch (lineage, " unassigned ssRNA viruses; ") != NULL) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MolInfoConflictsWithBioSource, "Taxonomy indicates single-stranded RNA, sequence does not agree.");
    }
    if (StringSearch (lineage, " dsRNA viruses; ") != NULL) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MolInfoConflictsWithBioSource, "Taxonomy indicates double-stranded RNA, sequence does not agree.");
    }
  }

  /* DNA virus lineages on a sequence that is not DNA */
  if (bsp->mol != Seq_mol_dna) {
    if (StringSearch (lineage, " ssDNA viruses; ") != NULL) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MolInfoConflictsWithBioSource, "Taxonomy indicates single-stranded DNA, sequence does not agree.");
    }
    if (StringSearch (lineage, " dsDNA viruses; ") != NULL) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_MolInfoConflictsWithBioSource, "Taxonomy indicates double-stranded DNA, sequence does not agree.");
    }
  }

  if (gcp != NULL) {
    gcp->itemID = savedID;
    gcp->thistype = savedType;
  }
}
19321 
19322 
SuppressMultipleEquivBioSources(BioSourcePtr biop)19323 static Boolean SuppressMultipleEquivBioSources (BioSourcePtr biop)
19324 {
19325   CharPtr viruses = "Viruses";
19326   if (biop == NULL || biop->org == NULL) {
19327     return FALSE;
19328   }
19329   if (StringICmp (biop->org->taxname, "unidentified phage") == 0) {
19330     return TRUE;
19331   } else if (biop->org->orgname != NULL
19332              && StringNICmp (biop->org->orgname->lineage, viruses, StringLen (viruses)) == 0) {
19333     return TRUE;
19334   } else {
19335     return FALSE;
19336   }
19337 }
19338 
19339 
/*
 * On a protein bioseq carrying two or more FEATDEF_PROT features, issues
 * ERR_SEQ_FEAT_ExtraProteinFeature once for each of those features.
 * Does nothing when bsp or vsp is NULL or bsp is not a protein.
 * vsp->sfp is cleared before returning whenever the checks were run.
 */
static void ReportMultipleUnprocessedProteinFeatures (BioseqPtr bsp, ValidStructPtr vsp)
{
  SeqMgrFeatContext  context;
  SeqFeatPtr         second = NULL;
  SeqFeatPtr         sfp;

  if (bsp == NULL || vsp == NULL) return;
  if (! ISA_aa (bsp->mol)) return;

  /* probe for a second protein feature before reporting anything */
  sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_PROT, &context);
  if (sfp != NULL) {
    second = SeqMgrGetNextFeature (bsp, sfp, 0, FEATDEF_PROT, &context);
  }

  if (second != NULL) {
    sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_PROT, &context);
    while (sfp != NULL) {
      vsp->sfp = sfp;
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_ExtraProteinFeature,
                "Protein sequence has multiple unprocessed protein features");
      sfp = SeqMgrGetNextFeature (bsp, sfp, 0, FEATDEF_PROT, &context);
    }
  }

  vsp->sfp = NULL;
}
19364 
19365 
FeatureHastRNAXref(SeqFeatPtr sfp)19366 static Boolean FeatureHastRNAXref(SeqFeatPtr sfp)
19367 {
19368     SeqFeatXrefPtr xref;
19369     RnaRefPtr rrp;
19370     Boolean rval = FALSE;
19371 
19372     for (xref = sfp->xref; xref != NULL && !rval; xref = xref->next) {
19373         if (xref->data.choice == SEQFEAT_RNA
19374             && (rrp = (RnaRefPtr) xref->data.value.ptrvalue) != NULL
19375             && rrp->type == 3) {
19376             rval = TRUE;
19377         }
19378     }
19379     return rval;
19380 }
19381 
19382 
ValidateBioseqContextIndexed(BioseqPtr bsp,BioseqValidStrPtr bvsp)19383 static Boolean ValidateBioseqContextIndexed (BioseqPtr bsp, BioseqValidStrPtr bvsp)
19384 
19385 {
19386   ValidStructPtr     vsp;
19387   ObjMgrDataPtr      omdp;
19388   SeqSubmitPtr       ssp;
19389   SubmitBlockPtr     sbp;
19390   GatherContextPtr   gcp;
19391   SeqFeatPtr         sfp, lastsfp;
19392   SeqMgrFeatContext  fcontext;
19393   Uint2              featdeftype = 0;
19394   Boolean            firstCDS;
19395   GeneRefPtr         grp, lastgrp;
19396   SeqFeatPtr         last = NULL;
19397   Boolean            leave;
19398   CharPtr            label = NULL;
19399   CharPtr            comment = NULL;
19400   Int4               left = 0;
19401   Boolean            partialL = FALSE;
19402   Boolean            partialR = FALSE;
19403   Int4               right = 0;
19404   Uint1              strand = 0;
19405   Int2               numivals = 0;
19406   Int4Ptr            ivals = NULL;
19407   Boolean            ivalssame;
19408   SeqAnnotPtr        sap = NULL;
19409   Boolean            no_cit_sub;
19410   Uint2              olditemtype = 0;
19411   Uint4              olditemid = 0;
19412   CharPtr            lastLabel;
19413   Boolean            isSplitGene, lastIsSplitGene;
19414   CharPtr            message;
19415   Int2               i;
19416   Boolean            isCuratedFlybase = FALSE;
19417   Boolean            isDrosophila = FALSE;
19418   Boolean            isEukaryote = FALSE;
19419   Boolean            isGenBankAccn = FALSE;
19420   Boolean            isGeneralAccn = FALSE;
19421   Boolean            isGPSorNTorNCorNGorNW = FALSE;
19422   Boolean            isMicrosporidia = FALSE;
19423   Boolean            isViral = FALSE;
19424   Boolean            non_pseudo_16S_rRNA = FALSE;
19425   Uint1              genome = 0;
19426   RnaRefPtr          rrp;
19427   RNAGenPtr          rgp;
19428   CharPtr            str;
19429   Int2               j;
19430   CdRegionPtr        crp;
19431   Uint1              frame = 0;
19432   Boolean            samelabel;
19433   int                severity;
19434   int                overlapPepSev;
19435   BioSourcePtr       biop = NULL, lastbiop;
19436   OrgRefPtr          orp = NULL;
19437   OrgNamePtr         onp = NULL;
19438   Int4               fiveUTRright;
19439   Int4               cdsRight;
19440   Int4               threeUTRright;
19441   Int4               cdscount, genecount, utr5count, utr3count;
19442   SeqFeatPtr         cdsgene, utr5gene, utr3gene;
19443   PubdescPtr         pdp = NULL, lastpdp;
19444   SeqDescrPtr        sdp;
19445   SeqMgrDescContext  dcontext;
19446   Boolean            showBadFullSource;
19447   Int2               numBadFullSource;
19448   SubSourcePtr       sbsp;
19449   Int2               numgene, numcds, nummrna, numcdsproducts, nummrnaproducts,
19450                      numcdspseudo, nummrnapseudo, numrearrangedcds, lastrnatype,
19451                      thisrnatype;
19452   Boolean            cds_products_unique = TRUE, mrna_products_unique = TRUE,
19453                      pseudo, suppressed;
19454   SeqIdPtr           sip;
19455   Char               buf [96];
19456   SeqFeatXrefPtr     xref = NULL;
19457   CharPtr            except_text = NULL;
19458   ValNodePtr         vnp, cds_prod_head = NULL, mrna_prod_head = NULL,
19459                      lastcdsprod = NULL, lastmrnaprod = NULL;
19460   StreamCache        sc;
19461   Int2               res;
19462   Int4               dashes;
19463   Int4               Ns;
19464   Int4               realBases;
19465   Int4               estimated_length;
19466   Int4               loclen;
19467   GBQualPtr          gbq;
19468   long int           val;
19469   SeqLocPtr          slp;
19470   MolInfoPtr         mip = NULL;
19471   SeqFeatPtr         cds;
19472   BioseqPtr          nbsp;
19473   Boolean            last_reported;
19474   Boolean            found_overlapping_peptide;
19475   SeqFeatPtr         utr5plus, cdsplus, utr3plus, utr5minus, cdsminus, utr3minus;
19476   SeqFeatPtr         utr5pgene, cdspgene, utr3pgene, utr5mgene, cdsmgene, utr3mgene;
19477   Int2               cdsgencode, firstcdsgencode = 0;
19478   Boolean            mixedcdsgencodes = FALSE;
19479   GeneticCodePtr     gc;
19480   Uint1              origin;
19481   ErrSev             sev;
19482 
19483 
19484   gcp = bvsp->gcp;
19485   vsp = bvsp->vsp;
19486   vsp->descr = NULL;
19487   vsp->sfp = NULL;
19488   vsp->gcp = gcp;               /* needed for ValidErr */
19489 
19490   vsp->rrna_array = SeqMgrBuildFeatureIndex (bsp, &(vsp->numrrna), 0, FEATDEF_rRNA);
19491   vsp->trna_array = SeqMgrBuildFeatureIndex (bsp, &(vsp->numtrna), 0, FEATDEF_tRNA);
19492 
19493   SeqMgrExploreFeatures (bsp, (Pointer) bvsp, ValidateSeqFeatIndexed, NULL, NULL, NULL);
19494   ReportMultipleUnprocessedProteinFeatures(bsp, vsp);
19495 
19496   vsp->rrna_array = MemFree (vsp->rrna_array);
19497   vsp->trna_array = MemFree (vsp->trna_array);
19498 
19499   overlapPepSev = SEV_WARNING;
19500   if (GetAppProperty ("SpliceValidateAsError") != NULL) {
19501     overlapPepSev = SEV_ERROR;
19502   }
19503 
19504   if (gcp != NULL) {
19505     olditemid = gcp->itemID;
19506     olditemtype = gcp->thistype;
19507   }
19508 
19509   numgene = 0;
19510   numcds = 0;
19511   nummrna = 0;
19512   numcdsproducts = 0;
19513   nummrnaproducts = 0;
19514   numcdspseudo = 0;
19515   nummrnapseudo = 0;
19516   numrearrangedcds = 0;
19517 
19518   sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
19519   while (sfp != NULL) {
19520     switch (sfp->idx.subtype) {
19521       case FEATDEF_GENE :
19522         numgene++;
19523         break;
19524       case FEATDEF_CDS :
19525         numcds++;
19526         if (StringStr (sfp->except_text, "rearrangement required for product") != NULL) {
19527           numrearrangedcds++;
19528         } else if (sfp->product != NULL) {
19529           numcdsproducts++;
19530           sip = SeqLocId (sfp->product);
19531           SeqIdWrite (sip, buf, PRINTID_FASTA_LONG, sizeof (buf) - 1);
19532           if (StringDoesHaveText (buf)) {
19533             vnp = ValNodeCopyStr (&lastcdsprod, 0, buf);
19534             if (cds_prod_head == NULL) {
19535               cds_prod_head = vnp;
19536             }
19537             lastcdsprod = vnp;
19538           }
19539         } else {
19540           grp = GetGeneByFeat (sfp, &pseudo, &suppressed);
19541           if (pseudo) {
19542             numcdspseudo++;
19543           }
19544         }
19545         crp = (CdRegionPtr) sfp->data.value.ptrvalue;
19546         if (crp != NULL) {
19547           cdsgencode = 0;
19548           gc = crp->genetic_code;
19549           if (gc != NULL) {
19550             for (vnp = gc->data.ptrvalue; vnp != NULL; vnp = vnp->next) {
19551               if (vnp->choice == 2) {
19552                 cdsgencode = (Int2) vnp->data.intvalue;
19553               }
19554             }
19555           }
19556           if (cdsgencode != 0) {
19557             if (firstcdsgencode == 0) {
19558               firstcdsgencode = cdsgencode;
19559             } else if (firstcdsgencode != cdsgencode) {
19560               mixedcdsgencodes = TRUE;
19561             }
19562           }
19563         }
19564         break;
19565       case FEATDEF_mRNA :
19566         nummrna++;
19567         if (sfp->product != NULL) {
19568           nummrnaproducts++;
19569           sip = SeqLocId (sfp->product);
19570           SeqIdWrite (sip, buf, PRINTID_FASTA_LONG, sizeof (buf) - 1);
19571           if (StringDoesHaveText (buf)) {
19572             vnp = ValNodeCopyStr (&lastmrnaprod, 0, buf);
19573             if (mrna_prod_head == NULL) {
19574               mrna_prod_head = vnp;
19575             }
19576             lastmrnaprod = vnp;
19577           }
19578         } else {
19579           grp = GetGeneByFeat (sfp, &pseudo, &suppressed);
19580           if (pseudo) {
19581             nummrnapseudo++;
19582           }
19583         }
19584         break;
19585       case FEATDEF_rRNA :
19586         if (! sfp->pseudo) {
19587           rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
19588           if (rrp != NULL) {
19589             str = NULL;
19590             if (rrp->ext.choice == 1) {
19591               str = (CharPtr) rrp->ext.value.ptrvalue;
19592             } else if (rrp->ext.choice == 3) {
19593               rgp = (RNAGenPtr) rrp->ext.value.ptrvalue;
19594               if (rgp != NULL) {
19595                 str = rgp->product;
19596               }
19597             }
19598             if (str != NULL && StringICmp (str, "16S ribosomal RNA") == 0) {
19599               non_pseudo_16S_rRNA = TRUE;
19600             }
19601           }
19602         }
19603         break;
19604       default :
19605         break;
19606     }
19607     sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
19608   }
19609 
19610   vsp->descr = NULL;
19611   vsp->sfp = NULL;
19612   vsp->bssp = NULL;
19613   vsp->bsp = bsp;
19614 
19615   if (cds_prod_head != NULL) {
19616     cds_prod_head = ValNodeSort (cds_prod_head, SortVnpByString);
19617     cds_products_unique = StringListIsUnique (cds_prod_head);
19618   }
19619   if (mrna_prod_head != NULL) {
19620     mrna_prod_head = ValNodeSort (mrna_prod_head, SortVnpByString);
19621     mrna_products_unique = StringListIsUnique (mrna_prod_head);
19622   }
19623 
19624   if (numcds > 0 && nummrna > 1) {
19625     if (numcdsproducts > 0 && numcdsproducts + numcdspseudo != numcds && numcdsproducts + numcdspseudo + numrearrangedcds != numcds) {
19626       if (gcp != NULL) {
19627         gcp->itemID = olditemid;
19628         gcp->thistype = olditemtype;
19629       }
19630       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureProductInconsistency, "%d CDS features have %d product references",
19631                 (int) numcds, (int) numcdsproducts);
19632     }
19633     if (numcdsproducts > 0 && (! cds_products_unique)) {
19634       if (gcp != NULL) {
19635         gcp->itemID = olditemid;
19636         gcp->thistype = olditemtype;
19637       }
19638       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureProductInconsistency, "CDS products are not unique");
19639     }
19640     if (nummrnaproducts > 0 && nummrnaproducts + nummrnapseudo != nummrna) {
19641       if (gcp != NULL) {
19642         gcp->itemID = olditemid;
19643         gcp->thistype = olditemtype;
19644       }
19645       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureProductInconsistency, "%d mRNA features have %d product references",
19646                 (int) nummrna, (int) nummrnaproducts);
19647     }
19648     if (nummrnaproducts > 0 && (! mrna_products_unique)) {
19649       if (gcp != NULL) {
19650         gcp->itemID = olditemid;
19651         gcp->thistype = olditemtype;
19652       }
19653       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureProductInconsistency, "mRNA products are not unique");
19654     }
19655     /*
19656     if (numcds > nummrna) {
19657       if (gcp != NULL) {
19658         gcp->itemID = olditemid;
19659         gcp->thistype = olditemtype;
19660       }
19661       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureProductInconsistency, "%d CDS features and only %d mRNA features",
19662                 (int) numcds, (int) nummrna);
19663     } else if (numcds < nummrna) {
19664       if (gcp != NULL) {
19665         gcp->itemID = olditemid;
19666         gcp->thistype = olditemtype;
19667       }
19668       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureProductInconsistency, "%d mRNA features and only %d CDS features",
19669                 (int) nummrna, (int) numcds);
19670     }
19671     */
19672   }
19673 
19674   ValNodeFreeData (cds_prod_head);
19675   ValNodeFreeData (mrna_prod_head);
19676 
19677   sev = SEV_ERROR;
19678   /*
19679   SeqEntryToBioSource (vsp->sep, NULL, NULL, 0, &biop);
19680   */
19681   BioseqToGeneticCode (bsp, NULL, NULL, NULL, NULL, 0, &biop);
19682   if (biop != NULL) {
19683     genome = biop->genome;
19684     origin = biop->origin;
19685     if (origin == ORG_MUT || origin == ORG_ARTIFICIAL || origin == ORG_SYNTHETIC) {
19686       sev = SEV_WARNING;
19687     }
19688     orp = biop->org;
19689     if (orp != NULL) {
19690       /* curated fly source still has duplicate features */
19691       if (StringNICmp (orp->taxname, "Drosophila ", 11) == 0) {
19692         isDrosophila = TRUE;
19693       }
19694       onp = orp->orgname;
19695       if (onp != NULL) {
19696         if (StringNICmp (onp->lineage, "Viruses; ", 9) == 0) {
19697           isViral = TRUE;
19698         }
19699         if (StringNICmp (onp->lineage, "Eukaryota; ", 11) == 0) {
19700           isEukaryote = TRUE;
19701           if (StringNICmp (onp->lineage, "Eukaryota; Fungi; Microsporidia; ", 33) == 0) {
19702             isMicrosporidia = TRUE;
19703           }
19704         }
19705         if (StringICmp (onp->div, "SYN") == 0) {
19706           sev = SEV_WARNING;
19707         }
19708       }
19709     }
19710     for (sbsp = biop->subtype; sbsp != NULL; sbsp = sbsp->next) {
19711       if (sbsp->subtype == SUBSRC_transgenic) {
19712         sev = SEV_WARNING;
19713       }
19714     }
19715   }
19716 
19717   if (mixedcdsgencodes) {
19718     ValidErr (vsp, sev, ERR_SEQ_FEAT_MultipleGenCodes, "Multiple CDS genetic codes on sequence");
19719   }
19720 
19721   if (isEukaryote && (! isMicrosporidia) && non_pseudo_16S_rRNA &&
19722       genome != GENOME_mitochondrion &&
19723       genome != GENOME_chloroplast &&
19724       genome != GENOME_chromoplast &&
19725       genome != GENOME_kinetoplast &&
19726       genome != GENOME_plastid &&
19727       genome != GENOME_apicoplast &&
19728       genome != GENOME_leucoplast &&
19729       genome != GENOME_proplastid &&
19730       genome != GENOME_chromatophore) {
19731     sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
19732     if (sdp != NULL) {
19733       if (gcp != NULL) {
19734         gcp->itemID = dcontext.itemID;
19735         gcp->thistype = OBJ_SEQDESC;
19736       }
19737     }
19738     ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_WrongOrganismFor16SrRNA, "Improper 16S ribosomal RNA");
19739     if (gcp != NULL) {
19740       gcp->itemID = olditemid;
19741       gcp->thistype = olditemtype;
19742     }
19743   }
19744 
19745   if (! vsp->suppress_no_cit_subs) {
19746     no_cit_sub = TRUE;
19747     sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_pub, &dcontext);
19748     while (sdp != NULL) {
19749       pdp = (PubdescPtr) sdp->data.ptrvalue;
19750       if (pdp != NULL) {
19751         for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
19752           if (vnp->choice == PUB_Sub) {
19753             no_cit_sub = FALSE;
19754           }
19755         }
19756       }
19757       sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_pub, &dcontext);
19758     }
19759     if (no_cit_sub) {
19760       sev = SEV_INFO;
19761       if (vsp->genomeSubmission) {
19762         sev = SEV_ERROR;
19763       }
19764       ValidErr (vsp, sev, ERR_GENERIC_MissingPubInfo, "Expected submission citation is missing for this Bioseq");
19765     }
19766   }
19767 
19768   sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
19769   if (sdp != NULL && sdp->choice == Seq_descr_molinfo) {
19770     mip = (MolInfoPtr) sdp->data.ptrvalue;
19771   }
19772 
19773   sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
19774   last_reported = FALSE;
19775   while (sfp != NULL) {
19776     HasFeatId(sfp, 932);
19777     leave = TRUE;
19778     if (last != NULL) {
19779       ivalssame = FALSE;
19780       if (fcontext.left == left && fcontext.right == right && fcontext.featdeftype == featdeftype) {
19781         if ((fcontext.strand == Seq_strand_minus && strand == Seq_strand_minus)
19782             || (fcontext.strand != Seq_strand_minus && strand != Seq_strand_minus)) {
19783           ivalssame = TRUE;
19784           if (fcontext.numivals != numivals || fcontext.ivals == NULL || ivals == NULL) {
19785             ivalssame = FALSE;
19786           } else {
19787             for (i = 0, j = 0; i < numivals; i++, j += 2) {
19788               if (fcontext.ivals[j] != ivals[j]) {
19789                 ivalssame = FALSE;
19790               }
19791               if (fcontext.ivals[j + 1] != ivals[j + 1]) {
19792                 ivalssame = FALSE;
19793               }
19794             }
19795           }
19796           if (ivalssame &&      /* StringICmp (fcontext.label, label) == 0 && */
19797               (fcontext.sap == sap || (fcontext.sap->desc == NULL && sap->desc == NULL) || DescsSame (fcontext.sap->desc, sap->desc))) {
19798             if (gcp != NULL) {
19799               gcp->itemID = fcontext.itemID;
19800               gcp->thistype = OBJ_SEQFEAT;
19801             }
19802             vsp->descr = NULL;
19803             vsp->sfp = sfp;
19804             severity = SEV_ERROR;
19805             samelabel = TRUE;
19806             if (StringICmp (fcontext.label, label) != 0 || StringICmp (sfp->comment, comment) != 0) {
19807               samelabel = FALSE;
19808             }
19809             if (GBQualsDiffer (sfp->qual, last->qual)) {
19810               samelabel = FALSE;
19811             }
19812             if (featdeftype == FEATDEF_PUB ||
19813                 featdeftype == FEATDEF_REGION || featdeftype == FEATDEF_misc_feature || featdeftype == FEATDEF_STS || featdeftype == FEATDEF_variation) {
19814               severity = SEV_WARNING;
19815             } else {
19816               if (isGPSorNTorNCorNGorNW || GPSorNTorNCorNGorNW (vsp->sep, sfp->location)) {
19817                 isGPSorNTorNCorNGorNW = TRUE;
19818                 if (! isCuratedFlybase) {
19819                   if (isDrosophila) {
19820                     isCuratedFlybase = TRUE;
19821                   }
19822                 }
19823                 if (isCuratedFlybase) {
19824                   /* curated fly source still has duplicate features */
19825                   severity = SEV_WARNING;
19826                 }
19827               } else if (isGenBankAccn || IsGenBankAccn (vsp->sep, sfp->location)) {
19828                 isGenBankAccn = TRUE;
19829                 if (! isCuratedFlybase) {
19830                   if (isDrosophila) {
19831                     isCuratedFlybase = TRUE;
19832                   }
19833                 }
19834                 if (isCuratedFlybase) {
19835                   /* curated fly source still has duplicate features */
19836                   severity = SEV_WARNING;
19837                 }
19838               } else if (isGeneralAccn || IsGeneralAccn (vsp->sep, sfp->location)) {
19839                 isGeneralAccn = TRUE;
19840                 if (! isCuratedFlybase) {
19841                   if (isDrosophila) {
19842                     isCuratedFlybase = TRUE;
19843                   }
19844                 }
19845                 if (isCuratedFlybase) {
19846                   /* curated fly source still has duplicate features */
19847                   severity = SEV_WARNING;
19848                 }
19849               } else {
19850                 severity = SEV_WARNING;
19851               }
19852             }
19853             /* if different CDS frames, lower to warning */
19854             if (sfp->data.choice == SEQFEAT_CDREGION) {
19855               crp = (CdRegionPtr) sfp->data.value.ptrvalue;
19856               if (crp != NULL) {
19857                 if (frame > 1 || crp->frame > 1) {
19858                   if (frame != crp->frame) {
19859                     severity = SEV_WARNING;
19860                     if (! samelabel) {
19861                       if (numivals == 1 && left == 0 && right == bsp->length - 1 && partialL && partialR) {
19862                         /* skip full length partial CDS features in different frames with different products */
19863                         severity = SEV_NONE;
19864                       }
19865                     }
19866                   }
19867                 }
19868               }
19869             }
19870             if (isGPSorNTorNCorNGorNW || GPSorNTorNCorNGorNW (vsp->sep, sfp->location)) {
19871               isGPSorNTorNCorNGorNW = TRUE;
19872               severity = SEV_WARNING;
19873             }
19874             if (FlybaseDbxrefs (last->dbxref) || FlybaseDbxrefs (sfp->dbxref)) {
19875               severity = SEV_ERROR;
19876             }
19877             if (featdeftype == FEATDEF_repeat_region) {
19878               severity = SEV_WARNING;
19879             }
19880             if (featdeftype == FEATDEF_SITE || featdeftype == FEATDEF_BOND) {
19881               severity = SEV_WARNING;
19882             }
19883             if (severity == SEV_NONE) {
19884               /* skip full length partial CDS features in different frames with different products */
19885             } else if (featdeftype == FEATDEF_REGION && DifferentDbxrefs (last->dbxref, sfp->dbxref)) {
19886               /* do not report if both have dbxrefs and they are different */
19887             } else if (featdeftype == FEATDEF_variation && ReplaceQualsDiffer (sfp->qual, last->qual)) {
19888               /* do not report if both have replace quals and they are different */
19889             } else if (CDSsLinkedToDifferentMRNAs (sfp, last)) {
19890               /* do not report if CDSs are linked to two different mRNAs */
19891             } else if (MRNAsLinkedToDifferentCDSs (sfp, last)) {
19892               /* do not report if mRNAs are linked to two different CDSs */
19893             } else if (fcontext.sap == sap) {
19894               if (samelabel) {
19895                 if (GeneXrefsDifferent (sfp, last)) {
19896                   severity = SEV_WARNING;
19897                 }
19898                 if (vsp->is_small_genome_set) {
19899                   severity = SEV_WARNING;
19900                 }
19901                 ValidErr (vsp, severity, ERR_SEQ_FEAT_FeatContentDup, "Duplicate feature");
19902               } else if (featdeftype != FEATDEF_PUB) {
19903                 if (fcontext.partialL != partialL || fcontext.partialR != partialR) {
19904                   /* do not report if partial flags are different */
19905                 } else {
19906                   if ((featdeftype == FEATDEF_CDS || featdeftype == FEATDEF_mRNA) && HaveUniqueFeatIDXrefs (xref, sfp->xref)) {
19907                     /* do not report CDS or mRNA if every one has a unique product and unique featID xrefs */
19908                   } else if (featdeftype == FEATDEF_GENE &&
19909                              StringStr (sfp->except_text, "dicistronic gene") != NULL &&
19910                              StringStr (except_text, "dicistronic gene") != NULL &&
19911                              isCuratedFlybase) {
19912                     /* do not report genes marked dicistronic */
19913                   } else if (vsp->is_small_genome_set && SeqLocCompare (sfp->location, last->location) != SLC_A_EQ_B) {
19914                     /* do not report trans-spliced features that really are different on far components */
19915                   } else {
19916                     if (featdeftype == FEATDEF_GENE && isViral && (sfp->partial || last->partial)) {
19917                       severity = SEV_WARNING;
19918                     }
19919                     if (featdeftype == FEATDEF_CDS && isViral && (sfp->partial || last->partial)) {
19920                       severity = SEV_WARNING;
19921                     }
19922                     if (featdeftype == FEATDEF_mRNA && (sfp->partial || last->partial)) {
19923                       severity = SEV_WARNING;
19924                     }
19925                     if (featdeftype == FEATDEF_GENE && (sfp->partial || last->partial)) {
19926                       severity = SEV_WARNING;
19927                     }
19928                     if (featdeftype == FEATDEF_GENE && sfp->pseudo && last->pseudo) {
19929                       severity = SEV_WARNING;
19930                     }
19931                     if (featdeftype == FEATDEF_GENE && isViral) {
19932                       severity = SEV_WARNING;
19933                     }
19934                     if (severity == SEV_ERROR && featdeftype == FEATDEF_CDS && mip != NULL) {
19935                       if (mip->tech >= MI_TECH_htgs_1 && mip->tech <= MI_TECH_htgs_3) {
19936                         severity = SEV_WARNING;
19937                       }
19938                     }
19939                     if (fcontext.seqfeattype == SEQFEAT_IMP) {
19940                       severity = SEV_WARNING;
19941                     }
19942                     ValidErr (vsp, severity, ERR_SEQ_FEAT_DuplicateFeat, "Features have identical intervals, but labels differ");
19943                   }
19944                 }
19945               }
19946             } else {
19947               if (samelabel) {
19948                 if (GeneXrefsDifferent (sfp, last)) {
19949                   severity = SEV_WARNING;
19950                 }
19951                 if (vsp->is_small_genome_set) {
19952                   severity = SEV_WARNING;
19953                 }
19954                 ValidErr (vsp, severity, ERR_SEQ_FEAT_FeatContentDup, "Duplicate feature (packaged in different feature table)");
19955               } else if (featdeftype != FEATDEF_PUB) {
19956                 if ((featdeftype == FEATDEF_CDS || featdeftype == FEATDEF_mRNA) && HaveUniqueFeatIDXrefs (xref, sfp->xref)) {
19957                   /* do not report CDS or mRNA if every one has a unique product and unique featID xrefs */
19958                 } else {
19959                   ValidErr (vsp, /* severity */ SEV_WARNING, ERR_SEQ_FEAT_DuplicateFeat, "Features have identical intervals, but labels differ (packaged in different feature table)");
19960                 }
19961               }
19962             }
19963             vsp->sfp = NULL;
19964             if (gcp != NULL) {
19965               gcp->itemID = olditemid;
19966               gcp->thistype = olditemtype;
19967             }
19968           }
19969         }
19970       }
19971       found_overlapping_peptide = FALSE;
19972       if (fcontext.featdeftype == FEATDEF_mat_peptide_aa ||
19973           fcontext.featdeftype == FEATDEF_sig_peptide_aa || fcontext.featdeftype == FEATDEF_transit_peptide_aa) {
19974         if (featdeftype == FEATDEF_mat_peptide_aa || featdeftype == FEATDEF_sig_peptide_aa || featdeftype == FEATDEF_transit_peptide_aa) {
19975           if (fcontext.left <= right && NotPeptideException (sfp, last)) {
19976             if (gcp != NULL) {
19977               gcp->itemID = fcontext.itemID;
19978               gcp->thistype = OBJ_SEQFEAT;
19979             }
19980             buf [0] = '\0';
19981             cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
19982             if (cds != NULL) {
19983               nbsp = BioseqFindFromSeqLoc (cds->location);
19984               if (nbsp != NULL) {
19985                 SeqIdWrite (nbsp->id, buf, PRINTID_FASTA_LONG, sizeof (buf) - 1);
19986               }
19987             }
19988             vsp->descr = NULL;
19989             if (StringDoesHaveText (buf)) {
19990               if (!last_reported) {
19991                 vsp->sfp = last;
19992                 ValidErr (vsp, overlapPepSev, ERR_SEQ_FEAT_OverlappingPeptideFeat,
19993                           "Signal, Transit, or Mature peptide features overlap (parent CDS is on %s)", buf);
19994               }
19995               vsp->sfp = sfp;
19996               ValidErr (vsp, overlapPepSev, ERR_SEQ_FEAT_OverlappingPeptideFeat,
19997                         "Signal, Transit, or Mature peptide features overlap (parent CDS is on %s)", buf);
19998             } else {
19999               if (!last_reported) {
20000                 vsp->sfp = last;
20001                 ValidErr (vsp, overlapPepSev, ERR_SEQ_FEAT_OverlappingPeptideFeat, "Signal, Transit, or Mature peptide features overlap");
20002               }
20003               vsp->sfp = sfp;
20004               ValidErr (vsp, overlapPepSev, ERR_SEQ_FEAT_OverlappingPeptideFeat, "Signal, Transit, or Mature peptide features overlap");
20005             }
20006             found_overlapping_peptide = TRUE;
20007             vsp->sfp = NULL;
20008             if (gcp != NULL) {
20009               gcp->itemID = olditemid;
20010               gcp->thistype = olditemtype;
20011             }
20012           }
20013         }
20014       }
20015       last_reported = found_overlapping_peptide;
20016     }
20017     if (leave) {
20018       last = sfp;
20019       left = fcontext.left;
20020       right = fcontext.right;
20021       label = fcontext.label;
20022       comment = sfp->comment;
20023       strand = fcontext.strand;
20024       partialL = fcontext.partialL;
20025       partialR = fcontext.partialR;
20026       featdeftype = fcontext.featdeftype;
20027       numivals = fcontext.numivals;
20028       ivals = fcontext.ivals;
20029       sap = fcontext.sap;
20030       xref = sfp->xref;
20031       except_text = sfp->except_text;
20032       frame = 0;
20033       if (sfp->data.choice == SEQFEAT_CDREGION) {
20034         crp = (CdRegionPtr) sfp->data.value.ptrvalue;
20035         if (crp != NULL) {
20036           frame = crp->frame;
20037         }
20038       }
20039     }
20040     sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
20041   }
20042 
20043   lastLabel = NULL;
20044   lastsfp = NULL;
20045   lastgrp = NULL;
20046   grp = NULL;
20047   sfp = SeqMgrGetNextFeatureByLabel (bsp, NULL, SEQFEAT_GENE, 0, &fcontext);
20048   while (sfp != NULL) {
20049     grp = (GeneRefPtr) sfp->data.value.ptrvalue;
20050     label = fcontext.label;
20051     if (lastLabel != NULL) {
20052       message = NULL;
20053       if (StringCmp (lastLabel, label) == 0) {
20054         message = "Colliding names in gene features";
20055       } else if (StringICmp (lastLabel, label) == 0) {
20056         message = "Colliding names (with different capitalization) in gene features";
20057       }
20058       if (message != NULL && (ReportGeneCollision (grp, lastgrp))) {
20059         if (gcp != NULL) {
20060           gcp->itemID = fcontext.itemID;
20061           gcp->thistype = OBJ_SEQFEAT;
20062         }
20063         vsp->descr = NULL;
20064         vsp->sfp = sfp;
20065 
20066         if (lastsfp != NULL && SeqLocCompare (sfp->location, lastsfp->location) == SLC_A_EQ_B) {
20067           ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_MultiplyAnnotatedGenes, "%s, but feature locations are identical", message);
20068         } else if (vsp->is_small_genome_set && StringISearch (lastsfp->except_text, "trans-splicing") != NULL && StringISearch (sfp->except_text, "trans-splicing") != NULL) {
20069           /* suppress for trans-spliced genes on small genome set */
20070         } else if (FeatureSequencesIdentical (sfp, lastsfp)) {
20071           if (vsp->is_gpipe_in_sep && FeatureHastRNAXref(sfp) && FeatureHastRNAXref(lastsfp)) {
20072             /* suppress for gpipe */
20073           } else {
20074             ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_ReplicatedGeneSequence, "%s, but underlying sequences are identical", message);
20075           }
20076         } else {
20077           if (vsp->is_gpipe_in_sep && FeatureHastRNAXref(sfp) && FeatureHastRNAXref(lastsfp)) {
20078             /* suppress for gpipe */
20079           } else {
20080             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_CollidingGeneNames, "%s", message);
20081           }
20082         }
20083         vsp->sfp = NULL;
20084         if (gcp != NULL) {
20085           gcp->itemID = olditemid;
20086           gcp->thistype = olditemtype;
20087         }
20088       }
20089     }
20090     lastLabel = label;
20091     lastsfp = sfp;
20092     lastgrp = grp;
20093     sfp = SeqMgrGetNextFeatureByLabel (bsp, sfp, SEQFEAT_GENE, 0, &fcontext);
20094   }
20095 
20096   lastLabel = NULL;
20097   lastIsSplitGene = FALSE;
20098   sfp = SeqMgrGetNextGeneByLocusTag (bsp, NULL, &fcontext);
20099   while (sfp != NULL) {
20100     label = NULL;
20101     isSplitGene = FALSE;
20102     if (sfp->data.choice == SEQFEAT_GENE) {
20103       grp = (GeneRefPtr) sfp->data.value.ptrvalue;
20104       if (grp != NULL) {
20105         label = grp->locus_tag;
20106       }
20107       if (sfp->excpt) {
20108         if (StringStr (sfp->except_text, "gene split at ") != NULL) {
20109           isSplitGene = TRUE;
20110         }
20111       }
20112     }
20113     if (isSplitGene && label == NULL) {
20114       if (gcp != NULL) {
20115         gcp->itemID = fcontext.itemID;
20116         gcp->thistype = OBJ_SEQFEAT;
20117       }
20118       vsp->descr = NULL;
20119       vsp->sfp = sfp;
20120       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ExceptionProblem, "Gene has split exception but no locus_tag");
20121       vsp->sfp = NULL;
20122       if (gcp != NULL) {
20123         gcp->itemID = olditemid;
20124         gcp->thistype = olditemtype;
20125       }
20126     }
20127     if (lastLabel != NULL) {
20128       message = NULL;
20129       if (StringCmp (lastLabel, label) == 0) {
20130         message = "Colliding locus_tags in gene features";
20131       } else if (StringICmp (lastLabel, label) == 0) {
20132         message = "Colliding locus_tags (with different capitalization) in gene features";
20133       }
20134       if (message != NULL) {
20135         if (isSplitGene && lastIsSplitGene) {
20136           /* suppress if colliding locus_tags have split gene exception */
20137         } else {
20138           if (gcp != NULL) {
20139             gcp->itemID = fcontext.itemID;
20140             gcp->thistype = OBJ_SEQFEAT;
20141           }
20142           vsp->descr = NULL;
20143           vsp->sfp = sfp;
20144           ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_CollidingLocusTags, "%s", message);
20145           vsp->sfp = NULL;
20146           if (gcp != NULL) {
20147             gcp->itemID = olditemid;
20148             gcp->thistype = olditemtype;
20149           }
20150         }
20151       }
20152     }
20153     lastLabel = label;
20154     lastIsSplitGene = isSplitGene;
20155     sfp = SeqMgrGetNextGeneByLocusTag (bsp, sfp, &fcontext);
20156   }
20157 
20158   utr5plus = NULL;
20159   cdsplus = NULL;
20160   utr3plus = NULL;
20161   utr5minus = NULL;
20162   cdsminus = NULL;
20163   utr3minus = NULL;
20164 
20165   utr5pgene = NULL;
20166   cdspgene = NULL;
20167   utr3pgene = NULL;
20168   utr5mgene = NULL;
20169   cdsmgene = NULL;
20170   utr3mgene = NULL;
20171 
20172   sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
20173   while (sfp != NULL) {
20174     strand = fcontext.strand;
20175     if (sfp->idx.subtype == FEATDEF_CDS) {
20176       if (strand == Seq_strand_minus) {
20177         cdsminus = sfp;
20178         cdsmgene = SeqMgrGetOverlappingGene (sfp->location, NULL);
20179       } else {
20180         cdsplus = sfp;
20181         cdspgene = SeqMgrGetOverlappingGene (sfp->location, NULL);
20182       }
20183     } else if (sfp->idx.subtype == FEATDEF_5UTR) {
20184       if (strand == Seq_strand_minus) {
20185         utr5minus = sfp;
20186         utr5mgene = SeqMgrGetOverlappingGene (sfp->location, NULL);
20187         if (utr3minus != NULL && cdsminus == NULL && utr3mgene == utr5mgene && utr5mgene != NULL) {
20188           if (gcp != NULL) {
20189             gcp->itemID = utr5mgene->idx.itemID;
20190             gcp->thistype = OBJ_SEQFEAT;
20191           }
20192           vsp->descr = NULL;
20193           vsp->sfp = sfp;
20194           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_CDSnotBetweenUTRs, "CDS not between 5'UTR and 3'UTR on minus strand");
20195           if (gcp != NULL) {
20196             gcp->itemID = olditemid;
20197             gcp->thistype = olditemtype;
20198           }
20199         }
20200         utr5minus = NULL;
20201         cdsminus = NULL;
20202         utr3minus = NULL;
20203         utr5mgene = NULL;
20204         cdsmgene = NULL;
20205         utr3mgene = NULL;
20206       } else {
20207         utr5plus = sfp;
20208         utr5pgene = SeqMgrGetOverlappingGene (sfp->location, NULL);
20209       }
20210     } else if (sfp->idx.subtype == FEATDEF_3UTR) {
20211       if (strand == Seq_strand_minus) {
20212         utr3minus = sfp;
20213         utr3mgene = SeqMgrGetOverlappingGene (sfp->location, NULL);
20214       } else {
20215         utr3plus = sfp;
20216         utr3pgene = SeqMgrGetOverlappingGene (sfp->location, NULL);
20217         if (utr5plus != NULL && cdsplus == NULL && utr5pgene == utr3pgene && utr3pgene != NULL) {
20218           if (gcp != NULL) {
20219             gcp->itemID = utr3pgene->idx.itemID;
20220             gcp->thistype = OBJ_SEQFEAT;
20221           }
20222           vsp->descr = NULL;
20223           vsp->sfp = sfp;
20224           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_CDSnotBetweenUTRs, "CDS not between 5'UTR and 3'UTR on plus strand");
20225           if (gcp != NULL) {
20226             gcp->itemID = olditemid;
20227             gcp->thistype = olditemtype;
20228           }
20229         }
20230         utr5plus = NULL;
20231         cdsplus = NULL;
20232         utr3plus = NULL;
20233         utr5minus = NULL;
20234         cdsminus = NULL;
20235         utr3minus = NULL;
20236       }
20237     }
20238     sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
20239   }
20240 
20241   /* do UTR vs. CDS check on genomic if only one CDS, still need separate minus strand logic */
20242   cdscount = 0;
20243   genecount = 0;
20244   utr5count = 0;
20245   utr3count = 0;
20246   cdsgene = NULL;
20247   utr5gene = NULL;
20248   utr3gene = NULL;
20249   strand = Seq_strand_plus;
20250   sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
20251   while (sfp != NULL /* && cdscount < 2 && genecount < 2 */) {
20252     if (sfp->idx.subtype == FEATDEF_CDS) {
20253       strand = fcontext.strand;
20254       cdscount++;
20255       cdsgene = SeqMgrGetOverlappingGene (sfp->location, NULL);
20256     } else if (sfp->idx.subtype == FEATDEF_GENE) {
20257       genecount++;
20258     } else if (sfp->idx.subtype == FEATDEF_5UTR) {
20259       utr5count++;
20260       utr5gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
20261     } else if (sfp->idx.subtype == FEATDEF_3UTR) {
20262       utr3count++;
20263       utr3gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
20264     }
20265     sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
20266   }
20267   if (bvsp->is_mrna || cdscount == 1 && genecount < 2) {
20268     if (bvsp->is_mrna) {
20269       strand = Seq_strand_plus;
20270     }
20271     fiveUTRright = 0;
20272     cdsRight = 0;
20273     threeUTRright = 0;
20274     firstCDS = TRUE;
20275 
20276     if (strand == Seq_strand_minus) {
20277 
20278       sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
20279       while (sfp != NULL) {
20280         if (gcp != NULL) {
20281           gcp->itemID = fcontext.itemID;
20282           gcp->thistype = OBJ_SEQFEAT;
20283         }
20284         vsp->descr = NULL;
20285         vsp->sfp = sfp;
20286         if (sfp->idx.subtype == FEATDEF_3UTR && utr3count < 2) {
20287           if (fcontext.strand != Seq_strand_minus) {
20288             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "3'UTR is not on minus strand");
20289           } else if (threeUTRright > 0) {
20290             if (threeUTRright + 1 != fcontext.left) {
20291               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "Previous 3'UTR does not abut next 3'UTR");
20292             }
20293           }
20294           threeUTRright = fcontext.right;
20295         } else if (sfp->idx.subtype == FEATDEF_CDS) {
20296           cdsRight = fcontext.right;
20297           if (threeUTRright > 0 && firstCDS) {
20298             if (threeUTRright + 1 != fcontext.left) {
20299               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "CDS does not abut 3'UTR");
20300             }
20301           }
20302           firstCDS = FALSE;
20303         } else if (sfp->idx.subtype == FEATDEF_5UTR && utr5count < 2) {
20304           if (fcontext.strand != Seq_strand_minus) {
20305             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "5'UTR is not on minus strand");
20306           } else if (cdsRight > 0) {
20307             if (cdsRight + 1 != fcontext.left) {
20308               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "5'UTR does not abut CDS");
20309             }
20310           }
20311           threeUTRright = fcontext.right;
20312         }
20313         sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
20314       }
20315 
20316     } else {
20317 
20318       sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
20319       while (sfp != NULL) {
20320         if (gcp != NULL) {
20321           gcp->itemID = fcontext.itemID;
20322           gcp->thistype = OBJ_SEQFEAT;
20323         }
20324         vsp->descr = NULL;
20325         vsp->sfp = sfp;
20326         if (sfp->idx.subtype == FEATDEF_5UTR && utr5count < 2) {
20327           if (fcontext.strand == Seq_strand_minus) {
20328             if (genecount > 1 && cdsgene != NULL && utr5gene != NULL && cdsgene != utr5gene) {
20329               /* ignore */
20330             } else {
20331               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "5'UTR is not on plus strand");
20332             }
20333           }
20334           fiveUTRright = fcontext.right;
20335         } else if (sfp->idx.subtype == FEATDEF_CDS) {
20336           cdsRight = fcontext.right;
20337           if (fiveUTRright > 0 && firstCDS) {
20338             if (fiveUTRright + 1 != fcontext.left) {
20339               if (genecount > 1 && cdsgene != NULL && utr5gene != NULL && cdsgene != utr5gene) {
20340                 /* ignore */
20341               } else {
20342                 ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "5'UTR does not abut CDS");
20343               }
20344             }
20345           }
20346           firstCDS = FALSE;
20347         } else if (sfp->idx.subtype == FEATDEF_3UTR && utr3count < 2) {
20348           if (fcontext.strand == Seq_strand_minus) {
20349             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "3'UTR is not on plus strand");
20350           } else if (threeUTRright > 0) {
20351             if (threeUTRright + 1 != fcontext.left) {
20352               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "Previous 3'UTR does not abut next 3'UTR");
20353             }
20354           } else if (cdsRight > 0) {
20355             if (cdsRight + 1 != fcontext.left) {
20356               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotAbutCDS, "CDS does not abut 3'UTR");
20357             }
20358           }
20359           if (bvsp->is_mrna && cdscount == 1 && utr3count == 1 && fcontext.right != bsp->length - 1) {
20360             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UTRdoesNotExtendToEnd, "3'UTR does not extend to end of mRNA");
20361           }
20362           threeUTRright = fcontext.right;
20363         }
20364         sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
20365       }
20366     }
20367   }
20368 
20369   if (bvsp->is_mrna) {
20370     sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &fcontext);
20371     while (sfp != NULL) {
20372       if (SeqLocStrand (sfp->location) == Seq_strand_minus) {
20373         if (gcp != NULL) {
20374           gcp->itemID = fcontext.itemID;
20375           gcp->thistype = OBJ_SEQFEAT;
20376         }
20377         vsp->descr = NULL;
20378         vsp->sfp = sfp;
20379         ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_CDSonMinusStrandMRNA, "CDS should not be on minus strand of mRNA molecule");
20380       }
20381       sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_CDREGION, 0, &fcontext);
20382     }
20383   }
20384 
20385   if (! bvsp->is_mrna) {
20386     last = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_RNA, 0, &fcontext);
20387     if (last != NULL) {
20388       lastrnatype = WhichRNA (last);
20389       left = fcontext.left;
20390       right = fcontext.right;
20391       strand = fcontext.strand;
20392       partialL = fcontext.partialL;
20393       partialR = fcontext.partialR;
20394       sfp = SeqMgrGetNextFeature (bsp, last, SEQFEAT_RNA, 0, &fcontext);
20395       while (sfp != NULL) {
20396         if (sfp->idx.subtype == FEATDEF_rRNA || sfp->idx.subtype == FEATDEF_otherRNA) {
20397           thisrnatype = WhichRNA (sfp);
20398           if (fcontext.strand == strand || (strand != Seq_strand_minus && fcontext.strand != Seq_strand_minus)) {
20399             if (lastrnatype != 0 && thisrnatype != 0) {
20400               if (right + 1 < fcontext.left) {
20401                 /* gap */
20402                 if (BaseRangeIsVirtual (bsp, right + 1, fcontext.left)) {
20403                   /* ignore if abuts gap */
20404                 } else if (strand == Seq_strand_minus) {
20405                   if ((lastrnatype == RIGHT_RIBOSOMAL_SUBUNIT && (thisrnatype == INTERNAL_SPACER_2 || thisrnatype == INTERNAL_SPACER_X)) ||
20406                       (lastrnatype == INTERNAL_SPACER_2 && thisrnatype == MIDDLE_RIBOSOMAL_SUBUNIT) ||
20407                       (lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_1) ||
20408                       (lastrnatype == INTERNAL_SPACER_1 && thisrnatype == LEFT_RIBOSOMAL_SUBUNIT) ||
20409                       (lastrnatype == INTERNAL_SPACER_X && thisrnatype == LEFT_RIBOSOMAL_SUBUNIT)) {
20410                     if (gcp != NULL) {
20411                       gcp->itemID = fcontext.itemID;
20412                       gcp->thistype = OBJ_SEQFEAT;
20413                     }
20414                     vsp->descr = NULL;
20415                     vsp->sfp = sfp;
20416                     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ITSdoesNotAbutRRNA, "ITS does not abut adjacent rRNA component");
20417                   }
20418                 } else {
20419                   if ((lastrnatype == LEFT_RIBOSOMAL_SUBUNIT && (thisrnatype == INTERNAL_SPACER_1 || thisrnatype == INTERNAL_SPACER_X)) ||
20420                       (lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_2) ||
20421                       (lastrnatype == INTERNAL_SPACER_1 && thisrnatype == MIDDLE_RIBOSOMAL_SUBUNIT) ||
20422                       (lastrnatype == INTERNAL_SPACER_2 && thisrnatype == RIGHT_RIBOSOMAL_SUBUNIT) ||
20423                       (lastrnatype == INTERNAL_SPACER_X && thisrnatype == RIGHT_RIBOSOMAL_SUBUNIT)) {
20424                     if (gcp != NULL) {
20425                       gcp->itemID = fcontext.itemID;
20426                       gcp->thistype = OBJ_SEQFEAT;
20427                     }
20428                     vsp->descr = NULL;
20429                     vsp->sfp = sfp;
20430                     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ITSdoesNotAbutRRNA, "ITS does not abut adjacent rRNA component");
20431                   }
20432                 }
20433               } else if (right + 1 > fcontext.left) {
20434                 /* overlaps */
20435                 if (strand == Seq_strand_minus) {
20436                   if ((lastrnatype == RIGHT_RIBOSOMAL_SUBUNIT && (thisrnatype == INTERNAL_SPACER_2 || thisrnatype == INTERNAL_SPACER_X)) ||
20437                       (lastrnatype == INTERNAL_SPACER_2 && thisrnatype == MIDDLE_RIBOSOMAL_SUBUNIT) ||
20438                       (lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_1) ||
20439                       (lastrnatype == INTERNAL_SPACER_1 && thisrnatype == LEFT_RIBOSOMAL_SUBUNIT) ||
20440                       (lastrnatype == INTERNAL_SPACER_X && thisrnatype == LEFT_RIBOSOMAL_SUBUNIT)) {
20441                     if (gcp != NULL) {
20442                       gcp->itemID = fcontext.itemID;
20443                       gcp->thistype = OBJ_SEQFEAT;
20444                     }
20445                     vsp->descr = NULL;
20446                     vsp->sfp = sfp;
20447                     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadRRNAcomponentOverlap, "ITS overlaps adjacent rRNA component");
20448                   }
20449                 } else {
20450                   if ((lastrnatype == LEFT_RIBOSOMAL_SUBUNIT && (thisrnatype == INTERNAL_SPACER_1 || thisrnatype == INTERNAL_SPACER_X)) ||
20451                       (lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT && thisrnatype == INTERNAL_SPACER_2) ||
20452                       (lastrnatype == INTERNAL_SPACER_1 && thisrnatype == MIDDLE_RIBOSOMAL_SUBUNIT) ||
20453                       (lastrnatype == INTERNAL_SPACER_2 && thisrnatype == RIGHT_RIBOSOMAL_SUBUNIT) ||
20454                       (lastrnatype == INTERNAL_SPACER_X && thisrnatype == RIGHT_RIBOSOMAL_SUBUNIT)) {
20455                     if (gcp != NULL) {
20456                       gcp->itemID = fcontext.itemID;
20457                       gcp->thistype = OBJ_SEQFEAT;
20458                     }
20459                     vsp->descr = NULL;
20460                     vsp->sfp = sfp;
20461                     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadRRNAcomponentOverlap, "ITS overlaps adjacent rRNA component");
20462                   }
20463                 }
20464               } else {
20465                 /* abuts */
20466                 if (strand == Seq_strand_minus) {
20467                   if (lastrnatype == thisrnatype && partialL && fcontext.partialR && bsp->repr == Seq_repr_seg) {
20468                     /* okay in segmented set */
20469                   } else if ((lastrnatype == RIGHT_RIBOSOMAL_SUBUNIT && (thisrnatype != INTERNAL_SPACER_2 && thisrnatype != INTERNAL_SPACER_X)) ||
20470                       (lastrnatype == INTERNAL_SPACER_2 && thisrnatype != MIDDLE_RIBOSOMAL_SUBUNIT) ||
20471                       (lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT && thisrnatype != INTERNAL_SPACER_1) ||
20472                       (lastrnatype == INTERNAL_SPACER_1 && thisrnatype != LEFT_RIBOSOMAL_SUBUNIT) ||
20473                       (lastrnatype == INTERNAL_SPACER_X && thisrnatype != LEFT_RIBOSOMAL_SUBUNIT)) {
20474                     if (gcp != NULL) {
20475                       gcp->itemID = fcontext.itemID;
20476                       gcp->thistype = OBJ_SEQFEAT;
20477                     }
20478                     vsp->descr = NULL;
20479                     vsp->sfp = sfp;
20480                     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadRRNAcomponentOrder, "Problem with order of abutting rRNA components");
20481                   }
20482                 } else {
20483                   if (lastrnatype == thisrnatype && partialR && fcontext.partialL && bsp->repr == Seq_repr_seg) {
20484                     /* okay in segmented set */
20485                   } else if ((lastrnatype == LEFT_RIBOSOMAL_SUBUNIT && (thisrnatype != INTERNAL_SPACER_1 && thisrnatype != INTERNAL_SPACER_X)) ||
20486                       (lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT && thisrnatype != INTERNAL_SPACER_2) ||
20487                       (lastrnatype == INTERNAL_SPACER_1 && thisrnatype != MIDDLE_RIBOSOMAL_SUBUNIT) ||
20488                       (lastrnatype == INTERNAL_SPACER_2 && thisrnatype != RIGHT_RIBOSOMAL_SUBUNIT) ||
20489                       (lastrnatype == INTERNAL_SPACER_X && thisrnatype != RIGHT_RIBOSOMAL_SUBUNIT)) {
20490                     if (gcp != NULL) {
20491                       gcp->itemID = fcontext.itemID;
20492                       gcp->thistype = OBJ_SEQFEAT;
20493                     }
20494                     vsp->descr = NULL;
20495                     vsp->sfp = sfp;
20496                     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadRRNAcomponentOrder, "Problem with order of abutting rRNA components");
20497                   }
20498                 }
20499               }
20500             }
20501           } else {
20502             if (lastrnatype != 0 && thisrnatype != 0) {
20503               if ((lastrnatype == LEFT_RIBOSOMAL_SUBUNIT || lastrnatype == MIDDLE_RIBOSOMAL_SUBUNIT || lastrnatype == RIGHT_RIBOSOMAL_SUBUNIT) &&
20504                   (thisrnatype == LEFT_RIBOSOMAL_SUBUNIT || thisrnatype == MIDDLE_RIBOSOMAL_SUBUNIT || thisrnatype == RIGHT_RIBOSOMAL_SUBUNIT)) {
20505               } else if ((lastrnatype == INTERNAL_SPACER_1 || lastrnatype == INTERNAL_SPACER_2 || lastrnatype == INTERNAL_SPACER_X) &&
20506                   (thisrnatype == INTERNAL_SPACER_1 || thisrnatype == INTERNAL_SPACER_2 || thisrnatype == INTERNAL_SPACER_X)) {
20507               } else {
20508                 if (gcp != NULL) {
20509                   gcp->itemID = fcontext.itemID;
20510                   gcp->thistype = OBJ_SEQFEAT;
20511                 }
20512                 vsp->descr = NULL;
20513                 vsp->sfp = sfp;
20514                 ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InconsistentRRNAstrands, "Inconsistent strands for rRNA components");
20515               }
20516             }
20517           }
20518           last = sfp;
20519           left = fcontext.left;
20520           right = fcontext.right;
20521           strand = fcontext.strand;
20522           partialL = fcontext.partialL;
20523           partialR = fcontext.partialR;
20524           lastrnatype = thisrnatype;
20525         }
20526         sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_RNA, 0, &fcontext);
20527       }
20528     }
20529   }
20530 
20531   vsp->sfp = NULL;
20532   if (gcp != NULL) {
20533     gcp->itemID = olditemid;
20534     gcp->thistype = olditemtype;
20535   }
20536 
20537   CheckForGenesInconsistent(bsp, vsp);
20538 
20539   if (ISA_na (bsp->mol)) {
20540     sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_gap, &fcontext);
20541     while (sfp != NULL) {
20542       estimated_length = 0;
20543       for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
20544         if (StringICmp (gbq->qual, "estimated_length") != 0) continue;
20545         if (StringHasNoText (gbq->val)) continue;
20546         if (StringICmp (gbq->val, "unknown") == 0) continue;
20547         if (sscanf (gbq->val, "%ld", &val) == 1) {
20548           estimated_length = val;
20549         }
20550       }
20551       if (StreamCacheSetup (NULL, sfp->location, EXPAND_GAPS_TO_DASHES, &sc)) {
20552         dashes = 0;
20553         Ns = 0;
20554         realBases = 0;
20555         while ((res = StreamCacheGetResidue (&sc)) != '\0') {
20556           if (IS_LOWER (res)) {
20557             res = TO_UPPER (res);
20558           }
20559           if (res == '-') {
20560             dashes++;
20561           } else if (res == 'N') {
20562             Ns++;
20563           } else {
20564             realBases++;
20565           }
20566         }
20567         if (gcp != NULL) {
20568           gcp->itemID = fcontext.itemID;
20569           gcp->thistype = OBJ_SEQFEAT;
20570         }
20571         vsp->descr = NULL;
20572         vsp->sfp = sfp;
20573         loclen = SeqLocLen (sfp->location);
20574         if (estimated_length > 0 && estimated_length != loclen) {
20575           ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_GapFeatureProblem, "Gap feature estimated_length %ld does not match %ld feature length",
20576                     (long) estimated_length, (long) loclen);
20577         } else if (realBases > 0 && Ns > 0) {
20578           ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_GapFeatureProblem, "Gap feature over %ld real bases and %ld Ns", (long) realBases, (long) Ns);
20579         } else if (realBases > 0) {
20580           ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_GapFeatureProblem, "Gap feature over %ld real bases", (long) realBases);
20581         } else if (Ns > 0) {
20582           ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_GapFeatureProblem, "Gap feature over %ld Ns", (long) Ns);
20583         } else if (estimated_length > 0 && dashes != estimated_length) {
20584           ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_GapFeatureProblem, "Gap feature estimated_length %ld does not match %ld gap characters",
20585                     (long) estimated_length, (long) dashes);
20586         }
20587       }
20588       sfp = SeqMgrGetNextFeature (bsp, sfp, 0, FEATDEF_gap, &fcontext);
20589     }
20590   }
20591   if (gcp != NULL) {
20592     gcp->itemID = olditemid;
20593     gcp->thistype = olditemtype;
20594   }
20595   vsp->descr = NULL;
20596   vsp->sfp = NULL;
20597 
20598   if (ISA_aa (bsp->mol)) {
20599     sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
20600     while (sfp != NULL) {
20601       slp = SeqLocFindNext (sfp->location, NULL);
20602       while (slp != NULL) {
20603         if (SeqLocStrand (slp) == Seq_strand_minus) {
20604           if (gcp != NULL) {
20605             gcp->itemID = fcontext.itemID;
20606             gcp->thistype = OBJ_SEQFEAT;
20607           }
20608           vsp->descr = NULL;
20609           vsp->sfp = sfp;
20610           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_MinusStrandProtein, "Feature on protein indicates negative strand");
20611         }
20612         slp = SeqLocFindNext (sfp->location, slp);
20613       }
20614       sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
20615     }
20616   }
20617   if (gcp != NULL) {
20618     gcp->itemID = olditemid;
20619     gcp->thistype = olditemtype;
20620   }
20621   vsp->descr = NULL;
20622   vsp->sfp = NULL;
20623 
20624   lastbiop = NULL;
20625   lastsfp = NULL;
20626   numBadFullSource = 0;
20627   sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_BIOSRC, 0, &fcontext);
20628   if (sfp != NULL) {
20629     if (fcontext.left == 0 && fcontext.right == bsp->length - 1 && fcontext.numivals == 1) {
20630       showBadFullSource = TRUE;
20631       sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
20632       if (sdp != NULL) {
20633         biop = (BioSourcePtr) sdp->data.ptrvalue;
20634         if (biop != NULL) {
20635           if (biop->is_focus) {
20636             showBadFullSource = FALSE;
20637           }
20638           for (sbsp = biop->subtype; sbsp != NULL; sbsp = sbsp->next) {
20639             if (sbsp->subtype == SUBSRC_transgenic) {
20640               showBadFullSource = FALSE;
20641             }
20642           }
20643         }
20644       }
20645       if (showBadFullSource) {
20646         if (gcp != NULL) {
20647           gcp->itemID = fcontext.itemID;
20648           gcp->thistype = OBJ_SEQFEAT;
20649         }
20650         vsp->descr = NULL;
20651         vsp->sfp = sfp;
20652         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadFullLengthFeature, "Source feature is full length, should be descriptor");
20653         vsp->sfp = NULL;
20654         if (gcp != NULL) {
20655           gcp->itemID = olditemid;
20656           gcp->thistype = olditemtype;
20657         }
20658       }
20659     }
20660   }
20661   /* and fall through to continue testing first and remaining source features */
20662   while (sfp != NULL) {
20663     if (fcontext.left == 0 && fcontext.right == bsp->length - 1 && fcontext.numivals == 1) {
20664       numBadFullSource++;
20665       if (numBadFullSource > 1) {
20666         if (gcp != NULL) {
20667           gcp->itemID = fcontext.itemID;
20668           gcp->thistype = OBJ_SEQFEAT;
20669         }
20670         vsp->descr = NULL;
20671         vsp->sfp = sfp;
20672         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadFullLengthFeature, "Multiple full-length source features, should only be one if descriptor is transgenic");
20673         vsp->sfp = NULL;
20674         if (gcp != NULL) {
20675           gcp->itemID = olditemid;
20676           gcp->thistype = olditemtype;
20677         }
20678       }
20679     }
20680     biop = (BioSourcePtr) sfp->data.value.ptrvalue;
20681     if (biop != NULL && lastbiop != NULL) {
20682       if (lastsfp != NULL) {
20683         if (StringDoesHaveText (lastsfp->comment) && StringDoesHaveText (sfp->comment) && StringICmp (lastsfp->comment, sfp->comment) != 0) {
20684           /* different comments, so ignore */
20685         } else if (IsIdenticalBioSource (biop, lastbiop) && (! bvsp->is_synthetic) && (!bvsp->is_artificial)
20686                    && !SuppressMultipleEquivBioSources(biop)) {
20687           if (gcp != NULL) {
20688             gcp->itemID = fcontext.itemID;
20689             gcp->thistype = OBJ_SEQFEAT;
20690           }
20691           vsp->descr = NULL;
20692           vsp->sfp = sfp;
20693           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_MultipleEquivBioSources, "Multiple equivalent source features should be combined into one multi-interval feature");
20694           vsp->sfp = NULL;
20695           if (gcp != NULL) {
20696             gcp->itemID = olditemid;
20697             gcp->thistype = olditemtype;
20698           }
20699         }
20700       }
20701     }
20702     lastbiop = biop;
20703     lastsfp = sfp;
20704     sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_BIOSRC, 0, &fcontext);
20705   }
20706 
20707   lastpdp = NULL;
20708   lastsfp = NULL;
20709   sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_PUB, 0, &fcontext);
20710   if (sfp != NULL) {
20711     if (fcontext.left == 0 && fcontext.right == bsp->length - 1 && fcontext.numivals == 1) {
20712       if (gcp != NULL) {
20713         gcp->itemID = fcontext.itemID;
20714         gcp->thistype = OBJ_SEQFEAT;
20715       }
20716       vsp->descr = NULL;
20717       vsp->sfp = sfp;
20718       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadFullLengthFeature, "Publication feature is full length, should be descriptor");
20719       vsp->sfp = NULL;
20720       if (gcp != NULL) {
20721         gcp->itemID = olditemid;
20722         gcp->thistype = olditemtype;
20723       }
20724     }
20725   }
20726   while (sfp != NULL) {
20727     pdp = (PubdescPtr) sfp->data.value.ptrvalue;
20728     if (pdp != NULL && lastpdp != NULL) {
20729       if (lastsfp != NULL) {
20730         if (StringDoesHaveText (lastsfp->comment) && StringDoesHaveText (sfp->comment) && StringICmp (lastsfp->comment, sfp->comment) != 0) {
20731           /* different comments, so ignore */
20732         } else if (IsIdenticalPublication (pdp, lastpdp)) {
20733           if (gcp != NULL) {
20734             gcp->itemID = fcontext.itemID;
20735             gcp->thistype = OBJ_SEQFEAT;
20736           }
20737           vsp->descr = NULL;
20738           vsp->sfp = sfp;
20739           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_MultipleEquivPublications, "Multiple equivalent publication features should be combined into one multi-interval feature");
20740           vsp->sfp = NULL;
20741           if (gcp != NULL) {
20742             gcp->itemID = olditemid;
20743             gcp->thistype = olditemtype;
20744           }
20745         }
20746       }
20747     }
20748     lastpdp = pdp;
20749     lastsfp = sfp;
20750     sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_PUB, 0, &fcontext);
20751   }
20752 
20753   SeqMgrExploreDescriptors (bsp, (Pointer) bvsp, ValidateSeqDescrIndexed, NULL);
20754 
20755   if (gcp != NULL) {
20756     omdp = ObjMgrGetData (gcp->entityID);
20757     if (omdp != NULL && omdp->datatype == OBJ_SEQSUB) {
20758       ssp = (SeqSubmitPtr) omdp->dataptr;
20759       if (ssp != NULL) {
20760         sbp = ssp->sub;
20761         if (sbp != NULL) {
20762           bvsp->got_a_pub = TRUE;
20763         }
20764       }
20765     }
20766   }
20767 
20768   ValidateCDSmRNAmatch (vsp, bsp, numgene, numcds, nummrna);
20769 
20770   if (vsp->locusTagGeneralMatch) {
20771     ValidateLocusTagGeneral (vsp, bsp);
20772   }
20773 
20774   if (ISA_na (bsp->mol) && SeqMgrGetParentOfPart (bsp, NULL) == NULL) {
20775     LookForMultipleUnpubPubs (vsp, gcp, bsp);
20776   }
20777 
20778   if (bsp->repr == Seq_repr_delta && ISA_na (bsp->mol)) {
20779     CheckBioseqForFeatsInGap (bsp, vsp);
20780   }
20781 
20782 if (! vsp->debugTestDuJour) {
20783   if (bsp->repr == Seq_repr_raw || (bsp->repr == Seq_repr_delta && DeltaLitOnly (bsp))) {
20784     CheckBioseqForFeatsInNs (bsp, vsp);
20785   }
20786 }
20787 
20788   CheckForNonViralComplete (bsp, vsp, gcp);
20789 
20790   LookForViralMolInfoInLineage (bsp, vsp, gcp);
20791 
20792   return TRUE;
20793 }
20794 
20795 //LCOV_EXCL_START
ValidateBioseqContextGather(GatherContextPtr gcp)20796 static Boolean ValidateBioseqContextGather (GatherContextPtr gcp)
20797 {
20798   ValidStructPtr  vsp;
20799   BioseqValidStrPtr bvsp;
20800   CitSubPtr       csp;
20801 
20802   bvsp = (BioseqValidStrPtr) (gcp->userdata);
20803   vsp = bvsp->vsp;
20804   vsp->descr = NULL;
20805   vsp->sfp = NULL;
20806   vsp->gcp = gcp;               /* needed for ValidErr */
20807 
20808   switch (gcp->thistype) {
20809   case OBJ_SEQFEAT:
20810     ValidateSeqFeatContext (gcp);
20811     break;
20812   case OBJ_SEQDESC:
20813     ValidateSeqDescrContext (gcp);
20814     break;
20815   case OBJ_SEQSUB_CIT:
20816     bvsp->got_a_pub = TRUE;
20817     csp = (CitSubPtr) gcp->thisitem;
20818     ValidateCitSub (vsp, csp);
20819     break;
20820   default:
20821     break;
20822   }
20823   return TRUE;
20824 }
20825 //LCOV_EXCL_STOP
20826 
20827 
ListFeaturesContainedInLocation(BioseqPtr bsp,SeqLocPtr slp,Uint1 seqfeatChoice,Uint1 featdefChoice)20828 static ValNodePtr ListFeaturesContainedInLocation (BioseqPtr bsp, SeqLocPtr slp, Uint1 seqfeatChoice, Uint1 featdefChoice)
20829 {
20830   ValNodePtr        feat_list = NULL;
20831   SeqMgrFeatContext fcontext;
20832   SeqFeatPtr        sfp;
20833   Int4              loc_left, loc_right, tmp;
20834   Int4              cmp;
20835 
20836   if (bsp == NULL || slp == NULL) return NULL;
20837 
20838   loc_left = SeqLocStart (slp);
20839   loc_right = SeqLocStop (slp);
20840   if (loc_left > loc_right) {
20841     tmp = loc_left;
20842     loc_left = loc_right;
20843     loc_right = tmp;
20844   }
20845   for (sfp = SeqMgrGetNextFeature (bsp, NULL, seqfeatChoice, featdefChoice, &fcontext);
20846        sfp != NULL && fcontext.left <= loc_right;
20847        sfp = SeqMgrGetNextFeature (bsp, sfp, seqfeatChoice, featdefChoice, &fcontext))
20848   {
20849     cmp = SeqLocCompare (sfp->location, slp);
20850     if (cmp == SLC_A_EQ_B || cmp == SLC_A_IN_B)
20851     {
20852       ValNodeAddPointer (&feat_list, OBJ_SEQFEAT, sfp);
20853     }
20854   }
20855   return feat_list;
20856 }
20857 
20858 
20859 typedef struct multigeneoverlap {
20860   SeqFeatPtr gene;
20861   Int4       left;
20862   Int4       right;
20863   Boolean    reported;
20864 } MultiGeneOverlapData, PNTR MultiGeneOverlapPtr;
20865 
MultiGeneOverlapNew(SeqFeatPtr gene,Int4 left,Int4 right)20866 static MultiGeneOverlapPtr MultiGeneOverlapNew (SeqFeatPtr gene, Int4 left, Int4 right)
20867 {
20868   MultiGeneOverlapPtr m;
20869 
20870   m = (MultiGeneOverlapPtr) MemNew (sizeof (MultiGeneOverlapData));
20871   m->gene = gene;
20872   m->left = left;
20873   m->right = right;
20874   m->reported = FALSE;
20875   return m;
20876 }
20877 
/* Releases m with MemFree and returns whatever MemFree returns. */
static MultiGeneOverlapPtr MultiGeneOverlapFree (MultiGeneOverlapPtr m)
{
  return (MultiGeneOverlapPtr) MemFree (m);
}
20883 
20884 
/*
 * Counts the entries of contained_list whose gene location is equal to, or
 * inside, the location of gene (as judged by SeqLocCompare).  When more
 * than four are found, posts ERR_SEQ_FEAT_MultipleGeneOverlap against gene.
 *
 * Does nothing when gene is NULL or contained_list has fewer than two nodes.
 * Note that vsp->gcp entityID / itemID / thistype are overwritten here and
 * not put back; FindMultiGeneOverlaps restores them for the whole pass.
 */
static void ReportContainedGenes (SeqFeatPtr gene, ValNodePtr contained_list, ValidStructPtr vsp)
{
  MultiGeneOverlapPtr  other;
  ValNodePtr           node;
  Int4                 relation;
  Int4                 contained = 0;

  if (gene == NULL) return;
  if (contained_list == NULL || contained_list->next == NULL) return;

  node = contained_list;
  while (node != NULL) {
    other = (MultiGeneOverlapPtr) node->data.ptrvalue;
    relation = SeqLocCompare (gene->location, other->gene->location);
    if (relation == SLC_A_EQ_B || relation == SLC_B_IN_A) {
      contained++;
    }
    node = node->next;
  }

  if (contained <= 4) return;

  /* point the error context at the containing gene before reporting */
  vsp->descr = NULL;
  vsp->sfp = gene;
  vsp->gcp->entityID = gene->idx.entityID;
  vsp->gcp->itemID = gene->idx.itemID;
  vsp->gcp->thistype = OBJ_SEQFEAT;

  ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_MultipleGeneOverlap, "Gene contains %d other genes", contained);
}
20911 
20912 
/*
 * Appends m to *list.  Before appending, every leading entry whose right
 * position lies before m->left is retired: it is first passed to
 * ReportContainedGenes together with the entries that follow it, then
 * unlinked and freed (node and record) with ValNodeFreeData.
 * Retirement stops at the first head entry that still reaches m->left, or
 * whose record pointer is NULL.
 *
 * Does nothing when list, m or vsp is NULL.
 */
static void AddMultiGeneOverlapToList (ValNodePtr PNTR list, MultiGeneOverlapPtr m, ValidStructPtr vsp)
{
  MultiGeneOverlapPtr  head;
  ValNodePtr           retired;

  if (list == NULL || m == NULL || vsp == NULL) return;

  while (*list != NULL) {
    head = (MultiGeneOverlapPtr) (*list)->data.ptrvalue;
    if (head == NULL || head->right >= m->left) break;

    /* the head gene can gain no more candidates, so report on it now */
    ReportContainedGenes (head->gene, (*list)->next, vsp);

    /* detach the head node so that only it is freed */
    retired = *list;
    *list = retired->next;
    retired->next = NULL;
    ValNodeFreeData (retired);
  }

  ValNodeAddPointer (list, 0, m);
}
20950 
20951 
/*
 * Walks the gene features of bsp in the order SeqMgrGetNextFeature returns
 * them, feeding each one to AddMultiGeneOverlapToList, then runs
 * ReportContainedGenes for every gene still on the list that has at least
 * one entry after it.  The working list is freed before returning.
 * Sets vsp->bsp to bsp; vsp is dereferenced without a NULL check.
 */
static void NewFindMultiGeneOverlaps (BioseqPtr bsp, ValidStructPtr vsp)
{
  SeqMgrFeatContext    fcontext;
  MultiGeneOverlapPtr  entry;
  ValNodePtr           pending = NULL;
  ValNodePtr           node;
  SeqFeatPtr           gene;

  vsp->bsp = bsp;

  gene = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, 0, &fcontext);
  while (gene != NULL) {
    entry = MultiGeneOverlapNew (gene, fcontext.left, fcontext.right);
    AddMultiGeneOverlapToList (&pending, entry, vsp);
    gene = SeqMgrGetNextFeature (bsp, gene, SEQFEAT_GENE, 0, &fcontext);
  }

  /* genes never retired during the walk still need their report */
  node = pending;
  while (node != NULL && node->next != NULL) {
    entry = (MultiGeneOverlapPtr) node->data.ptrvalue;
    ReportContainedGenes (entry->gene, node->next, vsp);
    node = node->next;
  }

  ValNodeFreeData (pending);
}
20973 
20974 
20975 /*****************************************************************************
20976 *   FindMultiGeneOverlaps (BioseqPtr bsp, ValidStructPtr vsp)
20977 *
20978 *      This function reports genes that overlap two or more other genes.
20979 *****************************************************************************/
FindMultiGeneOverlaps(BioseqPtr bsp,ValidStructPtr vsp)20980 static void FindMultiGeneOverlaps (BioseqPtr bsp, ValidStructPtr vsp)
20981 {
20982   GatherContextPtr  gcp;
20983   Uint2           oldEntityID, oldItemtype;
20984   Uint4           oldItemID;
20985 
20986   if (bsp == NULL || vsp == NULL || vsp->gcp == NULL) {
20987     return;
20988   }
20989 
20990   gcp = vsp->gcp;
20991 
20992   oldEntityID = gcp->entityID;
20993   oldItemID = gcp->itemID;
20994   oldItemtype = gcp->thistype;
20995 
20996   NewFindMultiGeneOverlaps (bsp, vsp);
20997 
20998   gcp->entityID = oldEntityID;
20999   gcp->itemID = oldItemID;
21000   gcp->thistype = oldItemtype;
21001 
21002 }
21003 
LocationIsFar(SeqLocPtr location)21004 static Boolean LocationIsFar (SeqLocPtr location)
21005 
21006 {
21007   BioseqPtr    bsp;
21008   DeltaSeqPtr  dsp;
21009   Boolean      is_far = FALSE;
21010   SeqLocPtr    loc;
21011   SeqEntryPtr  oldscope;
21012   SeqIdPtr     sip;
21013   SeqLocPtr    slp;
21014 
21015   if (location == NULL) return FALSE;
21016 
21017   oldscope = SeqEntrySetScope (NULL);
21018 
21019   slp = SeqLocFindNext (location, NULL);
21020   while (slp != NULL) {
21021     if (slp->choice != SEQLOC_NULL) {
21022       sip = SeqLocId (slp);
21023       bsp = BioseqFind (sip);
21024       if (bsp == NULL) {
21025         is_far = TRUE;
21026       } else if (bsp->repr == Seq_repr_delta && bsp->seq_ext_type == 4) {
21027         for (dsp = (DeltaSeqPtr) bsp->seq_ext;
21028              dsp != NULL && (! is_far);
21029              dsp = dsp->next) {
21030           if (dsp->choice != 1) continue;
21031           loc = (SeqLocPtr) dsp->data.ptrvalue;
21032           if (loc == NULL) continue;
21033           if (loc->choice == SEQLOC_NULL) continue;
21034           sip = SeqLocId (loc);
21035           bsp = BioseqFind (sip);
21036           if (bsp == NULL) {
21037             is_far = TRUE;
21038           }
21039         }
21040       } else if (bsp->repr == Seq_repr_seg && bsp->seq_ext_type == 1) {
21041 //LCOV_EXCL_START
21042 // Only for SegSets
21043         for (loc = (SeqLocPtr) bsp->seq_ext;
21044              loc != NULL && (! is_far);
21045              loc = loc->next) {
21046           if (loc == NULL) continue;
21047           if (loc->choice == SEQLOC_NULL) continue;
21048           sip = SeqLocId (loc);
21049           bsp = BioseqFind (sip);
21050           if (bsp == NULL) {
21051             is_far = TRUE;
21052           }
21053         }
21054 //LCOV_EXCL_STOP
21055       }
21056     }
21057     slp = SeqLocFindNext (location, slp);
21058   }
21059 
21060   SeqEntrySetScope (oldscope);
21061 
21062   return is_far;
21063 }
21064 
NoFetchFunctions(void)21065 static Boolean NoFetchFunctions (void)
21066 
21067 {
21068   ObjMgrProcPtr  ompp = NULL;
21069 
21070   ompp = ObjMgrProcFindNext (NULL, OMPROC_FETCH, OBJ_SEQID, OBJ_BIOSEQ, NULL);
21071 
21072   return (Boolean) (ompp == NULL);
21073 }
21074 
HasAssemblyOrNullGap(BioseqPtr bsp)21075 static Boolean HasAssemblyOrNullGap (BioseqPtr bsp)
21076 
21077 {
21078   DeltaSeqPtr  dsp;
21079   SeqLitPtr    litp;
21080 
21081   if (bsp == NULL || bsp->repr != Seq_repr_delta) return FALSE;
21082 
21083   for (dsp = (DeltaSeqPtr) bsp->seq_ext; dsp; dsp=dsp->next) {
21084     if (dsp->choice != 2) continue;
21085     litp = (SeqLitPtr) dsp->data.ptrvalue;
21086     if (litp == NULL) continue;
21087     if (litp->seq_data == NULL) return TRUE;
21088     if (litp->seq_data_type == Seq_code_gap) return TRUE;
21089   }
21090 
21091   return FALSE;
21092 }
21093 
/*
 * Posts an ERR_SEQ_INST_SeqGapProblem warning for each literal component
 * (choice 2) of the delta Bioseq bsp that has no SeqLit, no seq_data, or a
 * Seq_code_gap whose SeqGap is NULL or has type 0 or 255.
 * Errors are reported against bsp; the entityID, itemID and thistype of
 * gcp are put back before returning.  Does nothing unless bsp is a delta
 * Bioseq and vsp and gcp are non-NULL.
 */
static void ReportBadAssemblyGap (BioseqPtr bsp, ValidStructPtr vsp, GatherContextPtr gcp)

{
  DeltaSeqPtr  seg;
  SeqLitPtr    lit;
  SeqGapPtr    gap;
  Uint2        savedEntityID, savedItemtype;
  Uint4        savedItemID;

  if (vsp == NULL || gcp == NULL) return;
  if (bsp == NULL || bsp->repr != Seq_repr_delta) return;

  savedEntityID = gcp->entityID;
  savedItemID = gcp->itemID;
  savedItemtype = gcp->thistype;

  /* report against the Bioseq itself */
  vsp->bsp = bsp;
  vsp->descr = NULL;
  vsp->sfp = NULL;
  gcp->entityID = bsp->idx.entityID;
  gcp->itemID = bsp->idx.itemID;
  gcp->thistype = OBJ_BIOSEQ;

  for (seg = (DeltaSeqPtr) bsp->seq_ext; seg != NULL; seg = seg->next) {
    if (seg->choice != 2) continue;

    lit = (SeqLitPtr) seg->data.ptrvalue;
    if (lit == NULL) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SeqGapProblem, "TSA gap not assembly_gap");
      continue;
    }
    if (lit->seq_data == NULL) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SeqGapProblem, "TSA Seq_data NULL");
      continue;
    }
    if (lit->seq_data_type != Seq_code_gap) continue;

    gap = (SeqGapPtr) lit->seq_data;
    if (gap == NULL) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SeqGapProblem, "TSA Seq_gap NULL");
    } else if (gap->type == 0) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SeqGapProblem, "TSA Seq_gap.unknown");
    } else if (gap->type == 255) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_SeqGapProblem, "TSA Seq_gap.other");
    }
  }

  gcp->entityID = savedEntityID;
  gcp->itemID = savedItemID;
  gcp->thistype = savedItemtype;
}
21140 
21141 
/*
 * For a delta Bioseq whose MolInfo tech is MI_TECH_wgs, posts one
 * SEV_ERROR ERR_SEQ_INST_SeqGapProblem when a literal component has no
 * SeqLit or no seq_data, or is a Seq_code_gap without linkage evidence.
 *
 * Nothing is checked when bsp carries a SEQID_DDBJ, SEQID_EMBL or
 * SEQID_OTHER id, when bsp is not a delta Bioseq, or when vsp or gcp is
 * NULL.  The error is reported against bsp; the entityID, itemID and
 * thistype of gcp are put back afterwards.
 */
static void ReportBadWGSGap(BioseqPtr bsp, ValidStructPtr vsp, GatherContextPtr gcp)

{
  DeltaSeqPtr        dsp;
  SeqLitPtr          litp;
  Uint2              oldEntityID, oldItemtype;
  Uint4              oldItemID;
  SeqGapPtr          sgp;
  SeqDescrPtr        sdp;
  SeqMgrDescContext  context;
  MolInfoPtr         mip;
  SeqIdPtr           sip;
  Boolean            linkage_evidence_missing = FALSE;

  if (bsp == NULL || bsp->repr != Seq_repr_delta) return;
  if (vsp == NULL || gcp == NULL) return;

  /* only WGS records are subject to this check */
  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &context);
  if (sdp == NULL) return;
  mip = (MolInfoPtr) sdp->data.ptrvalue;
  if (mip == NULL || mip->tech != MI_TECH_wgs) return;

  for (sip = bsp->id; sip != NULL; sip = sip->next) {
    if (sip->choice == SEQID_DDBJ || sip->choice == SEQID_EMBL || sip->choice == SEQID_OTHER) {
      return;
    }
  }

  for (dsp = (DeltaSeqPtr) bsp->seq_ext; dsp != NULL && !linkage_evidence_missing; dsp = dsp->next) {
    /*
     * NOTE(review): a non-literal component abandons the whole check rather
     * than being skipped, so the outcome depends on component order.  This
     * was changed from "continue" on purpose; confirm it is still wanted.
     */
    if (dsp->choice != 2) /* continue */ return;
    litp = (SeqLitPtr) dsp->data.ptrvalue;
    if (litp == NULL || litp->seq_data == NULL) {
      linkage_evidence_missing = TRUE;
    } else if (litp->seq_data_type == Seq_code_gap) {
      /* seq_data is known to be non-NULL here */
      sgp = (SeqGapPtr) litp->seq_data;
      if (sgp->linkage_evidence == NULL) {
        linkage_evidence_missing = TRUE;
      }
    }
  }

  if (linkage_evidence_missing) {
    oldEntityID = gcp->entityID;
    oldItemID = gcp->itemID;
    oldItemtype = gcp->thistype;

    vsp->bsp = bsp;
    vsp->descr = NULL;
    vsp->sfp = NULL;
    gcp->entityID = bsp->idx.entityID;
    gcp->itemID = bsp->idx.itemID;
    gcp->thistype = OBJ_BIOSEQ;

    ValidErr(vsp, SEV_ERROR, ERR_SEQ_INST_SeqGapProblem, "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence.");

    gcp->entityID = oldEntityID;
    gcp->itemID = oldItemID;
    gcp->thistype = oldItemtype;
  }
}
21201 
21202 
ValidateTSASequenceForNs(BioseqPtr bsp,ValidStructPtr vsp)21203 static void ValidateTSASequenceForNs (BioseqPtr bsp, ValidStructPtr vsp)
21204 {
21205   Int4              total = 0, totalN = 0, totalDash = 0, totalBang = 0, max_stretch = 0;
21206   GatherContextPtr  gcp;
21207   ErrSev            logsev;
21208   ErrSev            msgsev;
21209   Uint2             oldEntityID, oldItemtype;
21210   Uint4             oldItemID;
21211   Int4              percent_N, allowed_percentN = 10;
21212   SeqFeat           sf;
21213   SeqInt            si;
21214   CharPtr           str;
21215   ValNode           vn;
21216 
21217   if (ISA_aa (bsp->mol)) {
21218     return;
21219   }
21220   if (bsp->repr == Seq_repr_virtual) return;
21221   if (bsp->repr == Seq_repr_delta && (! DeltaLitOnly (bsp)) && NoFetchFunctions ()) return;
21222   if (bsp->repr == Seq_repr_ref && NoFetchFunctions ()) return;
21223 
21224   gcp = vsp->gcp;
21225 
21226   oldEntityID = gcp->entityID;
21227   oldItemID = gcp->itemID;
21228   oldItemtype = gcp->thistype;
21229 
21230   if (IsTSA (bsp)) {
21231     ReportBadAssemblyGap (bsp, vsp, gcp);
21232 
21233     if (HasAssemblyOrNullGap (bsp)) return;
21234 
21235     msgsev = ErrSetMessageLevel (SEV_MAX);
21236     logsev = ErrSetLogLevel (SEV_MAX);
21237 
21238     CountNsInSequence (bsp, &totalN, &totalDash, &totalBang, &max_stretch, /* FALSE */ TRUE, TRUE);
21239     total = totalN + totalDash + totalBang;
21240 
21241     ErrSetLogLevel (logsev);
21242     ErrSetMessageLevel (msgsev);
21243 
21244     percent_N = (total * 100) / bsp->length;
21245     if (percent_N > allowed_percentN) {
21246       vsp->bsp = bsp;
21247       vsp->descr = NULL;
21248       vsp->sfp = NULL;
21249       gcp->entityID = bsp->idx.entityID;
21250       gcp->itemID = bsp->idx.itemID;
21251       gcp->thistype = OBJ_BIOSEQ;
21252 
21253       ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_HighNContentPercent, "Sequence contains %d percent Ns", percent_N);
21254     }
21255     if (max_stretch >= 15) {
21256       vsp->bsp = bsp;
21257       vsp->descr = NULL;
21258       vsp->sfp = NULL;
21259       gcp->entityID = bsp->idx.entityID;
21260       gcp->itemID = bsp->idx.itemID;
21261       gcp->thistype = OBJ_BIOSEQ;
21262 
21263       ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_HighNContentStretch, "Sequence has a stretch of %d Ns", max_stretch);
21264     } else if (bsp->length > 20) {
21265       vsp->bsp = bsp;
21266       vsp->descr = NULL;
21267       vsp->sfp = NULL;
21268       gcp->entityID = bsp->idx.entityID;
21269       gcp->itemID = bsp->idx.itemID;
21270       gcp->thistype = OBJ_BIOSEQ;
21271 
21272       MemSet ((Pointer) &sf, 0, sizeof (SeqFeat));
21273       MemSet ((Pointer) &si, 0, sizeof (SeqInt));
21274       MemSet ((Pointer) &vn, 0, sizeof (ValNode));
21275       sf.location = &vn;
21276       vn.choice = SEQLOC_INT;
21277       vn.data.ptrvalue = (Pointer) &si;
21278       si.id = bsp->id;
21279       si.from = 0;
21280       si.to = 19;
21281       str = GetSequenceByFeatureEx (&sf, STREAM_EXPAND_GAPS | SEQ_GAP_AS_TILDE);
21282       if (StringStr (str, "NNNNNNNNNN") != NULL) {
21283         ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_HighNContentStretch, "Sequence has a stretch of at least 10 Ns within the first 20 bases");
21284       }
21285       MemFree (str);
21286       si.from = bsp->length - 20;
21287       si.to = bsp->length - 1;
21288       str = GetSequenceByFeatureEx (&sf, STREAM_EXPAND_GAPS | SEQ_GAP_AS_TILDE);
21289       if (StringStr (str, "NNNNNNNNNN") != NULL) {
21290         ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_HighNContentStretch, "Sequence has a stretch of at least 10 Ns within the last 20 bases");
21291       }
21292       MemFree (str);
21293     }
21294   } else {
21295     msgsev = ErrSetMessageLevel (SEV_MAX);
21296     logsev = ErrSetLogLevel (SEV_MAX);
21297 
21298     CountNsInSequence (bsp, &totalN, &totalDash, &totalBang, &max_stretch, FALSE, TRUE);
21299     total = totalN + totalDash + totalBang;
21300 
21301     ErrSetLogLevel (logsev);
21302     ErrSetMessageLevel (msgsev);
21303 
21304     percent_N = (total * 100) / bsp->length;
21305     if (percent_N > 50) {
21306       vsp->bsp = bsp;
21307       vsp->descr = NULL;
21308       vsp->sfp = NULL;
21309       gcp->entityID = bsp->idx.entityID;
21310       gcp->itemID = bsp->idx.itemID;
21311       gcp->thistype = OBJ_BIOSEQ;
21312 
21313       ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_HighNContentPercent, "Sequence contains %d percent Ns", percent_N);
21314     }
21315   }
21316 
21317   ReportBadWGSGap(bsp, vsp, gcp);
21318 
21319   gcp->entityID = oldEntityID;
21320   gcp->itemID = oldItemID;
21321   gcp->thistype = oldItemtype;
21322 }
21323 
21324 
FindBracketed(CharPtr title,CharPtr taxname)21325 static Boolean FindBracketed (CharPtr title, CharPtr taxname)
21326 
21327 {
21328   CharPtr  ptr;
21329 
21330   if (StringHasNoText (title) || StringHasNoText (taxname)) return FALSE;
21331 
21332   ptr = StringStr (title, taxname);
21333   if (ptr == NULL) return FALSE;
21334   if (ptr == title) return FALSE;
21335   if (*(ptr - 1) != '[') return FALSE;
21336   if (*(ptr + StringLen (taxname)) != ']') return FALSE;
21337 
21338   return TRUE;
21339 }
21340 
/*
 * Checks that the title descriptor of bsp mentions the organism name:
 *   nucleotide - the title must start with the taxname (case-insensitive);
 *   protein    - the title must end with "[taxname]" (case-insensitive),
 *                or, when vsp->is_wp_in_sep is set, contain the bracketed
 *                taxname as judged by FindBracketed.
 * A leading "PREDICTED: " in the title is skipped before comparing.
 * Posts SEV_ERROR ERR_SEQ_DESCR_NoOrganismInTitle on failure.
 *
 * The taxname comes from the source descriptor; for proteins it is
 * replaced by the taxname of the source feature overlapping the coding
 * region, when one is found.  Nothing is checked when is_virus is TRUE,
 * when bsp or vsp is NULL, or when the taxname or title has no text.
 */
static void ValidateRefSeqTitle (BioseqPtr bsp, ValidStructPtr vsp, Boolean is_virus)
{
  SeqMgrDescContext  dcontext;
  SeqMgrFeatContext  fcontext;
  SeqDescrPtr        sdp;
  SeqFeatPtr         cds, src_feat;
  BioSourcePtr       biop;
  CharPtr            organism = NULL;
  CharPtr            title;
  size_t             org_len, title_len;

  if (bsp == NULL || vsp == NULL) return;
  if (is_virus) return;

  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
  if (sdp != NULL) {
    biop = (BioSourcePtr) sdp->data.ptrvalue;
    if (biop != NULL && biop->org != NULL) {
      organism = biop->org->taxname;
    }
  }

  if (ISA_aa (bsp->mol)) {
    cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
    if (cds != NULL) {
      src_feat = SeqMgrGetOverlappingSource (cds->location, &fcontext);
      if (src_feat != NULL) {
        biop = (BioSourcePtr) src_feat->data.value.ptrvalue;
        if (biop != NULL && biop->org != NULL) {
          organism = biop->org->taxname;
        }
      }
    }
  }

  if (! StringDoesHaveText (organism)) return;

  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_title, &dcontext);
  if (sdp == NULL) return;

  title = (CharPtr) sdp->data.ptrvalue;
  if (! StringDoesHaveText (title)) return;

  if (StringNCmp (title, "PREDICTED: ", 11) == 0) {
    title += 11;
  }
  org_len = StringLen (organism);
  title_len = StringLen (title);

  if (ISA_na (bsp->mol)) {
    if (title_len < org_len || StringNICmp (title, organism, org_len) != 0) {
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_NoOrganismInTitle, "RefSeq nucleotide title does not start with organism name");
    }
    return;
  }

  if (! ISA_aa (bsp->mol)) return;

  /* the length test comes first so the indexing below stays in bounds */
  if (title_len >= org_len + 3 &&
      StringNICmp (title + title_len - org_len - 1, organism, org_len) == 0 &&
      title [title_len - org_len - 2] == '[' &&
      title [title_len - 1] == ']') {
    return;
  }

  if (vsp->is_wp_in_sep && FindBracketed (title, organism)) return;

  ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_NoOrganismInTitle, "RefSeq protein title does not end with organism name");
}
21404 
21405 
EndsWithSuffixPlusFieldValue(CharPtr str,CharPtr suffix,CharPtr val)21406 static Boolean EndsWithSuffixPlusFieldValue (CharPtr str, CharPtr suffix, CharPtr val)
21407 {
21408   CharPtr cp, last_word;
21409 
21410   cp = StringSearch (str, suffix);
21411   if (cp == NULL) {
21412     return FALSE;
21413   }
21414   last_word = StringRChr (str, ' ');
21415   if (last_word == NULL || last_word < cp) {
21416     return FALSE;
21417   }
21418   if (StringCmp (last_word + 1, val) == 0) {
21419     return TRUE;
21420   } else {
21421     return FALSE;
21422   }
21423 
21424 }
21425 
21426 
/*
 * Checks the organism name of bsp against the Barcode Index Number bin.
 * The check applies only when the taxname contains "sp." or "bacterium"
 * (case-insensitive) and also contains "BOLD".  In that case the taxname
 * must pass EndsWithSuffixPlusFieldValue for "sp. " or "bacterium " with
 * bin as the value; otherwise SEV_ERROR
 * ERR_SEQ_DESCR_BadStrucCommInvalidFieldValue is posted.
 *
 * Does nothing when bin has no text, bsp or vsp is NULL, or bsp has no
 * source descriptor with an organism.
 */
static void ValidateBarcodeIndexNumber (CharPtr bin, BioseqPtr bsp, ValidStructPtr vsp)
{
  SeqDescPtr        sdp;
  SeqMgrDescContext context;
  BioSourcePtr      biop;

  if (StringHasNoText (bin) || bsp == NULL || vsp == NULL) {
    return;
  }

  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
  if (sdp == NULL || (biop = (BioSourcePtr) sdp->data.ptrvalue) == NULL || biop->org == NULL) {
    return;
  }
  /* only check if name contains "sp." or "bacterium" */
  if (StringISearch (biop->org->taxname, "sp.") == NULL && StringISearch (biop->org->taxname, "bacterium") == NULL) {
    return;
  }
  /* only check if name contains BOLD */
  if (StringSearch (biop->org->taxname, "BOLD") == NULL) {
    return;
  }
  if (!EndsWithSuffixPlusFieldValue(biop->org->taxname, "sp. ", bin)
      && !EndsWithSuffixPlusFieldValue(biop->org->taxname, "bacterium ", bin)) {
    ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_BadStrucCommInvalidFieldValue, "Organism name should end with sp. plus Barcode Index Number (%s)", bin);
  }
}
21457 
21458 
/*
 * Visits every user-object descriptor on bsp whose type string is
 * "StructuredComment" (case-insensitive) and passes the value of each
 * choice-1 field labelled exactly "Barcode Index Number" to
 * ValidateBarcodeIndexNumber.
 */
static void ValidateStructuredCommentsInContext (BioseqPtr bsp, ValidStructPtr vsp)
{
  SeqMgrDescContext  dcontext;
  SeqDescPtr         sdp;
  UserObjectPtr      uop;
  UserFieldPtr       field;

  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &dcontext);
  while (sdp != NULL) {
    uop = (UserObjectPtr) sdp->data.ptrvalue;
    if (uop != NULL && uop->type != NULL && StringICmp (uop->type->str, "StructuredComment") == 0) {
      for (field = uop->data; field != NULL; field = field->next) {
        /* choice 1 fields carry their value as a string pointer */
        if (field->choice == 1 &&
            field->label != NULL &&
            StringCmp (field->label->str, "Barcode Index Number") == 0) {
          ValidateBarcodeIndexNumber ((CharPtr) field->data.ptrvalue, bsp, vsp);
        }
      }
    }
    sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_user, &dcontext);
  }
}
21485 
21486 
/*
 * Warns about a questionable "complete" flag on a nucleotide Bioseq.
 *
 * Applies only when the MolInfo descriptor has completeness 1 and the
 * title does not contain "complete sequence" or "complete genome"
 * (case-insensitive).  Then:
 *   - genomic biomol with a GenBank id: a circular record (topology 2)
 *     gets ERR_SEQ_INST_CompleteCircleProblem against the current context;
 *     any other topology is reported as below;
 *   - otherwise (SQD-1484): reported when the source descriptor has an
 *     orgname whose lineage contains neither "Viruses" nor "Viroids", the
 *     origin is not artificial (4), and the genome is unknown or genomic.
 * The report is ERR_SEQ_DESCR_UnwantedCompleteFlag, posted against the
 * MolInfo descriptor; the entityID, itemID and thistype of gcp are put
 * back afterwards.  Severity is SEV_WARNING on every path.
 *
 * gcp->userdata is read as the ValidStructPtr; gcp is not checked for NULL.
 */
static void TestForUnwantedCompleteFlag (BioseqPtr bsp,  GatherContextPtr gcp)
{
  SeqMgrDescContext  dcontext;
  ValidStructPtr     vsp;
  ValNodePtr         vnp;
  MolInfoPtr         mip;
  BioSourcePtr       biop;
  OrgRefPtr          orp;
  OrgNamePtr         onp;
  SeqIdPtr           sip;
  CharPtr            title;
  ErrSev             sev = SEV_WARNING;
  Boolean            is_gb = FALSE;
  Boolean            do_report = FALSE;
  Uint2              savedEntityID, savedItemtype;
  Uint4              savedItemID;
  Uint2              molinfoEntityID;
  Uint4              molinfoItemID;

  if (bsp == NULL || !ISA_na (bsp->mol)) return;

  vsp = (ValidStructPtr) (gcp->userdata);

  /* nothing to validate if no molinfo or not complete */
  vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
  if (vnp == NULL) return;
  mip = (MolInfoPtr) vnp->data.ptrvalue;
  if (mip == NULL || mip->completeness != 1) return;

  /* remember where the MolInfo lives; dcontext is reused below */
  molinfoEntityID = dcontext.entityID;
  molinfoItemID = dcontext.itemID;

  /* complete sequence or complete genome in title suppresses warning */
  vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_title, &dcontext);
  if (vnp != NULL) {
    title = (CharPtr) vnp->data.ptrvalue;
    if (StringDoesHaveText (title) &&
        (StringISearch (title, "complete sequence") != NULL ||
         StringISearch (title, "complete genome") != NULL)) {
      return;
    }
  }

  if (mip->biomol == MOLECULE_TYPE_GENOMIC) {
    for (sip = bsp->id; sip != NULL; sip = sip->next) {
      if (sip->choice == SEQID_GENBANK) {
        is_gb = TRUE;
      }
    }

    if (is_gb) {
      if (bsp->topology == 2) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_CompleteCircleProblem, "Circular topology has complete flag set, but title should say complete sequence or complete genome");
      } else {
        do_report = TRUE;
      }
    }
  }

  if (!do_report) {
    /* for SQD-1484
     * warn if completeness = complete, organism not viral and not artificial, no location set or location is genomic
     */
    vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
    biop = (vnp != NULL) ? (BioSourcePtr) vnp->data.ptrvalue : NULL;
    orp = (biop != NULL) ? biop->org : NULL;
    onp = (orp != NULL) ? orp->orgname : NULL;
    if (onp != NULL &&
        StringSearch(onp->lineage, "Viruses") == NULL &&
        StringSearch(onp->lineage, "Viroids") == NULL &&
        biop->origin != 4 /* not artificial */ &&
        (biop->genome == GENOME_unknown || biop->genome == GENOME_genomic)) { /* location not set or genomic */
      do_report = TRUE;
    }
  }

  if (!do_report) return;

  savedEntityID = gcp->entityID;
  savedItemID = gcp->itemID;
  savedItemtype = gcp->thistype;

  /* report against the MolInfo descriptor */
  gcp->entityID = molinfoEntityID;
  gcp->itemID = molinfoItemID;
  gcp->thistype = OBJ_SEQDESC;

  ValidErr (vsp, sev, ERR_SEQ_DESCR_UnwantedCompleteFlag, "Suspicious use of complete");

  gcp->entityID = savedEntityID;
  gcp->itemID = savedItemID;
  gcp->thistype = savedItemtype;
}
21591 
/*
 * Visitor for VisitBioseqsInSet: when the MolInfo descriptor of bsp has
 * tech MI_TECH_wgs, stores TRUE through userdata (read as a BoolPtr).
 * The flag is never cleared here, and a NULL userdata is tolerated.
 */
static void IsBspWGS (BioseqPtr bsp, Pointer userdata)

{
  SeqMgrDescContext  dcontext;
  SeqDescrPtr        sdp;
  MolInfoPtr         mip;
  BoolPtr            flag;

  if (bsp == NULL) return;

  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
  mip = (sdp != NULL) ? (MolInfoPtr) sdp->data.ptrvalue : NULL;
  if (mip == NULL || mip->tech != MI_TECH_wgs) return;

  flag = (BoolPtr) userdata;
  if (flag != NULL) {
    *flag = TRUE;
  }
}
21611 
WGSinBssp(BioseqSetPtr bssp)21612 static Boolean WGSinBssp (BioseqSetPtr bssp)
21613 
21614 {
21615   Boolean  is_wgs = FALSE;
21616 
21617   if (bssp == NULL) return FALSE;
21618 
21619   VisitBioseqsInSet (bssp, (Pointer) (&is_wgs), IsBspWGS);
21620 
21621   return is_wgs;
21622 }
21623 
HasBadPlasmidChromLinkName(CharPtr name,CharPtr taxname)21624 static Boolean HasBadPlasmidChromLinkName (CharPtr name, CharPtr taxname)
21625 
21626 {
21627   if (StringHasNoText (name)) return FALSE;
21628 
21629   if (StringLen (name) > 33) return TRUE;
21630 
21631   if (StringStr (name, "plasmid") != NULL) return TRUE;
21632   if (StringStr (name, "chromosome") != NULL) return TRUE;
21633   if (StringStr (name, "linkage group") != NULL) return TRUE;
21634   if (StringStr (name, "chr") != NULL) return TRUE;
21635 
21636   if (StringDoesHaveText (taxname) && StringStr (name, taxname) != NULL) return TRUE;
21637 
21638   return FALSE;
21639 }
21640 
21641 
21642 /*****************************************************************************
21643 *
21644 *   ValidateBioseqContext(gcp)
21645 *      Validate one Bioseq for descriptors, features, and context
21646 *      This is done as a second Gather, focussed on the Bioseq in
21647 *        question.
21648 *
21649 *****************************************************************************/
ValidateBioseqContext(GatherContextPtr gcp)21650 static void ValidateBioseqContext (GatherContextPtr gcp)
21651 {
21652   size_t          acclen;
21653   ValidStructPtr  vsp;
21654   BioseqPtr       bsp;
21655   GatherScope     gs;
21656   BioseqValidStr  bvs;
21657   SeqFeatPtr      sfp;
21658   ValNode         fake_whole;
21659   SeqIdPtr        sip;
21660   ValNodePtr      vnp = NULL;
21661   MolInfoPtr      mip = NULL;
21662   DbtagPtr        dbt;
21663   SeqMgrDescContext dcontext;
21664   SeqMgrFeatContext fcontext;
21665   BioseqContextPtr bcp;
21666   Uint2           oldEntityID, oldItemtype;
21667   Uint4           oldItemID;
21668   Uint2           mipEntityID = 0, mipItemtype = 0;
21669   Uint4           mipItemID = 0;
21670   ObjMgrDataPtr   omdp;
21671   BioseqPtr       parent;
21672   PatentSeqIdPtr  psip;
21673   IdPatPtr        ipp;
21674   Boolean         isPDB = FALSE;
21675   Boolean         is_wgs = FALSE;
21676   Boolean         is_gb = FALSE;
21677   Boolean         is_eb_db = FALSE;
21678   Boolean         is_ac = FALSE;
21679   Boolean         is_ch_or_cm = FALSE;
21680   Boolean         is_nc = FALSE;
21681   Boolean         is_local = FALSE;
21682   Boolean         is_local_only = TRUE;
21683   Boolean         is_organelle = FALSE;
21684   Boolean         is_plasmid = FALSE;
21685   Boolean         is_prokaryote = FALSE;
21686   Boolean         is_refseq = FALSE;
21687   Boolean         is_neg_strand_virus = FALSE;
21688   Boolean         is_ambisense_virus = FALSE;
21689   Boolean         is_synthetic = FALSE;
21690   Boolean         is_transgenic = FALSE;
21691   Boolean         is_virus = FALSE;
21692   Boolean         has_cds = FALSE;
21693   Boolean         has_chromosome = FALSE;
21694   Boolean         has_linkage_group = FALSE;
21695   Boolean         has_wgs_general = FALSE;
21696   ErrSev          sev;
21697   SubSourcePtr    ssp;
21698   CharPtr         str;
21699   CharPtr         taxname = NULL;
21700   TextSeqIdPtr    tsip;
21701   BioSourcePtr    biop = NULL;
21702   OrgRefPtr       orp;
21703   OrgNamePtr      onp;
21704   OrgModPtr       omp;
21705   /*
21706   Char            buf1[255];
21707   */
21708   EMBLBlockPtr    ebp;
21709   GBBlockPtr      gbp;
21710   Boolean         okay;
21711   Char            prefix [32];
21712   ValNodePtr      secondaries;
21713   Uint4           whichdb;
21714 
21715   vsp = (ValidStructPtr) (gcp->userdata);
21716   bsp = (BioseqPtr) (gcp->thisitem);
21717   vsp->bsp = bsp;
21718   vsp->descr = NULL;
21719   vsp->sfp = NULL;
21720   vsp->bssp = (BioseqSetPtr) (gcp->parentitem);
21721 
21722   MemSet (&gs, 0, sizeof (GatherScope));
21723   fake_whole.choice = SEQLOC_WHOLE;
21724   sip = SeqIdFindBest (bsp->id, 0);
21725 
21726   fake_whole.data.ptrvalue = sip;
21727 
21728   fake_whole.next = NULL;
21729   gs.target = &fake_whole;
21730   gs.get_feats_location = TRUE;
21731   gs.nointervals = TRUE;
21732   MemSet ((Pointer) (gs.ignore), (int) TRUE, (size_t) (sizeof (Boolean) * OBJ_MAX));
21733   gs.ignore[OBJ_SEQDESC] = FALSE;
21734   gs.ignore[OBJ_SEQFEAT] = FALSE;
21735   gs.ignore[OBJ_SEQANNOT] = FALSE;
21736   gs.ignore[OBJ_SUBMIT_BLOCK] = FALSE;
21737   gs.ignore[OBJ_SEQSUB_CIT] = FALSE;
21738 
21739   gs.scope = vsp->sep;
21740 
21741   MemSet (&bvs, 0, sizeof (BioseqValidStr));
21742   bvs.vsp = vsp;
21743 
21744   /* now looking for molinfo on every bioseq (okay on segset) */
21745   if (bsp != NULL) {
21746     vnp = NULL;
21747     if (vsp->useSeqMgrIndexes) {
21748       vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
21749       if (vnp != NULL) {
21750         mipEntityID = dcontext.entityID;
21751         mipItemID = dcontext.itemID;
21752         mipItemtype = OBJ_SEQDESC;
21753       }
21754     } else {
21755 //LCOV_EXCL_START
21756       bcp = BioseqContextNew (bsp);
21757       vnp = BioseqContextGetSeqDescr (bcp, Seq_descr_molinfo, NULL, NULL);
21758       BioseqContextFree (bcp);
21759 //LCOV_EXCL_STOP
21760     }
21761     if (vnp != NULL) {
21762       mip = (MolInfoPtr) vnp->data.ptrvalue;
21763     }
21764 
21765     vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
21766     if (vnp != NULL) {
21767       biop = (BioSourcePtr) vnp->data.ptrvalue;
21768       if (biop != NULL) {
21769         orp = biop->org;
21770         if (orp != NULL) {
21771           taxname = orp->taxname;
21772           if (StringICmp (orp->taxname, "Human immunodeficiency virus") == 0 ||
21773               StringICmp (orp->taxname, "Human immunodeficiency virus 1") == 0 ||
21774               StringICmp (orp->taxname, "Human immunodeficiency virus 2") == 0) {
21775             ValidateLocationForHIV (vsp, biop, bsp);
21776           } else if (StringICmp (orp->taxname, "synthetic construct") == 0) {
21777             bvs.is_syn_constr = TRUE;
21778           } else if (StringISearch (orp->taxname, "vector") != NULL) {
21779             bvs.is_syn_constr = TRUE;
21780           }
21781           onp = orp->orgname;
21782           if (onp != NULL) {
21783             if (StringICmp (onp->div, "SYN") == 0) {
21784               bvs.is_syn_constr = TRUE;
21785               is_synthetic = TRUE;
21786             }
21787             if (StringISearch (onp->lineage, "artificial sequences") != NULL) {
21788               bvs.is_syn_constr = TRUE;
21789             }
21790             if (StringISearch (onp->lineage, "negative-strand viruses") != NULL) {
21791               is_neg_strand_virus = TRUE;
21792             }
21793             if (StringISearch (onp->lineage, "Arenavirus") != NULL ||
21794                 StringISearch (onp->lineage, "Arenaviridae") != NULL ||
21795                 StringISearch (onp->lineage, "Phlebovirus") != NULL ||
21796                 StringISearch (onp->lineage, "Tospovirus") != NULL ||
21797                 StringISearch (onp->lineage, "Tenuivirus") != NULL) {
21798               is_ambisense_virus = TRUE;
21799             }
21800             if (StringNICmp (onp->lineage, "Viruses; ", 9) == 0 ||
21801                 StringNICmp (onp->lineage, "Bacteria; ", 10) == 0 ||
21802                 StringNICmp (onp->lineage, "Archaea; ", 9) == 0 ||
21803                 StringCmp (onp->div, "BCT") == 0 ||
21804                 StringCmp (onp->div, "VRL") == 0) {
21805               is_prokaryote = TRUE;
21806             }
21807             if (StringNICmp (onp->lineage, "Viruses; ", 9) == 0) {
21808               is_virus = TRUE;
21809             }
21810             is_organelle = IsLocationOrganelle (biop->genome);
21811             is_plasmid = (Boolean) (biop->genome == GENOME_plasmid);
21812             for (omp = onp->mod; omp != NULL; omp = omp->next) {
21813               if (omp->subtype == ORGMOD_other) {
21814                 if (mip != NULL && (StringICmp (omp->subname, "cRNA") == 0)) {
21815                   oldEntityID = gcp->entityID;
21816                   oldItemID = gcp->itemID;
21817                   oldItemtype = gcp->thistype;
21818 
21819                   gcp->entityID = dcontext.entityID;
21820                   gcp->itemID = dcontext.itemID;
21821                   gcp->thistype = OBJ_SEQDESC;
21822 
21823                   if (mip->biomol == MOLECULE_TYPE_CRNA) {
21824                     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "cRNA note redundant with molecule type");
21825                   } else {
21826                     ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "cRNA note conflicts with molecule type");
21827                   }
21828 
21829                   gcp->entityID = oldEntityID;
21830                   gcp->itemID = oldItemID;
21831                   gcp->thistype = oldItemtype;
21832                 }
21833               }
21834             }
21835             if (mip != NULL && mip->biomol == MOLECULE_TYPE_GENOMIC && bsp->mol == MOLECULE_CLASS_DNA) {
21836               if (StringNICmp (onp->lineage, "Viruses; ", 9) == 0 && StringISearch (onp->lineage, "no DNA stage") != NULL) {
21837                 ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Genomic DNA viral lineage indicates no DNA stage");
21838               }
21839             }
21840           }
21841         }
21842         if (biop->origin == ORG_ARTIFICIAL) {
21843           bvs.is_artificial = TRUE;
21844         } else if (biop->origin == ORG_SYNTHETIC) {
21845           bvs.is_synthetic = TRUE;
21846         }
21847         if (biop->origin == ORG_MUT || biop->origin == ORG_ARTIFICIAL || biop->origin == ORG_SYNTHETIC) {
21848           is_synthetic = TRUE;
21849         }
21850         for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
21851           if (ssp->subtype == SUBSRC_transgenic) {
21852             is_transgenic = TRUE;
21853           } else if (ssp->subtype == SUBSRC_chromosome) {
21854               if (StringDoesHaveText(ssp->name)) {
21855                   has_chromosome = TRUE;
21856               }
21857           } else if (ssp->subtype == SUBSRC_linkage_group) {
21858               if (StringDoesHaveText(ssp->name)) {
21859                   has_linkage_group = TRUE;
21860               }
21861           } else if (ssp->subtype == SUBSRC_other) {
21862             if (mip != NULL && (StringICmp (ssp->name, "cRNA") == 0)) {
21863               oldEntityID = gcp->entityID;
21864               oldItemID = gcp->itemID;
21865               oldItemtype = gcp->thistype;
21866 
21867               gcp->entityID = dcontext.entityID;
21868               gcp->itemID = dcontext.itemID;
21869               gcp->thistype = OBJ_SEQDESC;
21870 
21871               if (mip->biomol == MOLECULE_TYPE_CRNA) {
21872                 ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "cRNA note redundant with molecule type");
21873               } else {
21874                 ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "cRNA note conflicts with molecule type");
21875               }
21876 
21877               gcp->entityID = oldEntityID;
21878               gcp->itemID = oldItemID;
21879               gcp->thistype = oldItemtype;
21880             }
21881           }
21882           if (ssp->subtype == SUBSRC_chromosome || ssp->subtype == SUBSRC_plasmid_name || ssp->subtype == SUBSRC_linkage_group) {
21883             if (HasBadPlasmidChromLinkName (ssp->name, taxname)) {
21884               oldEntityID = gcp->entityID;
21885               oldItemID = gcp->itemID;
21886               oldItemtype = gcp->thistype;
21887 
21888               gcp->entityID = dcontext.entityID;
21889               gcp->itemID = dcontext.itemID;
21890               gcp->thistype = OBJ_SEQDESC;
21891 
21892               ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_BioSourceInconsistency, "Problematic plasmid/chromosome/linkage group name '%s'", ssp->name);
21893 
21894               gcp->entityID = oldEntityID;
21895               gcp->itemID = oldItemID;
21896               gcp->thistype = oldItemtype;
21897             }
21898           }
21899         }
21900         if (is_transgenic && ISA_na (bsp->mol)) {
21901           if (SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_BIOSRC, 0, &fcontext) == NULL) {
21902             ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_TransgenicProblem, "Transgenic source descriptor requires presence of source feature");
21903           }
21904         }
21905       }
21906     }
21907 
21908     if (mip != NULL && mip->tech == MI_TECH_tsa && bsp->mol == MOLECULE_CLASS_DNA) {
21909       oldEntityID = gcp->entityID;
21910       oldItemID = gcp->itemID;
21911       oldItemtype = gcp->thistype;
21912 
21913       gcp->entityID = mipEntityID;
21914       gcp->itemID = mipItemID;
21915       gcp->thistype = mipItemtype;
21916 
21917       ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_ConflictingBiomolTech, "TSA sequence should not be DNA");
21918 
21919       gcp->entityID = oldEntityID;
21920       gcp->itemID = oldItemID;
21921       gcp->thistype = oldItemtype;
21922     }
21923 
21924     if (mip != NULL && mip->tech == MI_TECH_tsa && mip->biomol == MOLECULE_TYPE_TRANSCRIBED_RNA) {
21925       oldEntityID = gcp->entityID;
21926       oldItemID = gcp->itemID;
21927       oldItemtype = gcp->thistype;
21928 
21929       sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &fcontext);
21930       while (sfp != NULL) {
21931 
21932         if (SeqLocStrand (sfp->location) == Seq_strand_minus) {
21933           gcp->entityID = sfp->idx.entityID;
21934           gcp->itemID = sfp->idx.itemID;
21935           gcp->thistype = OBJ_SEQFEAT;
21936 
21937           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_CDSonMinusStrandTranscribedRNA, "Coding region on TSA transcribed RNA should not be on the minus strand");
21938         }
21939 
21940         sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_CDREGION, 0, &fcontext);
21941       }
21942 
21943       gcp->entityID = oldEntityID;
21944       gcp->itemID = oldItemID;
21945       gcp->thistype = oldItemtype;
21946     }
21947 
21948     if (BioseqHasKeyword(bsp, "BARCODE") && BioseqHasKeyword(bsp, "UNVERIFIED")) {
21949       ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BadKeyword, "Sequence has both BARCODE and UNVERIFIED keywords");
21950     }
21951   }
21952 
21953   if (is_neg_strand_virus && mip != NULL) {
21954     oldEntityID = gcp->entityID;
21955     oldItemID = gcp->itemID;
21956     oldItemtype = gcp->thistype;
21957 
21958     gcp->entityID = mipEntityID;
21959     gcp->itemID = mipItemID;
21960     gcp->thistype = mipItemtype;
21961 
21962     sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &fcontext);
21963     while (sfp != NULL) {
21964       has_cds = TRUE;
21965       if (SeqLocStrand (sfp->location) == Seq_strand_minus) {
21966         if (mip->biomol != MOLECULE_TYPE_GENOMIC) {
21967           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Negative-strand virus with minus strand CDS should be genomic");
21968         }
21969       } else {
21970         if (mip->biomol != MOLECULE_TYPE_MRNA && mip->biomol != MOLECULE_TYPE_CRNA && (! is_ambisense_virus) && (! is_synthetic)) {
21971           ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Negative-strand virus with plus strand CDS should be mRNA or cRNA");
21972         }
21973       }
21974       sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_CDREGION, 0, &fcontext);
21975     }
21976     if (! has_cds) {
21977       sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_misc_feature, &fcontext);
21978       while (sfp != NULL) {
21979         if (StringISearch (sfp->comment, "nonfunctional") != NULL) {
21980           if (SeqLocStrand (sfp->location) == Seq_strand_minus) {
21981             if (mip->biomol != MOLECULE_TYPE_GENOMIC) {
21982               ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Negative-strand virus with nonfunctional minus strand misc_feature should be genomic");
21983             }
21984           } else {
21985             if (mip->biomol != MOLECULE_TYPE_MRNA && mip->biomol != MOLECULE_TYPE_CRNA && (! is_ambisense_virus)) {
21986               ValidErr (vsp, SEV_WARNING, ERR_SEQ_DESCR_BioSourceInconsistency, "Negative-strand virus with nonfunctional plus strand misc_feature should be mRNA or cRNA");
21987             }
21988           }
21989         }
21990         sfp = SeqMgrGetNextFeature (bsp, sfp, 0, FEATDEF_misc_feature, &fcontext);
21991       }
21992     }
21993 
21994     gcp->entityID = oldEntityID;
21995     gcp->itemID = oldItemID;
21996     gcp->thistype = oldItemtype;
21997   }
21998 
21999   TestForUnwantedCompleteFlag(bsp, gcp);
22000 
22001   bvs.is_mrna = FALSE;
22002   bvs.is_prerna = FALSE;
22003   if (bsp != NULL && ISA_na (bsp->mol)) {
22004     if (mip != NULL) {
22005       if (mip->biomol == MOLECULE_TYPE_MRNA) {
22006         bvs.is_mrna = TRUE;
22007       } else if (mip->biomol == MOLECULE_TYPE_PRE_MRNA) {
22008         bvs.is_prerna = TRUE;
22009       }
22010       if (mip->biomol >= MOLECULE_TYPE_PRE_MRNA && mip->biomol <= MOLECULE_TYPE_SCRNA && bsp->mol == Seq_mol_dna) {
22011         /* - this is how we indicate an mRNA sequenced from a cDNA, so no error
22012         ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_RnaDnaConflict, "MolInfo says RNA, Bioseq says DNA");
22013         */
22014       }
22015     } else if (bsp->mol == Seq_mol_rna) {
22016       bvs.is_mrna = TRUE;       /* if no molinfo, assume rna is mrna */
22017     }
22018   }
22019 
22020   if (mip != NULL) {
22021 
22022     oldEntityID = gcp->entityID;
22023     oldItemID = gcp->itemID;
22024     oldItemtype = gcp->thistype;
22025 
22026     gcp->entityID = mipEntityID;
22027     gcp->itemID = mipItemID;
22028     gcp->thistype = mipItemtype;
22029 
22030     if (mip->tech == MI_TECH_sts ||
22031         mip->tech == MI_TECH_survey ||
22032         mip->tech == MI_TECH_wgs ||
22033         mip->tech == MI_TECH_htgs_0 || mip->tech == MI_TECH_htgs_1 ||
22034         mip->tech == MI_TECH_htgs_2 || mip->tech == MI_TECH_htgs_3 ||
22035         mip->tech == MI_TECH_composite_wgs_htgs) {
22036       if (mip->tech == MI_TECH_sts && bsp->mol == Seq_mol_rna && mip->biomol == MOLECULE_TYPE_MRNA) {
22037         /* there are some STS sequences derived from cDNAs, so do not report these */
22038       } else if (mip->biomol != MOLECULE_TYPE_GENOMIC) {
22039         ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_ConflictingBiomolTech, "HTGS/STS/GSS/WGS sequence should be genomic");
22040       } else if (bsp == NULL || (bsp->mol != Seq_mol_dna && bsp->mol != Seq_mol_na)) {
22041         ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_ConflictingBiomolTech, "HTGS/STS/GSS/WGS sequence should not be RNA");
22042       }
22043     } else if (mip->tech == MI_TECH_est && mip->biomol != MOLECULE_TYPE_MRNA) {
22044       ValidErr (vsp, SEV_WARNING, ERR_SEQ_INST_ConflictingBiomolTech, "EST sequence should be mRNA");
22045     }
22046 
22047     gcp->entityID = oldEntityID;
22048     gcp->itemID = oldItemID;
22049     gcp->thistype = oldItemtype;
22050   }
22051 
22052   if (ISA_aa (bsp->mol)) {
22053     bvs.is_aa = TRUE;
22054     /* check proteins in nuc-prot set have a CdRegion */
22055     if (vsp->bssp != NULL) {
22056       if (vsp->bssp->_class == 1) {     /* in a nuc-prot set */
22057         sfp = NULL;
22058         if (vsp->useSeqMgrIndexes) {
22059           sfp = SeqMgrGetCDSgivenProduct (bsp, NULL);
22060           if (sfp == NULL) {
22061             sfp = SeqMgrGetPROTgivenProduct (bsp, NULL); /* now instantiating and indexing products of protein processing */
22062           }
22063         } else {
22064 //LCOV_EXCL_START
22065           sfp = SeqEntryGetSeqFeat (vsp->sep, 3, NULL, NULL, 1, bsp);
22066 //LCOV_EXCL_STOP
22067         }
22068         if (sfp == NULL) {      /* no CdRegion points to this bsp */
22069           sev = SEV_ERROR;
22070           if (WGSinBssp (vsp->bssp)) {
22071             sev = SEV_REJECT;
22072           }
22073           ValidErr (vsp, sev, ERR_SEQ_PKG_NoCdRegionPtr, "No CdRegion in nuc-prot set points to this protein");
22074         }
22075       }
22076     }
22077   }
22078 
22079   if (vsp->useSeqMgrIndexes) {
22080     bvs.gcp = gcp;
22081     bvs.bsp = bsp;
22082     ValidateBioseqContextIndexed (bsp, &bvs);
22083   } else {
22084 //LCOV_EXCL_START
22085     GatherSeqEntry (vsp->sep, &bvs, ValidateBioseqContextGather, &gs);
22086 //LCOV_EXCL_STOP
22087   }
22088 
22089   vsp->gcp = gcp;               /* reset the gcp pointer changed in previous gather */
22090   vsp->descr = NULL;
22091   vsp->sfp = NULL;
22092 
22093   if ((!bvs.got_a_pub) && (!vsp->suppress_no_pubs) && (! vsp->seqSubmitParent)) {
22094     omdp = NULL;
22095     if (gcp != NULL) {
22096       omdp = ObjMgrGetData (gcp->entityID);
22097     }
22098     if (omdp == NULL || omdp->datatype != OBJ_SEQSUB) {
22099       sev = SEV_ERROR;
22100       if ((! IsNoncuratedRefSeq (bsp, &sev)) && (! IsWgsIntermediate (vsp->sep)) && (! IsTsaIntermediate (vsp->sep))) {
22101         ValidErr (vsp, sev, ERR_SEQ_DESCR_NoPubFound, "No publications refer to this Bioseq.");
22102       }
22103     }
22104   }
22105 
22106   for (sip = bsp->id; sip != NULL; sip = sip->next) {
22107     if (sip->choice == SEQID_LOCAL) {
22108       is_local = TRUE;
22109     } else {
22110       is_local_only = FALSE;
22111     }
22112     if (sip->choice == SEQID_PATENT) {
22113       psip = (PatentSeqIdPtr) sip->data.ptrvalue;
22114       if (psip != NULL) {
22115         ipp = psip->cit;
22116         if (ipp != NULL && StringICmp (ipp->country, "US") == 0)
22117           return;
22118       }
22119       return;
22120     } else if (sip->choice == SEQID_PDB) {
22121       isPDB = TRUE;
22122     } else if (sip->choice == SEQID_GENBANK ||
22123                sip->choice == SEQID_EMBL ||
22124                sip->choice == SEQID_DDBJ) {
22125       is_gb = TRUE;
22126       if (sip->choice == SEQID_EMBL || sip->choice == SEQID_DDBJ) {
22127         is_eb_db = TRUE;
22128       }
22129       tsip = (TextSeqIdPtr) sip->data.ptrvalue;
22130       if (tsip != NULL && tsip->accession != NULL) {
22131         acclen = StringLen (tsip->accession);
22132         if (acclen == 12) {
22133           is_wgs = TRUE;
22134         } else if (acclen == 13) {
22135           is_wgs = TRUE;
22136         } else if (acclen == 14) {
22137           is_wgs = TRUE;
22138         } else if (acclen == 15) {
22139           is_wgs = TRUE;
22140         /*
22141         } else if (StringNCmp (tsip->accession, "CH", 2) == 0 ||
22142                    StringNCmp (tsip->accession, "CM", 2) == 0) {
22143           is_ch_or_cm = TRUE;
22144         */
22145         } else if (WHICH_db_accession (tsip->accession) == ACCN_NCBI_SEGSET) {
22146           /* NOTE '==' is appropriate here, rather than '|', because we
22147            * really do only want to suppress if the type is exactly ACCN_NCBI_SEGSET
22148            * and NOT if the type is ACCN_NCBI_SEGSET | ACCN_AMBIGOUS_MOL (prefix is AH)
22149            */
22150           is_ch_or_cm = TRUE;
22151         }
22152       }
22153     } else if (sip->choice == SEQID_OTHER) {
22154       is_refseq = TRUE;
22155       tsip = (TextSeqIdPtr) sip->data.ptrvalue;
22156       if (tsip != NULL && tsip->accession != NULL) {
22157         if (StringNCmp (tsip->accession, "NM_", 3) == 0 ||
22158             StringNCmp (tsip->accession, "NP_", 3) == 0 ||
22159             StringNCmp (tsip->accession, "NG_", 3) == 0 ||
22160             StringNCmp (tsip->accession, "NR_", 3) == 0) {
22161           is_gb = TRUE;
22162         } else if (StringNCmp (tsip->accession, "NC_", 3) == 0) {
22163           is_nc = TRUE;
22164         } else if (StringNCmp (tsip->accession, "AC_", 3) == 0) {
22165           is_ac = TRUE;
22166         }
22167       }
22168     } else if (sip->choice == SEQID_GENERAL) {
22169       dbt = (DbtagPtr) sip->data.ptrvalue;
22170       if (dbt != NULL) {
22171         if (StringNICmp (dbt->db, "WGS:", 4) == 0) {
22172           has_wgs_general = TRUE;
22173         }
22174       }
22175     }
22176   }
22177   if (is_nc || is_ac) {
22178       if (!is_prokaryote && !is_organelle && !has_chromosome && !is_plasmid && !has_linkage_group) {
22179       ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_MissingChromosome, "Missing chromosome qualifier on NC or AC RefSeq record");
22180     }
22181   }
22182   if (! is_local) {
22183     is_local_only = FALSE;
22184   }
22185   if (is_wgs) {
22186     if (mip == NULL || (mip->tech != MI_TECH_wgs && mip->tech != MI_TECH_tsa)) {
22187       ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "WGS accession should have Mol-info.tech of wgs");
22188     }
22189   } else if (mip != NULL && mip->tech == MI_TECH_wgs && is_gb) {
22190     if (is_ch_or_cm || is_local_only || has_wgs_general) {
22191       /* skip warning if CH or CM (or other segset ID) or SEQID_LOCAL only */
22192     } else {
22193       secondaries = NULL;
22194       vnp = GetNextDescriptorUnindexed (bsp, Seq_descr_genbank, NULL);
22195       if (vnp != NULL && vnp->choice == Seq_descr_genbank) {
22196         gbp = (GBBlockPtr) vnp->data.ptrvalue;
22197         if (gbp != NULL) {
22198           secondaries = gbp->extra_accessions;
22199         }
22200       }
22201       if (secondaries == NULL) {
22202         vnp = GetNextDescriptorUnindexed (bsp, Seq_descr_embl, NULL);
22203         if (vnp != NULL && vnp->choice == Seq_descr_embl) {
22204           ebp = (EMBLBlockPtr) vnp->data.ptrvalue;
22205           if (ebp != NULL) {
22206             secondaries = ebp->extra_acc;
22207           }
22208         }
22209       }
22210       okay = TRUE;
22211       if (secondaries != NULL) {
22212         for (vnp = secondaries; vnp != NULL; vnp = vnp->next) {
22213           str = (CharPtr) vnp->data.ptrvalue;
22214           if (StringHasNoText (str)) continue;
22215           StringNCpy_0 (prefix, str, sizeof (prefix));
22216           whichdb = WHICH_db_accession (prefix);
22217           if (ACCN_IS_WGS (whichdb)) {
22218             acclen = StringLen (prefix);
22219             if (acclen > 8 && StringCmp (prefix + acclen - 6, "000000") == 0) {
22220               okay = FALSE;
22221             }
22222           }
22223         }
22224       }
22225       if (okay) {
22226         sev = SEV_ERROR;
22227         if (is_eb_db) {
22228           sev = SEV_WARNING;
22229         }
22230         if (! is_eb_db) {
22231           ValidErr (vsp, sev, ERR_SEQ_DESCR_InconsistentWGSFlags, "Mol-info.tech of wgs should have WGS accession");
22232         }
22233       }
22234     }
22235   }
22236   if (is_nc) {
22237     if (mip != NULL && mip->biomol != MOLECULE_TYPE_GENOMIC && mip->biomol != MOLECULE_TYPE_CRNA) {
22238       if (ISA_na (bsp->mol)) {
22239         ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_Inconsistent, "NC nucleotide should be genomic or cRNA");
22240       }
22241     }
22242   }
22243 
22244   if (bsp != NULL && is_refseq) {
22245     ValidateRefSeqTitle (bsp, vsp, is_virus);
22246   }
22247 
22248   if ((!bvs.last_org) && (!vsp->suppress_no_biosrc))
22249     ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_NoOrgFound, "No organism name has been applied to this Bioseq.  Other qualifiers may exist.");
22250 
22251 
22252   if ((bvs.is_aa) && (bvs.num_full_length_prot_ref == 0) && (!isPDB) && (bsp->repr != Seq_repr_virtual)) {
22253     parent = SeqMgrGetParentOfPart (bsp, NULL);
22254     if (parent == NULL || SeqMgrGetBestProteinFeature (bsp, NULL) == NULL) {
22255 
22256       oldEntityID = gcp->entityID;
22257       oldItemID = gcp->itemID;
22258       oldItemtype = gcp->thistype;
22259 
22260       if (SeqMgrGetCDSgivenProduct (bsp, &fcontext) != NULL) {
22261         gcp->entityID = fcontext.entityID;
22262         gcp->itemID = fcontext.itemID;
22263         gcp->thistype = OBJ_SEQFEAT;
22264       }
22265 
22266       ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_NoProtRefFound, "No full length Prot-ref feature applied to this Bioseq");
22267 
22268       gcp->entityID = oldEntityID;
22269       gcp->itemID = oldItemID;
22270       gcp->thistype = oldItemtype;
22271     }
22272   } else if (bvs.is_aa && bvs.num_full_length_prot_ref > 1) {
22273     if (bvs.num_justprot > 1 ||
22274         bvs.num_preprot > 1 ||
22275         bvs.num_matpep > 1 ||
22276         bvs.num_sigpep > 1 ||
22277         bvs.num_transpep > 1) {
22278       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_MultipleProtRefs, "%d full-length protein features present on protein",
22279                 (int) bvs.num_full_length_prot_ref);
22280     }
22281   }
22282 
22283   /* now flag missing molinfo even if not in Sequin */
22284   if (mip == NULL && (!isPDB)) {
22285     ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_NoMolInfoFound, "No Mol-info applies to this Bioseq");
22286   }
22287 
22288 #if 0 /* temporarily suppress */
22289   /* if tech is TSA, must have assembly */
22290   if (mip != NULL && mip->tech == MI_TECH_tsa
22291       && (bsp->hist == NULL || bsp->hist->assembly == NULL)) {
22292     SeqIdWrite (bsp->id, buf1, PRINTID_FASTA_SHORT, sizeof (buf1) - 1);
22293     ValidErr (vsp, SEV_ERROR, ERR_SEQ_INST_TSAHistAssemblyMissing, "TSA record %s should have Seq-hist.assembly", buf1);
22294   }
22295 #endif
22296 
22297   /* look for genes that overlap two other genes */
22298   FindMultiGeneOverlaps (bsp, vsp);
22299 
22300   /* TSA checks */
22301   ValidateTSASequenceForNs (bsp, vsp);
22302 
22303   /* validate structured comments in context */
22304   ValidateStructuredCommentsInContext (bsp, vsp);
22305 }
22306 
22307 /*****************************************************************************
22308 *
22309 *   ValidateSeqFeat(gcp)
22310 *
22311 *****************************************************************************/
EmptyOrNullString(CharPtr str)22312 static Boolean EmptyOrNullString (CharPtr str)
22313 {
22314   Char            ch;
22315 
22316   if (str == NULL)
22317     return TRUE;
22318   ch = *str;
22319   while (ch != '\0') {
22320     if (ch > ' ' && ch <= '~')
22321       return FALSE;
22322     str++;
22323     ch = *str;
22324   }
22325   return TRUE;
22326 }
22327 
/*
 * Warns (ERR_SEQ_FEAT_PeptideFeatOutOfFrame) when the peptide feature sfp
 * does not start and stop on codon boundaries of its overlapping CDS.
 *
 *   vsp - validator state handed to ValidErr
 *   gcp - not used in this function
 *   sfp - peptide feature being checked
 *   key - feature key, used in the messages and for the sig_peptide exception
 *
 * Offsets are measured inside the CDS location and shifted by the CDS reading
 * frame.  A 5' partial feature is never reported for its start, and a
 * 3' partial feature is never reported for its stop.
 */
static void CheckPeptideOnCodonBoundary (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPtr sfp, CharPtr key)
{
  SeqFeatPtr      cds;
  CdRegionPtr     cdregion;
  SeqLocPtr       piece, first = NULL, last = NULL;
  Boolean         partial5, partial3;
  Boolean         start_off, stop_off;
  Int4            frame_shift, start_pos, stop_pos;

  cds = SeqMgrGetOverlappingCDS (sfp->location, NULL);
  if (cds == NULL) return;

  cdregion = (CdRegionPtr) cds->data.value.ptrvalue;
  if (cdregion == NULL) return;

  /* frames 2 and 3 skip one and two leading bases; any other value skips none */
  switch (cdregion->frame) {
    case 2:
      frame_shift = 1;
      break;
    case 3:
      frame_shift = 2;
      break;
    default:
      frame_shift = 0;
      break;
  }

  /* remember the first and last intervals of the peptide location */
  for (piece = SeqLocFindNext (sfp->location, NULL);
       piece != NULL;
       piece = SeqLocFindNext (sfp->location, piece)) {
    if (first == NULL) {
      first = piece;
    }
    last = piece;
  }
  if (first == NULL || last == NULL) return;

  start_pos = GetOffsetInLoc (first, cds->location, SEQLOC_START) - frame_shift;
  stop_pos = GetOffsetInLoc (last, cds->location, SEQLOC_STOP) - frame_shift;

  CheckSeqLocForPartial (sfp->location, &partial5, &partial3);

  /* in frame means: start on the first base of a codon, stop on the third */
  start_off = (Boolean) ((! partial5) && (start_pos % 3) != 0);
  stop_off = (Boolean) ((! partial3) && (stop_pos % 3) != 2);

  if (start_pos < 0 && stop_pos < 0 && StringICmp (key, "sig_peptide") == 0) {
    /* ignore special case of sig_peptide completely before codon_start of CDS */
    return;
  }

  if (start_off && stop_off) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PeptideFeatOutOfFrame, "Start and stop of %s are out of frame with CDS codons", key);
  } else if (start_off) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PeptideFeatOutOfFrame, "Start of %s is out of frame with CDS codons", key);
  } else if (stop_off) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PeptideFeatOutOfFrame, "Stop of %s is out of frame with CDS codons", key);
  }
}
22380 
/*
  NULL-terminated table of repeat type strings.
  NOTE(review): presumably the accepted values of the /rpt_type qualifier;
  the code that consumes this table is outside this section - confirm there.
*/
static CharPtr  legal_repeat_types[] = {
  "tandem",
  "inverted",
  "flanking",
  "nested",
  "terminal",
  "direct",
  "dispersed",
  "long_terminal_repeat",
  "non_LTR_retrotransposon_polymeric_tract",
  "X_element_combinatorial_repeat",
  "Y_prime_element",
  "telomeric_repeat",
  "centromeric_repeat",
  "engineered_foreign_repetitive_element",
  "other",
  NULL
};
22399 
/*
  NULL-terminated table covering every combination of YES / NO / ABSENT
  for the 5' and 3' splice site fields.
  NOTE(review): presumably the accepted /cons_splice qualifier values;
  the consumer is outside this section - confirm there.
*/
static CharPtr legal_cons_splice_strings [] = {
  "(5'site:YES, 3'site:YES)",
  "(5'site:YES, 3'site:NO)",
  "(5'site:YES, 3'site:ABSENT)",
  "(5'site:NO, 3'site:YES)",
  "(5'site:NO, 3'site:NO)",
  "(5'site:NO, 3'site:ABSENT)",
  "(5'site:ABSENT, 3'site:YES)",
  "(5'site:ABSENT, 3'site:NO)",
  "(5'site:ABSENT, 3'site:ABSENT)",
  NULL
};
22412 
/*
  NULL-terminated table of mobile element type names.
  NOTE(review): presumably the accepted type prefixes of the
  /mobile_element qualifier; the consumer is outside this section - confirm.
*/
static CharPtr legal_mobile_element_strings [] = {
  "transposon",
  "retrotransposon",
  "integron",
  "insertion sequence",
  "non-LTR retrotransposon",
  "SINE",
  "MITE",
  "LINE",
  "other",
  NULL
};
22425 
/*
  NULL-terminated table of modified base abbreviations.
  NOTE(review): presumably the accepted /mod_base qualifier values; whether
  the lookup is case sensitive (note the upper-case "OTHER") is decided by
  the consumer, which is outside this section - confirm there.
*/
static CharPtr  legal_modified_bases[] = {
  "ac4c",
  "chm5u",
  "cm",
  "cmnm5s2u",
  "cmnm5u",
  "d",
  "fm",
  "gal q",
  "gm",
  "i",
  "i6a",
  "m1a",
  "m1f",
  "m1g",
  "m1i",
  "m22g",
  "m2a",
  "m2g",
  "m3c",
  "m4c",
  "m5c",
  "m6a",
  "m7g",
  "mam5u",
  "mam5s2u",
  "man q",
  "mcm5s2u",
  "mcm5u",
  "mo5u",
  "ms2i6a",
  "ms2t6a",
  "mt6a",
  "mv",
  "o5u",
  "osyw",
  "p",
  "q",
  "s2c",
  "s2t",
  "s2u",
  "s4u",
  "t",
  "t6a",
  "tm",
  "um",
  "yw",
  "x",
  "OTHER",
  NULL
};
22477 
LookForECnumberPattern(CharPtr str)22478 NLM_EXTERN Boolean LookForECnumberPattern (CharPtr str)
22479 
22480 {
22481   Char     ch;
22482   Boolean  is_ambig;
22483   Int2     numdashes;
22484   Int2     numdigits;
22485   Int2     numperiods;
22486   CharPtr  ptr;
22487 
22488   if (StringHasNoText (str)) return FALSE;
22489 
22490   is_ambig = FALSE;
22491   numperiods = 0;
22492   numdigits = 0;
22493   numdashes = 0;
22494 
22495   ptr = str;
22496   ch = *ptr;
22497   while (ch != '\0') {
22498     if (IS_DIGIT (ch)) {
22499       numdigits++;
22500       if (is_ambig) {
22501         is_ambig = FALSE;
22502         numperiods = 0;
22503         numdashes = 0;
22504       }
22505       ptr++;
22506       ch = *ptr;
22507     } else if (ch == '-') {
22508       numdashes++;
22509       is_ambig = TRUE;
22510       ptr++;
22511       ch = *ptr;
22512     } else if (ch == 'n') {
22513       numdashes++;
22514       is_ambig = TRUE;
22515       ptr++;
22516       ch = *ptr;
22517     } else if (ch == '.') {
22518       numperiods++;
22519       if (numdigits > 0 && numdashes > 0) {
22520         is_ambig = FALSE;
22521         numperiods = 0;
22522         numdigits = 0;
22523         numdashes = 0;
22524       } else if (numdigits == 0 && numdashes == 0) {
22525         is_ambig = FALSE;
22526         numperiods = 0;
22527         numdigits = 0;
22528         numdashes = 0;
22529       } else if (numdashes > 1) {
22530         is_ambig = FALSE;
22531         numperiods = 0;
22532         numdigits = 0;
22533         numdashes = 0;
22534       }
22535       numdigits = 0;
22536       numdashes = 0;
22537       ptr++;
22538       ch = *ptr;
22539     } else {
22540       if (numperiods == 3) {
22541         if (numdigits > 0 && numdashes > 0) {
22542         is_ambig = FALSE;
22543         numperiods = 0;
22544         numdigits = 0;
22545         numdashes = 0;
22546         } else if (numdigits > 0 || numdashes == 1) return TRUE;
22547       }
22548       ptr++;
22549       ch = *ptr;
22550       is_ambig = FALSE;
22551       numperiods = 0;
22552       numdigits = 0;
22553       numdashes = 0;
22554     }
22555   }
22556 
22557   if (numperiods == 3) {
22558     if (numdigits > 0 && numdashes > 0) return FALSE;
22559     if (numdigits > 0 || numdashes == 1) return TRUE;
22560   }
22561 
22562   return FALSE;
22563 }
22564 
ValidateECnumber(CharPtr str)22565 NLM_EXTERN Boolean ValidateECnumber (CharPtr str)
22566 
22567 {
22568   Char     ch;
22569   Boolean  is_ambig;
22570   Int2     numdashes;
22571   Int2     numdigits;
22572   Int2     numperiods;
22573   CharPtr  ptr;
22574 
22575   if (StringHasNoText (str)) return FALSE;
22576 
22577   is_ambig = FALSE;
22578   numperiods = 0;
22579   numdigits = 0;
22580   numdashes = 0;
22581 
22582   ptr = str;
22583   ch = *ptr;
22584   while (ch != '\0') {
22585     if (IS_DIGIT (ch)) {
22586       numdigits++;
22587       if (is_ambig) return FALSE;
22588       ptr++;
22589       ch = *ptr;
22590     } else if (ch == '-') {
22591       numdashes++;
22592       is_ambig = TRUE;
22593       ptr++;
22594       ch = *ptr;
22595     } else if (ch == 'n') {
22596       if (numperiods == 3 && numdigits == 0 && IS_DIGIT(*(ptr + 1))) {
22597         /* allow/ignore n in first position of fourth number to not mean ambiguous, if followed by digit */
22598       } else {
22599         numdashes++;
22600         is_ambig = TRUE;
22601       }
22602       ptr++;
22603       ch = *ptr;
22604     } else if (ch == '.') {
22605       numperiods++;
22606       if (numdigits > 0 && numdashes > 0) return FALSE;
22607       if (numdigits == 0 && numdashes == 0) return FALSE;
22608       if (numdashes > 1) return FALSE;
22609       numdigits = 0;
22610       numdashes = 0;
22611       ptr++;
22612       ch = *ptr;
22613     } else {
22614       ptr++;
22615       ch = *ptr;
22616     }
22617   }
22618 
22619   if (numperiods == 3) {
22620     if (numdigits > 0 && numdashes > 0) return FALSE;
22621     if (numdigits > 0 || numdashes == 1) return TRUE;
22622   }
22623 
22624   return FALSE;
22625 }
22626 
ECNumberFSAFreeAll(void)22627 NLM_EXTERN void ECNumberFSAFreeAll (void)
22628 
22629 {
22630   CtrySetPtr  ctsp;
22631   TextFsaPtr  fsa;
22632 
22633   fsa = (TextFsaPtr) GetAppProperty ("SpecificECNumberFSA");
22634   if (fsa != NULL) {
22635     SetAppProperty ("SpecificECNumberFSA", NULL);
22636     TextFsaFree (fsa);
22637   }
22638 
22639   fsa = (TextFsaPtr) GetAppProperty ("AmbiguousECNumberFSA");
22640   if (fsa != NULL) {
22641     SetAppProperty ("AmbiguousECNumberFSA", NULL);
22642     TextFsaFree (fsa);
22643   }
22644 
22645   fsa = (TextFsaPtr) GetAppProperty ("DeletedECNumberFSA");
22646   if (fsa != NULL) {
22647     SetAppProperty ("DeletedECNumberFSA", NULL);
22648     TextFsaFree (fsa);
22649   }
22650 
22651   fsa = (TextFsaPtr) GetAppProperty ("ReplacedEECNumberFSA");
22652   if (fsa != NULL) {
22653     SetAppProperty ("ReplacedEECNumberFSA", NULL);
22654     TextFsaFree (fsa);
22655   }
22656 
22657   fsa = (TextFsaPtr) GetAppProperty ("BodiesOfWaterFSA");
22658   if (fsa != NULL) {
22659     SetAppProperty ("BodiesOfWaterFSA", NULL);
22660     TextFsaFree (fsa);
22661   }
22662 
22663   ctsp = (CtrySetPtr) GetAppProperty ("CountryLatLonData");
22664   if (ctsp != NULL) {
22665     SetAppProperty ("CountryLatLonData", NULL);
22666     FreeLatLonCountryData (ctsp);
22667   }
22668 
22669   ctsp = (CtrySetPtr) GetAppProperty ("WaterLatLonData");
22670   if (ctsp != NULL) {
22671     SetAppProperty ("WaterLatLonData", NULL);
22672     FreeLatLonCountryData (ctsp);
22673   }
22674 
22675   ic_code_data = MemFree (ic_code_data);
22676   ic_code_list = ValNodeFreeData (ic_code_list);
22677 }
22678 
GetECNumberFSA(ValidStructPtr vsp,CharPtr prop,CharPtr file,CharPtr PNTR local,size_t numitems,Boolean trimAtTab)22679 static TextFsaPtr GetECNumberFSA (ValidStructPtr vsp, CharPtr prop, CharPtr file, CharPtr PNTR local, size_t numitems, Boolean trimAtTab)
22680 
22681 {
22682   FileCache   fc;
22683   FILE        *fp = NULL;
22684   TextFsaPtr  fsa;
22685   Int2        i;
22686   Char        line [1024];
22687   Char        path [PATH_MAX];
22688   CharPtr     ptr;
22689   ErrSev      sev;
22690   CharPtr     str;
22691   Char        tmp [512];
22692   Boolean     use_data_dir_first = FALSE;
22693 
22694   fsa = (TextFsaPtr) GetAppProperty (prop);
22695   if (fsa != NULL) return fsa;
22696 
22697 #ifdef OS_UNIX
22698   str = (CharPtr) getenv ("NCBI_ECNUM_USE_DATA_DIR_FIRST");
22699   if (str != NULL && StringICmp (str, "TRUE") == 0) {
22700     use_data_dir_first = TRUE;
22701   }
22702 #endif
22703 
22704   if (use_data_dir_first) {
22705     if (FindPath ("ncbi", "ncbi", "data", path, sizeof (path))) {
22706       FileBuildPath (path, NULL, file);
22707       sev = ErrSetMessageLevel (SEV_ERROR);
22708       fp = FileOpen (path, "r");
22709       ErrSetMessageLevel (sev);
22710       if (fp == NULL && vsp != NULL) {
22711         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_EcNumberDataMissing, "Unable to use EC number file '%s' in data directory", file);
22712       }
22713     } else if (vsp != NULL) {
22714       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_EcNumberDataMissing, "Unable to find EC number file '%s' in data directory", file);
22715     }
22716   }
22717 
22718   fsa = TextFsaNew ();
22719   if (fsa != NULL) {
22720     if (fp != NULL) {
22721       FileCacheSetup (&fc, fp);
22722 
22723       str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
22724       while (str != NULL) {
22725         if (StringDoesHaveText (str)) {
22726           if (trimAtTab) {
22727             ptr = StringChr (str, '\t');
22728             if (ptr != NULL) {
22729               *ptr = '\0';
22730             }
22731           }
22732           if (StringLen (str) + 3 < sizeof (tmp)) {
22733             StringCpy (tmp, " ");
22734             StringCat (tmp, str);
22735             StringCat (tmp, " ");
22736             TextFsaAdd (fsa, tmp);
22737           }
22738         }
22739         str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
22740       }
22741 
22742     } else if (local != NULL) {
22743       for (i = 0; /* local [i] != NULL */ i < numitems; i++) {
22744         str = local [i];
22745         if (StringDoesHaveText (str)) {
22746           if (StringLen (str) + 3 < sizeof (tmp)) {
22747             StringCpy (tmp, " ");
22748             StringCat (tmp, str);
22749             if (trimAtTab) {
22750               ptr = StringChr (tmp, '\t');
22751               if (ptr != NULL) {
22752                 *ptr = '\0';
22753               }
22754             }
22755             StringCat (tmp, " ");
22756             TextFsaAdd (fsa, tmp);
22757           }
22758         }
22759       }
22760     }
22761   }
22762 
22763   if (fp != NULL) {
22764     FileClose (fp);
22765     using_ec_from_file = TRUE;
22766   }
22767 
22768   SetAppProperty (prop, (Pointer) fsa);
22769 
22770   return fsa;
22771 }
22772 
GetSpecificECNumberFSA(ValidStructPtr vsp)22773 static TextFsaPtr GetSpecificECNumberFSA (ValidStructPtr vsp)
22774 
22775 {
22776   return (GetECNumberFSA (vsp, "SpecificECNumberFSA", "ecnum_specific.txt", (CharPtr PNTR) kECNum_specific, sizeof (kECNum_specific) / sizeof (char*), TRUE));
22777 }
22778 
GetAmbiguousECNumberFSA(ValidStructPtr vsp)22779 static TextFsaPtr GetAmbiguousECNumberFSA (ValidStructPtr vsp)
22780 
22781 {
22782   return (GetECNumberFSA (vsp, "AmbiguousECNumberFSA", "ecnum_ambiguous.txt", (CharPtr PNTR) kECNum_ambiguous, sizeof (kECNum_ambiguous) / sizeof (char*), TRUE));
22783 }
22784 
GetDeletedECNumberFSA(ValidStructPtr vsp)22785 static TextFsaPtr GetDeletedECNumberFSA (ValidStructPtr vsp)
22786 
22787 {
22788   return (GetECNumberFSA (vsp, "DeletedECNumberFSA", "ecnum_deleted.txt", (CharPtr PNTR) kECNum_deleted, sizeof (kECNum_deleted) / sizeof (char*), TRUE));
22789 }
22790 
GetReplacedECNumberFSA(ValidStructPtr vsp)22791 static TextFsaPtr GetReplacedECNumberFSA (ValidStructPtr vsp)
22792 
22793 {
22794   return (GetECNumberFSA (vsp, "ReplacedEECNumberFSA", "ecnum_replaced.txt", (CharPtr PNTR) kECNum_replaced, sizeof (kECNum_replaced) / sizeof (char*), TRUE));
22795 }
22796 
ECnumberNotInList(CharPtr str)22797 NLM_EXTERN Boolean ECnumberNotInList (CharPtr str)
22798 
22799 {
22800   Char        ch;
22801   TextFsaPtr  fsa;
22802   ValNodePtr  matches;
22803   CharPtr     ptr;
22804   Int4        state;
22805 
22806   if (StringHasNoText (str)) return FALSE;
22807 
22808   fsa = GetSpecificECNumberFSA (NULL);
22809   if (fsa == NULL) return FALSE;
22810 
22811   state = 0;
22812   matches = NULL;
22813   state = TextFsaNext (fsa, state, ' ', &matches);
22814   for (ptr = str, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
22815     state = TextFsaNext (fsa, state, ch, &matches);
22816   }
22817   state = TextFsaNext (fsa, state, ' ', &matches);
22818   if (matches != NULL) return FALSE;
22819 
22820   fsa = GetAmbiguousECNumberFSA (NULL);
22821   if (fsa == NULL) return FALSE;
22822 
22823   state = 0;
22824   matches = NULL;
22825   state = TextFsaNext (fsa, state, ' ', &matches);
22826   for (ptr = str, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
22827     state = TextFsaNext (fsa, state, ch, &matches);
22828   }
22829   state = TextFsaNext (fsa, state, ' ', &matches);
22830   if (matches != NULL) return FALSE;
22831 
22832   return TRUE;
22833 }
22834 
ECnumberWasDeleted(CharPtr str)22835 NLM_EXTERN Boolean ECnumberWasDeleted (CharPtr str)
22836 
22837 {
22838   Char        ch;
22839   TextFsaPtr  fsa;
22840   ValNodePtr  matches;
22841   CharPtr     ptr;
22842   Int4        state;
22843 
22844   if (StringHasNoText (str)) return FALSE;
22845 
22846   fsa = GetDeletedECNumberFSA (NULL);
22847   if (fsa == NULL) return FALSE;
22848 
22849   state = 0;
22850   matches = NULL;
22851   state = TextFsaNext (fsa, state, ' ', &matches);
22852   for (ptr = str, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
22853     state = TextFsaNext (fsa, state, ch, &matches);
22854   }
22855   state = TextFsaNext (fsa, state, ' ', &matches);
22856   if (matches != NULL) return TRUE;
22857 
22858   return FALSE;
22859 }
22860 
ECnumberWasReplaced(CharPtr str)22861 NLM_EXTERN Boolean ECnumberWasReplaced (CharPtr str)
22862 
22863 {
22864   Char        ch;
22865   TextFsaPtr  fsa;
22866   ValNodePtr  matches;
22867   CharPtr     ptr;
22868   Int4        state;
22869 
22870   if (StringHasNoText (str)) return FALSE;
22871 
22872   fsa = GetReplacedECNumberFSA (NULL);
22873   if (fsa == NULL) return FALSE;
22874 
22875   state = 0;
22876   matches = NULL;
22877   state = TextFsaNext (fsa, state, ' ', &matches);
22878   for (ptr = str, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
22879     state = TextFsaNext (fsa, state, ch, &matches);
22880   }
22881   state = TextFsaNext (fsa, state, ' ', &matches);
22882   if (matches != NULL) return TRUE;
22883 
22884   return FALSE;
22885 }
22886 
/*
  EC number replacement - copied from Sequin, with protection against
  multiple reads if no replacement file is available
*/
22891 
/*
  One entry of the EC number replacement table.  SetupECReplacementTable
  fills it from a tab-delimited line: before is the text ahead of the
  first tab, after is everything following it (after may itself contain
  tabs, which GetECReplacement reports as a split).
*/
typedef struct ecrepdata {
  CharPtr  before;
  CharPtr  after;
} EcRepData, PNTR EcRepPtr;

/* linked list of EcRepPtr records, sorted by before once loaded */
static ValNodePtr     ec_rep_list = NULL;
/* array view of ec_rep_list used for binary search; NULL until loaded */
static EcRepPtr PNTR  ec_rep_data = NULL;
/* number of records in ec_rep_list / ec_rep_data */
static Int4           ec_rep_len = 0;
/* set once a load has been attempted, so an empty result is not retried */
static Boolean        ec_rep_read = FALSE;
22901 
SortVnpByEcBefore(VoidPtr ptr1,VoidPtr ptr2)22902 static int LIBCALLBACK SortVnpByEcBefore (VoidPtr ptr1, VoidPtr ptr2)
22903 
22904 {
22905   EcRepPtr    erp1, erp2;
22906   CharPtr     str1, str2;
22907   ValNodePtr  vnp1, vnp2;
22908 
22909   if (ptr1 == NULL || ptr2 == NULL) return 0;
22910   vnp1 = *((ValNodePtr PNTR) ptr1);
22911   vnp2 = *((ValNodePtr PNTR) ptr2);
22912   if (vnp1 == NULL || vnp2 == NULL) return 0;
22913   erp1 = (EcRepPtr) vnp1->data.ptrvalue;
22914   erp2 = (EcRepPtr) vnp2->data.ptrvalue;
22915   if (erp1 == NULL || erp2 == NULL) return 0;
22916   str1 = erp1->before;
22917   str2 = erp2->before;
22918   if (str1 == NULL || str2 == NULL) return 0;
22919   return StringCmp (str1, str2);
22920 }
22921 
SetupECReplacementTable(ValidStructPtr vsp,CharPtr file,CharPtr PNTR local,size_t numitems)22922 static void SetupECReplacementTable (ValidStructPtr vsp, CharPtr file, CharPtr PNTR local, size_t numitems)
22923 
22924 {
22925   Char        buf [256];
22926   EcRepPtr    erp;
22927   FileCache   fc;
22928   FILE        *fp = NULL;
22929   Int4        i;
22930   ValNodePtr  last = NULL;
22931   Char        line [512];
22932   Char        path [PATH_MAX];
22933   CharPtr     ptr;
22934   ErrSev      sev;
22935   CharPtr     str;
22936   Boolean     use_data_dir_first = FALSE;
22937   ValNodePtr  vnp;
22938 
22939   if (ec_rep_data != NULL) return;
22940   if (ec_rep_read) return;
22941 
22942 #ifdef OS_UNIX
22943   str = (CharPtr) getenv ("NCBI_ECNUM_USE_DATA_DIR_FIRST");
22944   if (str != NULL && StringICmp (str, "TRUE") == 0) {
22945     use_data_dir_first = TRUE;
22946   }
22947 #endif
22948 
22949   if (use_data_dir_first) {
22950     if (FindPath ("ncbi", "ncbi", "data", path, sizeof (path))) {
22951       FileBuildPath (path, NULL, file);
22952       sev = ErrSetMessageLevel (SEV_ERROR);
22953       fp = FileOpen (path, "r");
22954       ErrSetMessageLevel (sev);
22955       if (fp == NULL && vsp != NULL) {
22956         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_EcNumberDataMissing, "Unable to use EC number file '%s' in data directory", file);
22957       }
22958     } else if (vsp != NULL) {
22959       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_EcNumberDataMissing, "Unable to find EC number file '%s' in data directory", file);
22960     }
22961   }
22962 
22963   if (fp != NULL) {
22964     FileCacheSetup (&fc, fp);
22965 
22966     str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
22967     while (str != NULL) {
22968       if (StringDoesHaveText (str)) {
22969         ptr = StringChr (str, '\t');
22970         if (ptr != NULL) {
22971           *ptr = '\0';
22972           ptr++;
22973           /* obsolete - only replace if a single destination number, not a split from one to many */
22974           if (/* StringChr (ptr, '\t') == NULL */ TRUE) {
22975             erp = (EcRepPtr) MemNew (sizeof (EcRepData));
22976             if (erp != NULL) {
22977               erp->before = StringSave (str);
22978               erp->after = StringSave (ptr);
22979               vnp = ValNodeAddPointer (&last, 0, (Pointer) erp);
22980               if (ec_rep_list == NULL) {
22981                 ec_rep_list = vnp;
22982               }
22983               last = vnp;
22984             }
22985           }
22986         }
22987       }
22988       str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
22989     }
22990 
22991     FileClose (fp);
22992   } else if (local != NULL) {
22993     for (i = 0; /* local [i] != NULL */ i < numitems; i++) {
22994       str = local [i];
22995       if (StringDoesHaveText (str)) {
22996         StringNCpy_0 (buf, str, sizeof (buf));
22997         str = buf;
22998         ptr = StringChr (str, '\t');
22999         if (ptr != NULL) {
23000           *ptr = '\0';
23001           ptr++;
23002           /* obsolete - only replace if a single destination number, not a split from one to many */
23003           if (/* StringChr (ptr, '\t') == NULL */ TRUE) {
23004             erp = (EcRepPtr) MemNew (sizeof (EcRepData));
23005             if (erp != NULL) {
23006               erp->before = StringSave (str);
23007               erp->after = StringSave (ptr);
23008               vnp = ValNodeAddPointer (&last, 0, (Pointer) erp);
23009               if (ec_rep_list == NULL) {
23010                 ec_rep_list = vnp;
23011               }
23012               last = vnp;
23013             }
23014           }
23015         }
23016       }
23017     }
23018   }
23019 
23020   ec_rep_len = ValNodeLen (ec_rep_list);
23021   if (ec_rep_len > 0) {
23022     ec_rep_list = ValNodeSort (ec_rep_list, SortVnpByEcBefore);
23023     ec_rep_data = (EcRepPtr PNTR) MemNew (sizeof (EcRepPtr) * (ec_rep_len + 1));
23024     if (ec_rep_data != NULL) {
23025       for (vnp = ec_rep_list, i = 0; vnp != NULL; vnp = vnp->next, i++) {
23026         erp = (EcRepPtr) vnp->data.ptrvalue;
23027         ec_rep_data [i] = erp;
23028       }
23029     }
23030   }
23031 
23032   ec_rep_read = TRUE;
23033 }
23034 
GetECReplacement(CharPtr str,BoolPtr splitp)23035 static EcRepPtr GetECReplacement (CharPtr str, BoolPtr splitp)
23036 
23037 {
23038   EcRepPtr  erp;
23039   Int4      L, R, mid;
23040 
23041   if (StringHasNoText (str)) return NULL;
23042 
23043   L = 0;
23044   R = ec_rep_len - 1;
23045   while (L < R) {
23046     mid = (L + R) / 2;
23047     erp = ec_rep_data [(int) mid];
23048     if (erp != NULL && StringCmp (erp->before, str) < 0) {
23049       L = mid + 1;
23050     } else {
23051       R = mid;
23052     }
23053   }
23054   erp = ec_rep_data [(int) R];
23055   if (erp == NULL) return NULL;
23056   if (StringCmp (erp->before, str) != 0) return NULL;
23057 
23058   if (StringChr (erp->after, '\t') != NULL) {
23059     if (splitp != NULL) {
23060       *splitp = TRUE;
23061     }
23062   }
23063 
23064   return erp;
23065 }
23066 
EcCnumberWasSplit(ValidStructPtr vsp,CharPtr str)23067 static Boolean EcCnumberWasSplit (ValidStructPtr vsp, CharPtr str)
23068 
23069 {
23070   EcRepPtr  erp;
23071   Boolean   split;
23072 
23073   if (StringHasNoText (str)) return FALSE;
23074 
23075   SetupECReplacementTable (vsp, "ecnum_replaced.txt", (CharPtr PNTR) kECNum_replaced, sizeof (kECNum_replaced) / sizeof (char*));
23076 
23077   split = FALSE;
23078   erp = GetECReplacement (str, &split);
23079   if (erp == NULL) return FALSE;
23080 
23081   return split;
23082 }
23083 
23084 //LCOV_EXCL_START
SqnGetLocusTagFromProtRef(SeqFeatPtr sfp,CharPtr buf,size_t len)23085 static Boolean SqnGetLocusTagFromProtRef (SeqFeatPtr sfp, CharPtr buf, size_t len)
23086 
23087 {
23088   BioseqPtr          bsp;
23089   SeqFeatPtr         cds;
23090   SeqMgrFeatContext  fcontext;
23091   SeqFeatPtr         gene;
23092   GeneRefPtr         grp;
23093 
23094   if (sfp == NULL || buf == NULL) return FALSE;
23095   grp = SeqMgrGetGeneXref (sfp);
23096   if (grp != NULL) {
23097     if (SeqMgrGeneIsSuppressed (grp)) return FALSE;
23098     if (StringDoesHaveText (grp->locus_tag)) {
23099       StringNCpy_0 (buf, grp->locus_tag, len);
23100       return TRUE;
23101     } else if (StringDoesHaveText (grp->locus)) {
23102       StringNCpy_0 (buf, grp->locus, len);
23103       return TRUE;
23104     }
23105   }
23106   bsp = BioseqFindFromSeqLoc (sfp->location);
23107   if (bsp == NULL) return FALSE;
23108   cds = SeqMgrGetCDSgivenProduct (bsp, &fcontext);
23109   if (cds == NULL) return FALSE;
23110   grp = SeqMgrGetGeneXref (cds);
23111   if (grp != NULL) {
23112     if (SeqMgrGeneIsSuppressed (grp)) return FALSE;
23113     if (StringDoesHaveText (grp->locus_tag)) {
23114       StringNCpy_0 (buf, grp->locus_tag, len);
23115       return TRUE;
23116     } else if (StringDoesHaveText (grp->locus)) {
23117       StringNCpy_0 (buf, grp->locus, len);
23118       return TRUE;
23119     }
23120   }
23121   gene = SeqMgrGetOverlappingGene (cds->location, &fcontext);
23122   if (gene == NULL || gene->data.choice != SEQFEAT_GENE) return FALSE;
23123   grp = (GeneRefPtr) gene->data.value.ptrvalue;
23124   if (grp != NULL) {
23125     if (SeqMgrGeneIsSuppressed (grp)) return FALSE;
23126     if (StringDoesHaveText (grp->locus_tag)) {
23127       StringNCpy_0 (buf, grp->locus_tag, len);
23128       return TRUE;
23129     } else if (StringDoesHaveText (grp->locus)) {
23130       StringNCpy_0 (buf, grp->locus, len);
23131       return TRUE;
23132     }
23133   }
23134   return FALSE;
23135 }
23136 
SqnGetIdentifierForProtRef(SeqFeatPtr sfp,CharPtr buf,size_t len)23137 static Boolean SqnGetIdentifierForProtRef (SeqFeatPtr sfp, CharPtr buf, size_t len)
23138 
23139 {
23140   BioseqPtr   bsp;
23141 
23142   if (sfp == NULL || buf == NULL) return FALSE;
23143 
23144   if (SqnGetLocusTagFromProtRef (sfp, buf, len)) return TRUE;
23145 
23146   bsp = BioseqFindFromSeqLoc (sfp->location);
23147   if (bsp == NULL) return FALSE;
23148   SeqIdWrite (bsp->id, buf, PRINTID_REPORT, len);
23149 
23150   return TRUE;
23151 }
23152 
23153 
/*
  Context handed to the per-feature EC number callbacks
  (UpdateProtEC, DeleteBadProtEC).
*/
typedef struct ecdata {
  Int4             count;         /* number of EC entries changed or blanked */
  ValNodePtr PNTR  head;          /* optional report list; no report lines when NULL */
  ValNodePtr PNTR  tail;          /* tail pointer passed with head to ValNodeCopyStrEx */
  Boolean          only_unambig;  /* when TRUE, split replacements are skipped entirely */
  Boolean          justwarn;      /* when TRUE, report only and leave the data unchanged */
} EcData, PNTR EcDataPtr;
23161 
/*
  Feature callback: applies the EC replacement table to every EC number
  of a protein feature.

  sfp       - feature being visited; ignored unless it is a protein
              feature with EC numbers
  userdata  - EcDataPtr with options, report list and change counter

  For each EC number that has a replacement record:
    - split replacement (one number became several): skipped when
      only_unambig is set; otherwise reported and, unless justwarn is
      set, the entry is replaced by an empty string
    - single replacement: the chain of replacements is followed (at most
      10 steps, as a guard against cycles), the result is reported and,
      unless justwarn is set, stored in place of the old number
  Report lines are tab-delimited and are only produced when a report
  list was supplied; they are truncated to fit the local buffer.
  count is incremented for every entry that is modified.
*/
static void UpdateProtEC (SeqFeatPtr sfp, Pointer userdata)

{
  Char        buf [256];
  EcDataPtr   edp;
  EcRepPtr    erp;
  Char        id [64];
  Int2        inf_loop_check;
  CharPtr     nxt;
  ProtRefPtr  prp;
  CharPtr     rep;
  Boolean     split;
  CharPtr     str;
  ValNodePtr  vnp;

  if (sfp == NULL || sfp->data.choice != SEQFEAT_PROT) return;
  prp = (ProtRefPtr) sfp->data.value.ptrvalue;
  if (prp == NULL || prp->ec == NULL) return;
  edp = (EcDataPtr) userdata;
  if (edp == NULL) return;

  /* identifier is computed lazily, once per feature, on the first report line */
  id [0] = '\0';
  for (vnp = prp->ec; vnp != NULL; vnp = vnp->next) {
    str = (CharPtr) vnp->data.ptrvalue;
    if (StringHasNoText (str)) continue;
    split = FALSE;
    erp = GetECReplacement (str, &split);
    if (erp == NULL) continue;
    rep = erp->after;
    if (rep == NULL) continue;
    if (split) {
      if (edp->only_unambig) continue;
      if (edp->head != NULL) {
        if (id [0] == '\0') {
          SqnGetIdentifierForProtRef (sfp, id, sizeof (id));
        }
        /* bounded formatting: table entries can be longer than buf */
        if (edp->justwarn) {
          snprintf (buf, sizeof (buf), "%s\tEC number split\t%s\t%s", id, str, rep);
        } else {
          snprintf (buf, sizeof (buf), "%s\tdeleted split EC number\t%s\t%s", id, str, rep);
        }
        ValNodeCopyStrEx (edp->head, edp->tail, 0, buf);
      }
      if (edp->justwarn) continue;
      /* delete split records here */
      vnp->data.ptrvalue = MemFree (vnp->data.ptrvalue);
      vnp->data.ptrvalue = StringSave ("");
      (edp->count)++;
    } else {
      /* follow the replacement chain to its last entry */
      nxt = rep;
      inf_loop_check = 0;
      while (nxt != NULL && inf_loop_check < 10) {
        rep = nxt;
        inf_loop_check++;
        erp = GetECReplacement (rep, &split);
        if (erp != NULL) {
          nxt = erp->after;
        } else {
          nxt = NULL;
        }
      }
      if (edp->head != NULL) {
        if (id [0] == '\0') {
          SqnGetIdentifierForProtRef (sfp, id, sizeof (id));
        }
        if (edp->justwarn) {
          snprintf (buf, sizeof (buf), "%s\tEC number changed\t%s\t%s", id, str, rep);
        } else {
          snprintf (buf, sizeof (buf), "%s\treplaced EC number\t%s\t%s", id, str, rep);
        }
        ValNodeCopyStrEx (edp->head, edp->tail, 0, buf);
      }
      if (edp->justwarn) continue;
      /* do unambiguous replacement here */
      vnp->data.ptrvalue = MemFree (vnp->data.ptrvalue);
      vnp->data.ptrvalue = StringSave (rep);
      (edp->count)++;
    }
  }
}
23242 
UpdateReplacedECNumbersEx(SeqEntryPtr sep,ValNodePtr PNTR head,ValNodePtr PNTR tail,Boolean only_unambig,Boolean justwarn)23243 NLM_EXTERN Int4 UpdateReplacedECNumbersEx (SeqEntryPtr sep, ValNodePtr PNTR head, ValNodePtr PNTR tail, Boolean only_unambig, Boolean justwarn)
23244 
23245 {
23246   EcData  ed;
23247 
23248   if (sep == NULL) return 0;
23249 
23250   MemSet ((Pointer) &ed, 0, sizeof (EcData));
23251   ed.count = 0;
23252   ed.head = head;
23253   ed.tail = tail;
23254   ed.only_unambig = only_unambig;
23255   ed.justwarn = justwarn;
23256 
23257   SetupECReplacementTable (NULL, "ecnum_replaced.txt", (CharPtr PNTR) kECNum_replaced, sizeof (kECNum_replaced) / sizeof (char*));
23258   if (ec_rep_data != NULL && ec_rep_len > 0) {
23259     VisitFeaturesInSep (sep, (Pointer) &ed, UpdateProtEC);
23260   }
23261 
23262   return ed.count;
23263 }
23264 
UpdateReplacedECNumbers(SeqEntryPtr sep)23265 NLM_EXTERN Int4 UpdateReplacedECNumbers (SeqEntryPtr sep)
23266 
23267 {
23268   return UpdateReplacedECNumbersEx (sep, NULL, NULL, FALSE, FALSE);
23269 }
23270 
/*
  Feature callback: reports and/or blanks EC numbers of a protein
  feature that are malformed (ValidateECnumber fails) or not in the
  known lists (ECnumberNotInList).

  sfp       - feature being visited; ignored unless it is a protein
              feature with EC numbers
  userdata  - EcDataPtr with options, report list and change counter

  Report lines are tab-delimited and only produced when a report list
  was supplied.  They are formatted with a size bound because the EC
  string comes from the record and can be arbitrarily long; over-long
  lines are truncated.  Unless justwarn is set, an offending entry is
  replaced by an empty string and count is incremented.
*/
static void DeleteBadProtEC (SeqFeatPtr sfp, Pointer userdata)

{
  Char        buf [256];
  EcDataPtr   edp;
  Char        id [64];
  ProtRefPtr  prp;
  CharPtr     str;
  ValNodePtr  vnp;

  if (sfp == NULL || sfp->data.choice != SEQFEAT_PROT) return;
  prp = (ProtRefPtr) sfp->data.value.ptrvalue;
  if (prp == NULL || prp->ec == NULL) return;
  edp = (EcDataPtr) userdata;
  if (edp == NULL) return;

  /* identifier is computed lazily, once per feature, on the first report line */
  id [0] = '\0';
  for (vnp = prp->ec; vnp != NULL; vnp = vnp->next) {
    str = (CharPtr) vnp->data.ptrvalue;
    if (StringHasNoText (str)) continue;
    if (! ValidateECnumber (str)) {
      if (edp->head != NULL) {
        if (id [0] == '\0') {
          SqnGetIdentifierForProtRef (sfp, id, sizeof (id));
        }
        if (edp->justwarn) {
          snprintf (buf, sizeof (buf), "%s\tEC number malformed\t%s", id, str);
        } else {
          snprintf (buf, sizeof (buf), "%s\tdeleted malformed EC number\t%s", id, str);
        }
        ValNodeCopyStrEx (edp->head, edp->tail, 0, buf);
      }
      if (edp->justwarn) continue;
      /* delete bad format here */
      vnp->data.ptrvalue = MemFree (vnp->data.ptrvalue);
      vnp->data.ptrvalue = StringSave ("");
      (edp->count)++;
      continue;
    }
    if (ECnumberNotInList (str)) {
      if (edp->head != NULL) {
        if (id [0] == '\0') {
          SqnGetIdentifierForProtRef (sfp, id, sizeof (id));
        }
        if (edp->justwarn) {
          snprintf (buf, sizeof (buf), "%s\tEC number invalid\t%s", id, str);
        } else {
          snprintf (buf, sizeof (buf), "%s\tdeleted invalid EC number\t%s", id, str);
        }
        ValNodeCopyStrEx (edp->head, edp->tail, 0, buf);
      }
      if (edp->justwarn) continue;
      /* delete illegal number here */
      vnp->data.ptrvalue = MemFree (vnp->data.ptrvalue);
      vnp->data.ptrvalue = StringSave ("");
      (edp->count)++;
    }
  }
}
23330 
DeleteBadECNumbersEx(SeqEntryPtr sep,ValNodePtr PNTR head,ValNodePtr PNTR tail,Boolean justwarn)23331 NLM_EXTERN Int4 DeleteBadECNumbersEx (SeqEntryPtr sep, ValNodePtr PNTR head, ValNodePtr PNTR tail, Boolean justwarn)
23332 
23333 {
23334   EcData  ed;
23335 
23336   if (sep == NULL) return 0;
23337 
23338   MemSet ((Pointer) &ed, 0, sizeof (EcData));
23339   ed.count = 0;
23340   ed.head = head;
23341   ed.tail = tail;
23342   ed.justwarn = justwarn;
23343 
23344   VisitFeaturesInSep (sep, (Pointer) &ed, DeleteBadProtEC);
23345 
23346   return ed.count;
23347 }
23348 
DeleteBadECNumbers(SeqEntryPtr sep)23349 NLM_EXTERN Int4 DeleteBadECNumbers (SeqEntryPtr sep)
23350 
23351 {
23352   return DeleteBadECNumbersEx (sep, NULL, NULL, FALSE);
23353 }
23354 //LCOV_EXCL_STOP
23355 
23356 
23357 
RptUnitIsBaseRange(CharPtr str,Int4Ptr fromP,Int4Ptr toP)23358 static Boolean RptUnitIsBaseRange (CharPtr str, Int4Ptr fromP, Int4Ptr toP)
23359 
23360 {
23361   CharPtr   ptr;
23362   Char      tmp [32];
23363   long int  val;
23364 
23365   if (StringLen (str) > 25) return FALSE;
23366   StringNCpy_0 (tmp, str, sizeof (tmp));
23367   ptr = StringStr (tmp, "..");
23368   if (ptr == NULL) return FALSE;
23369   *ptr = '\0';
23370   if (StringHasNoText (tmp)) return FALSE;
23371   if (sscanf (tmp, "%ld", &val) != 1 || val < 1) return FALSE;
23372   if (fromP != NULL) {
23373     *fromP = val - 1;
23374   }
23375   ptr += 2;
23376   if (StringHasNoText (ptr)) return FALSE;
23377   if (sscanf (ptr, "%ld", &val) != 1 || val < 1) return FALSE;
23378   if (toP != NULL) {
23379     *toP = val - 1;
23380   }
23381   return TRUE;
23382 }
23383 
/*
 * ValidatePseudogene
 *
 * Checks the value of a /pseudogene qualifier.  A value with no text gets
 * one warning; any other value that is not one of the accepted keywords
 * (compared without regard to case) gets another.  Nothing is reported
 * when any argument is NULL or when the qualifier has no value at all.
 */
static void ValidatePseudogene (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPtr sfp, GBQualPtr gbqual)

{
  static CharPtr  accepted [] = {
    "processed",
    "unprocessed",
    "unitary",
    "allelic",
    "unknown",
    NULL
  };
  Int2     k;
  CharPtr  text;

  if (vsp == NULL || gcp == NULL || sfp == NULL || gbqual == NULL) return;
  text = gbqual->val;
  if (text == NULL) return;

  if (StringHasNoText (text)) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidQualifierValue, "/pseudogene value should be not empty");
    return;
  }

  /* any accepted keyword ends the check silently */
  for (k = 0; accepted [k] != NULL; k++) {
    if (StringICmp (text, accepted [k]) == 0) return;
  }

  ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidQualifierValue, "/pseudogene value should be not '%s'", text);
}
23406 
23407 
/*
 * ValidateRptUnit
 *
 * Validates one rpt_unit-style qualifier on feature sfp.  Three independent
 * checks are made, selected by qual and key:
 *
 *   1. rpt_unit_seq on a repeat_region, when the value has no parentheses,
 *      commas, periods or digits: the value must not be longer than the
 *      feature, and if it consists only of ACGTN letters it must occur
 *      (case-insensitively) in the feature's sequence.
 *   2. rpt_unit_range: the value must parse as a base range and lie inside
 *      the feature; for locations with nulls between intervals it must lie
 *      inside a single interval.
 *   3. rpt_unit_seq on any feature: only letters, digits, parentheses,
 *      commas and semicolons are allowed.
 *
 * Returns immediately if any pointer argument, or the qualifier value, is
 * NULL.
 */
static void ValidateRptUnit (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPtr sfp, GBQualPtr gbqual, Int2 qual, CharPtr key)

{
  Boolean            has_bad_char, is_compound, only_bases, inside_interval;
  Char               ch;
  SeqMgrFeatContext  fcontext;
  Int4               from = -1, to = -1, lo, hi, swap;
  CharPtr            cp, seq;
  SeqLocPtr          part;

  if (vsp == NULL || gcp == NULL || sfp == NULL || gbqual == NULL || gbqual->val == NULL || key == NULL) return;

  /* punctuation or digits mean the value is not one plain sequence literal */
  is_compound = FALSE;
  for (cp = gbqual->val; *cp != '\0'; cp++) {
    ch = *cp;
    if (ch == '(' || ch == ')' || ch == ',' || ch == '.' || IS_DIGIT (ch)) {
      is_compound = TRUE;
    }
  }

  /* check 1: a plain rpt_unit_seq on a repeat_region is compared with the sequence */
  if (StringICmp (key,"repeat_region") == 0 && qual == GBQUAL_rpt_unit_seq && ! is_compound) {
    if (StringLen (gbqual->val) > SeqLocLen (sfp->location)) {
      ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_InvalidQualifierValue, "Length of rpt_unit_seq is greater than feature length");
    } else {
      only_bases = TRUE;
      for (cp = gbqual->val; *cp != '\0' && only_bases; cp++) {
        if (StringChr ("ACGTNacgtn", *cp) == NULL) {
          only_bases = FALSE;
        }
      }
      /* values with other characters are left to check 3 below */
      if (only_bases) {
        seq = GetSequenceByFeature (sfp);
        if (seq != NULL) {
          if (StringISearch (seq, gbqual->val) == NULL) {
            ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidQualifierValue, "repeat_region /rpt_unit and underlying sequence do not match");
          }
          MemFree (seq);
        }
      }
    }
  }

  /* check 2: rpt_unit_range must be a base range located within the feature */
  if (qual == GBQUAL_rpt_unit_range) {
    if (! RptUnitIsBaseRange (gbqual->val, &from, &to)) {
      //LCOV_EXCL_START
      //if not base range, BasicCleanup converts qual to rpt_unit_seq
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidQualifierValue, "/rpt_unit_range is not a base range");
      //LCOV_EXCL_STOP
    } else if (SeqMgrGetDesiredFeature (sfp->idx.entityID, NULL, 0, 0, sfp, &fcontext) == sfp) {
      if (from < fcontext.left || from > fcontext.right || to < fcontext.left || to > fcontext.right) {
        /* could be segmented sequence, so retry against the location's own extremes */
        lo = SeqLocStart (sfp->location);
        hi = SeqLocStop (sfp->location);
        if (lo > hi) {
          swap = lo;
          lo = hi;
          hi = swap;
        }
        if (from < lo || from > hi || to < lo || to > hi) {
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_RptUnitRangeProblem, "/rpt_unit_range is not within sequence length");
        }
      } else if (LocationHasNullsBetween (sfp->location)) {
        /* both endpoints have to fall inside one and the same interval */
        inside_interval = FALSE;
        for (part = SeqLocFindNext (sfp->location, NULL);
             part != NULL;
             part = SeqLocFindNext (sfp->location, part)) {
          lo = SeqLocStart (part);
          hi = SeqLocStop (part);
          if (lo > hi) {
            swap = lo;
            lo = hi;
            hi = swap;
          }
          if (from >= lo && from <= hi && to >= lo && to <= hi) {
            inside_interval = TRUE;
          }
        }
        if (! inside_interval) {
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_RptUnitRangeProblem, "/rpt_unit_range is not within ordered intervals");
        }
      }
    }
  }

  /* check 3: character set allowed in any rpt_unit_seq value */
  if (qual == GBQUAL_rpt_unit_seq) {
    has_bad_char = FALSE;
    for (cp = gbqual->val; *cp != '\0'; cp++) {
      ch = *cp;
      if (ch <= ' ' ||
          ! (ch == '(' || ch == ')' || IS_DIGIT (ch) || IS_ALPHA (ch) || ch == ',' || ch == ';')) {
        has_bad_char = TRUE;
      }
    }
    if (has_bad_char) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidQualifierValue, "/rpt_unit_seq has illegal characters");
    }
  }
}
23523 
23524 
IsGbIndexQualPairValid(Int2 index,Int2 val)23525 static Boolean IsGbIndexQualPairValid (Int2 index, Int2 val)
23526 {
23527   Int2    i;
23528   Boolean found = FALSE;
23529 
23530   for (i = 0; i < ParFlat_GBFeat[index].opt_num && !found; i++) {
23531     if (ParFlat_GBFeat[index].opt_qual[i] == val) {
23532       found = TRUE;
23533     }
23534   }
23535   for (i = 0; i < ParFlat_GBFeat[index].mand_num && !found; i++) {
23536     if (ParFlat_GBFeat[index].mand_qual[i] == val) {
23537       found = TRUE;
23538     }
23539   }
23540   return found;
23541 }
23542 
23543 
GetGBFeatKeyForFeature(SeqFeatPtr sfp)23544 NLM_EXTERN CharPtr GetGBFeatKeyForFeature (SeqFeatPtr sfp)
23545 {
23546   CharPtr key = NULL;
23547   ImpFeatPtr ifp;
23548 
23549   if (sfp == NULL) {
23550     return NULL;
23551   }
23552 
23553   if (sfp->data.choice == SEQFEAT_IMP) {
23554     ifp = (ImpFeatPtr) sfp->data.value.ptrvalue;
23555     if (StringCmp (ifp->key, "-") == 0) {
23556       key = StringSave ("misc_feature");
23557     } else {
23558       key = StringSaveNoNull (ifp->key);
23559     }
23560   } else {
23561     key = StringSaveNoNull (FeatDefTypeLabel (sfp));
23562     if (StringCmp (key, "Gene") == 0) {
23563       *key = 'g';
23564     } else if (StringCmp (key, "preRNA") == 0) {
23565       key = MemFree (key);
23566       key = StringSave ("precursor_RNA");
23567     }
23568   }
23569   return key;
23570 }
23571 
23572 //LCOV_EXCL_START
23573 //not used for validation
ShouldSuppressGBQual(Uint1 subtype,CharPtr qual_name)23574 NLM_EXTERN Boolean ShouldSuppressGBQual(Uint1 subtype, CharPtr qual_name)
23575 {
23576   if (StringHasNoText (qual_name)) {
23577     return FALSE;
23578   }
23579 
23580   /* always suppress experiment and inference quals, and now pseudogene */
23581   if (StringCmp (qual_name, "experiment") == 0 ||
23582       StringCmp (qual_name, "inference") == 0 ||
23583       StringCmp (qual_name, "pseudogene") == 0) {
23584     return TRUE;
23585   }
23586 
23587   if (subtype == FEATDEF_ncRNA) {
23588     if (StringCmp (qual_name, "product") == 0
23589         || StringCmp (qual_name, "ncRNA_class") == 0) {
23590       return TRUE;
23591     }
23592   } else if (subtype == FEATDEF_tmRNA) {
23593     if (StringCmp (qual_name, "product") == 0
23594         || StringCmp (qual_name, "tag_peptide") == 0) {
23595       return TRUE;
23596     }
23597   } else if (subtype == FEATDEF_otherRNA) {
23598     if (StringCmp (qual_name, "product") == 0) {
23599       return TRUE;
23600     }
23601   }
23602 
23603   return FALSE;
23604 }
23605 
23606 //not used for validation
ShouldBeAGBQual(Uint1 subtype,Int2 qual,Boolean allowProductGBQual)23607 NLM_EXTERN Boolean ShouldBeAGBQual (Uint1 subtype, Int2 qual, Boolean allowProductGBQual)
23608 
23609 {
23610   if (qual < 0) return FALSE;
23611   if (allowProductGBQual && qual == GBQUAL_product) return TRUE;
23612   if (qual == GBQUAL_citation ||
23613       qual == GBQUAL_db_xref ||
23614       qual == GBQUAL_evidence ||
23615       qual == GBQUAL_exception ||
23616       qual == GBQUAL_gene ||
23617       qual == GBQUAL_gene_synonym ||
23618       qual == GBQUAL_insertion_seq ||
23619       qual == GBQUAL_label ||
23620       qual == GBQUAL_locus_tag ||
23621       qual == GBQUAL_non_functional ||
23622       qual == GBQUAL_note ||
23623       qual == GBQUAL_partial ||
23624       qual == GBQUAL_product ||
23625       qual == GBQUAL_pseudo ||
23626       qual == GBQUAL_pseudogene ||
23627       qual == GBQUAL_rpt_unit ||
23628       qual == GBQUAL_transposon ||
23629       qual == GBQUAL_experiment ||
23630       qual == GBQUAL_trans_splicing ||
23631       qual == GBQUAL_ribosomal_slippage ||
23632       qual == GBQUAL_standard_name ||
23633       qual == GBQUAL_inference)
23634   {
23635     return FALSE;
23636   }
23637   if (subtype == FEATDEF_CDS)
23638   {
23639     if (qual == GBQUAL_codon_start
23640         || qual == GBQUAL_codon
23641         || qual == GBQUAL_EC_number
23642         || qual == GBQUAL_gdb_xref
23643         || qual == GBQUAL_number
23644         || qual == GBQUAL_protein_id
23645         || qual == GBQUAL_transl_except
23646         || qual == GBQUAL_transl_table
23647         || qual == GBQUAL_translation
23648         || qual == GBQUAL_allele
23649         || qual == GBQUAL_function
23650         || qual == GBQUAL_old_locus_tag)
23651     {
23652       return FALSE;
23653     }
23654   }
23655   if (qual == GBQUAL_map && subtype != FEATDEF_ANY && subtype != FEATDEF_repeat_region && subtype != FEATDEF_gap) return FALSE;
23656   if (qual == GBQUAL_operon && subtype != FEATDEF_ANY && subtype != FEATDEF_operon) return FALSE;
23657   if (Nlm_GetAppProperty ("SequinUseEMBLFeatures") == NULL)
23658   {
23659     if (qual == GBQUAL_usedin)
23660     {
23661       return FALSE;
23662     }
23663   }
23664 
23665   if (qual > -1 && ShouldSuppressGBQual (subtype, ParFlat_GBQual_names [qual].name)) {
23666     return FALSE;
23667   }
23668 
23669   return TRUE;
23670 }
23671 //LCOV_EXCL_STOP
23672 
/*
 * Explanations for a qualifier that is wrong for a special reason.
 * The strings are listed in the same order as the EWrongQualReason
 * values below, so presumably an EWrongQualReason value is meant to be
 * used as the index into this array (no use is visible nearby - confirm).
 */
static CharPtr sWrongQualReasons[] = {
  "conflicting codon_start values",
  "codon_start value should be 1, 2, or 3"
};

/*
 * Special reasons for rejecting a qualifier.  IsQualValidForFeature
 * reports one of these by returning the enum value plus 2.
 */
typedef enum {
  eWrongQualReason_conflicting_codon_start = 0,
  eWrongQualReason_bad_codon_start_value
} EWrongQualReason;
23682 
23683 /*
23684  * Return values:
23685  * 1: yes
23686  * 0: no
23687  * -1: don't know
23688  * 2: no for special reasons
23689  */
IsQualValidForFeature(GBQualPtr gbqual,SeqFeatPtr sfp)23690 NLM_EXTERN Int4 IsQualValidForFeature (GBQualPtr gbqual, SeqFeatPtr sfp)
23691 {
23692   CharPtr     key = NULL;
23693   Int2        val;
23694   Int4        rval = -1;
23695   Int2        index;
23696   CdRegionPtr crp;
23697 
23698   if (sfp == NULL || gbqual == NULL) {
23699     return -1;
23700   }
23701 
23702   key = GetGBFeatKeyForFeature (sfp);
23703   index = GBFeatKeyNameValid (&key, FALSE);
23704   key = MemFree (key);
23705 
23706   if (index == -1) {
23707     /* unknown */
23708     rval = -1;
23709   } else if (StringCmp (gbqual->qual, "gsdb_id") == 0) {
23710     /* force good */
23711     rval = 1;
23712   } else if (sfp->data.choice == SEQFEAT_GENE &&
23713              (StringCmp (gbqual->qual, "gen_map") == 0 ||
23714               StringCmp (gbqual->qual, "cyt_map") == 0 ||
23715               StringCmp (gbqual->qual, "rad_map") == 0)) {
23716     rval = 1;
23717   } else if (sfp->data.choice == SEQFEAT_CDREGION
23718              && StringCmp (gbqual->qual, "orig_transcript_id") == 0) {
23719     rval = 1;
23720   } else if (sfp->data.choice == SEQFEAT_RNA &&
23721              (StringCmp (gbqual->qual, "orig_protein_id") == 0 ||
23722               StringCmp (gbqual->qual, "orig_transcript_id") == 0)) {
23723     rval = 1;
23724   } else if ((val = GBQualNameValid (gbqual->qual)) == -1) {
23725     rval = -1;
23726   } else if (sfp->data.choice == SEQFEAT_CDREGION
23727              && val == GBQUAL_codon_start) {
23728     crp = (CdRegionPtr) sfp->data.value.ptrvalue;
23729     if (crp != NULL) {
23730       if (crp->frame > 0) {
23731         rval = eWrongQualReason_conflicting_codon_start + 2;
23732       } else {
23733         rval = eWrongQualReason_bad_codon_start_value + 2;
23734       }
23735     }
23736   } else if (IsGbIndexQualPairValid (index, val)) {
23737     rval = 1;
23738   } else {
23739     rval = 0;
23740   }
23741   return rval;
23742 }
23743 
/*
 * AssemblyGapFeatValidate
 *
 * Examines the sequence under an assembly_gap feature and its two
 * neighbouring residues.  Works on a copy of the feature location that is
 * widened by one position at each end; the feature itself is not modified.
 *
 * Only single-interval locations that leave at least one residue free on
 * both sides of the gap within bsp are examined.  Two conditions are
 * reported:
 *   - the first and/or last residue of the widened sequence is 'N'
 *     (severity SEV_ERROR when vsp->genomeSubmission is set, otherwise
 *     SEV_WARNING)
 *   - any residue strictly between those two is something other than 'N'
 *     (always SEV_ERROR)
 *
 * Returns without reporting if any argument or the feature location is
 * NULL, or if the location cannot be copied.
 */
static void AssemblyGapFeatValidate (
  ValidStructPtr vsp,
  GatherContextPtr gcp,
  SeqFeatPtr sfp,
  BioseqPtr bsp
)

{
  CharPtr    bases;
  ErrSev     flank_sev;
  Boolean    has_real_base;
  SeqIntPtr  ival;
  size_t     n;
  Boolean    n_at_end;
  Boolean    n_at_start;
  size_t     pos;
  SeqLocPtr  widened;

  if (vsp == NULL || gcp == NULL || sfp == NULL || sfp->location == NULL || bsp == NULL) return;

  /* private copy, so that stretching the interval leaves the feature alone */
  widened = (SeqLocPtr) AsnIoMemCopy ((Pointer) sfp->location, (AsnReadFunc) SeqLocAsnRead, (AsnWriteFunc) SeqLocAsnWrite);
  if (widened == NULL) return;

  flank_sev = vsp->genomeSubmission ? SEV_ERROR : SEV_WARNING;

  ival = NULL;
  if (widened->choice == SEQLOC_INT) {
    ival = (SeqIntPtr) widened->data.ptrvalue;
  }

  if (ival != NULL && ival->from > 0 && ival->to < bsp->length - 1) {
    /* take in one extra residue on either side */
    ival->from -= 1;
    ival->to += 1;

    bases = GetSequenceByLocation (widened);
    if (bases != NULL) {
      n = StringLen (bases);
      if (n > 0 && n == SeqLocLen (widened)) {

        /* the two outermost residues are the ones that were just added */
        n_at_start = (Boolean) (bases [0] == 'N');
        n_at_end = (Boolean) (bases [n - 1] == 'N');
        if (n_at_start && n_at_end) {
          ValidErr (vsp, flank_sev, ERR_SEQ_FEAT_AssemblyGapAdjacentToNs, "Assembly_gap flanked by Ns on 5' and 3' sides");
        } else if (n_at_start) {
          ValidErr (vsp, flank_sev, ERR_SEQ_FEAT_AssemblyGapAdjacentToNs, "Assembly_gap flanked by Ns on 5' side");
        } else if (n_at_end) {
          ValidErr (vsp, flank_sev, ERR_SEQ_FEAT_AssemblyGapAdjacentToNs, "Assembly_gap flanked by Ns on 3' side");
        }

        /* everything in between belongs to the gap and should be N */
        has_real_base = FALSE;
        for (pos = 1; pos + 1 < n; pos++) {
          if (bases [pos] != 'N') {
            has_real_base = TRUE;
            break;
          }
        }
        if (has_real_base) {
          ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_AssemblyGapCoversSequence, "Assembly_gap extends into sequence");
        }
      }
    }
    MemFree (bases);
  }

  SeqLocFree (widened);
}
23813 
ValidateImpFeat(ValidStructPtr vsp,GatherContextPtr gcp,SeqFeatPtr sfp,ImpFeatPtr ifp)23814 static void ValidateImpFeat (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPtr sfp, ImpFeatPtr ifp)
23815 
23816 {
23817   Int2            adv;
23818   BioseqPtr       bsp;
23819   SeqFeatPtr      cds;
23820   Char            ch;
23821   Boolean         failed;
23822   Boolean         found;
23823   IntFuzzPtr      fuzz;
23824   GBQualPtr       gbqual;
23825   SeqMgrFeatContext fcontext, gcontext;
23826   SeqFeatPtr      gene;
23827   GeneRefPtr      grp;
23828   Int2            i;
23829   Int2            index;
23830   Boolean         just_nuc_letters;
23831   Boolean         just_prt_letters;
23832   CharPtr         key;
23833   Int4            left;
23834   size_t          len;
23835   Boolean         multi_compare;
23836   Boolean         no_white_space;
23837   Int2            num_intervals;
23838   SeqFeatPtr      nxt;
23839   Boolean         ok;
23840   Boolean         only_digits;
23841   ProtRefPtr      prp;
23842   SeqFeatPtr      prt;
23843   CharPtr         ptr;
23844   Int2            qual;
23845   Char            range[32];
23846   Int4            right;
23847   ErrSev          sev;
23848   SeqIntPtr       sint;
23849   SeqIdPtr        sip;
23850   SeqLocPtr       slp;
23851   SeqPntPtr       spp;
23852   CharPtr         str;
23853   Uint1           strand;
23854   CharPtr         tmp;
23855   Boolean         twintron;
23856   Int2            val;
23857   ValNodePtr      vnp;
23858   Int4            qvalid;
23859 
23860   if (vsp == NULL || gcp == NULL || sfp == NULL || ifp == NULL)
23861     return;
23862   if (StringCmp (ifp->key, "-") == 0) {
23863     key = StringSave ("misc_feature");
23864   } else {
23865     key = StringSaveNoNull (ifp->key);
23866   }
23867   index = GBFeatKeyNameValid (&key, FALSE);
23868   if (index == -1) {
23869     if (key != NULL) {
23870       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnknownImpFeatKey, "Unknown feature key %s", key);
23871     } else {
23872       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnknownImpFeatKey, "NULL feature key");
23873     }
23874   } else if (StringICmp (key, "virion") == 0 ||
23875              StringICmp (key, "mutation") == 0 ||
23876              StringICmp (key, "allele") == 0 ||
23877              StringICmp (key, "Import") == 0) {
23878     ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_UnknownImpFeatKey, "Feature key %s is no longer legal", key);
23879   } else if (StringICmp (key, "polyA_site") == 0) {
23880     sev = SEV_WARNING;
23881     if (vsp->is_refseq_in_sep) {
23882       sev = SEV_ERROR;
23883     }
23884     if (SeqLocStart (sfp->location) != SeqLocStop (sfp->location)) {
23885       ValidErr (vsp, sev, ERR_SEQ_FEAT_PolyAsiteNotPoint, "PolyA_site should be a single point");
23886     }
23887   } else if (StringICmp (key, "polyA_signal") == 0) {
23888     sev = SEV_WARNING;
23889     if (vsp->is_refseq_in_sep) {
23890       sev = SEV_ERROR;
23891     }
23892     if (SeqLocStart (sfp->location) == SeqLocStop (sfp->location)) {
23893       ValidErr (vsp, sev, ERR_SEQ_FEAT_PolyAsignalNotRange, "PolyA_signal should be a range");
23894     }
23895   } else if (StringICmp (key, "mat_peptide") == 0 ||
23896              StringICmp (key, "sig_peptide") == 0 ||
23897              StringICmp (key, "transit_peptide") == 0) {
23898     sev = SEV_WARNING;
23899     if (vsp->is_refseq_in_sep) {
23900       sev = SEV_ERROR;
23901     }
23902     if (vsp->is_embl_ddbj_in_sep) {
23903       if (SeqMgrGetOverlappingCDS (sfp->location, NULL) == NULL) {
23904         sev = SEV_ERROR;
23905       }
23906       ValidErr (vsp, sev, ERR_SEQ_FEAT_PeptideFeatureLacksCDS, "sig/mat/transit_peptide feature cannot be associated with a protein product of a coding region feature");
23907     } else {
23908       ValidErr (vsp, sev, ERR_SEQ_FEAT_PeptideFeatureLacksCDS, "Peptide processing feature should be converted to the appropriate protein feature subtype");
23909     }
23910     CheckPeptideOnCodonBoundary (vsp, gcp, sfp, key);
23911   } else if (StringICmp (key, "preprotein") == 0 ||
23912              StringICmp (key, "proprotein") == 0) {
23913     //LCOV_EXCL_START
23914     //preprotein and proprotein are unknown feature keys, this code is never reached
23915     sev = SEV_WARNING;
23916     if (vsp->is_refseq_in_sep) {
23917       sev = SEV_ERROR;
23918     }
23919     if (vsp->is_embl_ddbj_in_sep) {
23920       if (SeqMgrGetOverlappingCDS (sfp->location, NULL) == NULL) {
23921         sev = SEV_ERROR;
23922       }
23923       ValidErr (vsp, sev, ERR_SEQ_FEAT_PeptideFeatureLacksCDS, "Pre/pro protein feature cannot be associated with a protein product of a coding region feature");
23924     } else {
23925       ValidErr (vsp, sev, ERR_SEQ_FEAT_PeptideFeatureLacksCDS, "Peptide processing feature should be converted to the appropriate protein feature subtype");
23926     }
23927     //LCOV_EXCL_STOP
23928   } else if (StringICmp (key, "mRNA") == 0 ||
23929              StringICmp (key, "tRNA") == 0 ||
23930              StringICmp (key, "rRNA") == 0 ||
23931              StringICmp (key, "snRNA") == 0 ||
23932              StringICmp (key, "scRNA") == 0 ||
23933              StringICmp (key, "snoRNA") == 0 ||
23934              StringICmp (key, "misc_RNA") == 0 ||
23935              StringICmp (key, "precursor_RNA") == 0) {
23936     //LCOV_EXCL_START
23937     //BasicCleanup converts imp RNA to real RNA
23938     ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidForType,
23939               "RNA feature should be converted to the appropriate RNA feature subtype, location should be converted manually");
23940     //LCOV_EXCL_STOP
23941   } else if (StringICmp (key, "CDS") == 0) {
23942       //LCOV_EXCL_START
23943       // Basic Cleanup converts imp CDS to real CDS
23944     failed = TRUE;              /* impfeat CDS must be pseudo; fail if not */
23945     if (sfp->pseudo) {
23946       failed = FALSE;
23947     } else {
23948       grp = SeqMgrGetGeneXref (sfp);
23949       if (grp != NULL && grp->pseudo) {
23950         failed = FALSE;
23951       } else {
23952         gene = SeqMgrGetOverlappingGene (sfp->location, &gcontext);
23953         if (gene != NULL) {
23954           if (gene->pseudo) {
23955             failed = FALSE;
23956           } else {
23957             grp = (GeneRefPtr) gene->data.value.ptrvalue;
23958             if (grp != NULL && grp->pseudo) {
23959               failed = FALSE;
23960             }
23961           }
23962         }
23963       }
23964     }
23965     for (gbqual = sfp->qual; gbqual != NULL; gbqual = gbqual->next) {
23966       if (StringCmp (gbqual->qual, "translation") == 0) {
23967         ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_ImpCDShasTranslation, "ImpFeat CDS with /translation found");
23968       }
23969     }
23970     if (failed) {
23971       ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_ImpCDSnotPseudo, "ImpFeat CDS should be pseudo");
23972     }
23973     //LCOV_EXCL_STOP
23974   } else if (StringICmp (key, "misc_feature") == 0) {
23975     for (gbqual = sfp->qual; gbqual != NULL; gbqual = gbqual->next) {
23976       if (StringCmp (gbqual->qual, "standard_name") == 0) {
23977         if (StringCmp (gbqual->val, "Vector Contamination") == 0) {
23978           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_VectorContamination, "Vector Contamination region should be trimmed from sequence");
23979         }
23980       }
23981     }
23982     if (StringHasNoText(sfp->comment) && sfp->qual == NULL && sfp->dbxref == NULL) {
23983       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_NeedsNote, "A note or other qualifier is required for a misc_feature");
23984     }
23985   } else if (StringICmp (key, "intron") == 0) {
23986     num_intervals = 0;
23987     for (slp = SeqLocFindNext (sfp->location, NULL); slp != NULL; slp = SeqLocFindNext (sfp->location, slp)) {
23988       num_intervals++;
23989     }
23990     if (num_intervals > 1) {
23991       sev = SEV_ERROR;
23992       if (vsp->is_embl_ddbj_in_sep) {
23993         sev = SEV_WARNING;
23994       }
23995       twintron = FALSE;
23996       bsp = BioseqFindFromSeqLoc (sfp->location);
23997       if (SeqMgrGetDesiredFeature (0, bsp, 0, 0, sfp, &gcontext) == sfp && gcontext.numivals == 2 && gcontext.ivals != NULL) {
23998         left = gcontext.ivals [1];
23999         right = gcontext.ivals [2];
24000         strand = gcontext.strand;
24001         nxt = SeqMgrGetNextFeature (bsp, sfp, 0, FEATDEF_intron, &gcontext);
24002         if (nxt != NULL) {
24003           if (strand == gcontext.strand) {
24004             if (left + 1 == gcontext.left && right - 1 == gcontext.right) {
24005               twintron = TRUE;
24006             }
24007           }
24008         }
24009       }
24010       if (twintron) {
24011         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_MultiIntervalIntron, "Multi-interval intron contains possible twintron");
24012       } else {
24013         ValidErr (vsp, sev, ERR_SEQ_FEAT_MultiIntervalIntron, "An intron should not have multiple intervals");
24014       }
24015     }
24016   } else if (StringICmp (key, "repeat_region") == 0) {
24017     if (StringHasNoText(sfp->comment) && sfp->qual == NULL && sfp->dbxref == NULL) {
24018       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_NeedsNote, "repeat_region has no qualifiers");
24019     }
24020   } else if (StringICmp (key, "regulatory") == 0) {
24021     for (gbqual = sfp->qual; gbqual != NULL; gbqual = gbqual->next) {
24022       if (StringICmp (gbqual->qual, "regulatory_class") != 0) continue;
24023       if (StringHasNoText (gbqual->val)) continue;
24024       if (StringICmp (gbqual->val, "other") == 0 && StringHasNoText (sfp->comment)) {
24025         ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "The regulatory_class 'other' is missing the required /note");
24026       }
24027       if (IsStringInRegulatoryClassList (gbqual->val)) continue;
24028       if (StringICmp (gbqual->val, "other") == 0) {
24029         ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "The regulatory_class value should not be '%s'", gbqual->val);
24030       } else {
24031         /*
24032         ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_InvalidQualifierValue, "Other regulatory_class value '%s'", gbqual->val);
24033         */
24034       }
24035     }
24036   } else if (StringICmp (key, "misc_recomb") == 0) {
24037     for (gbqual = sfp->qual; gbqual != NULL; gbqual = gbqual->next) {
24038       if (StringICmp (gbqual->qual, "recombination_class") != 0) continue;
24039       if (StringHasNoText (gbqual->val)) continue;
24040       if (StringICmp (gbqual->val, "other") == 0 && StringHasNoText (sfp->comment)) {
24041         ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "The recombination_class 'other' is missing the required /note");
24042       }
24043       if (IsStringInRecombinationClassList (gbqual->val)) continue;
24044       if (StringICmp (gbqual->val, "other") == 0) {
24045         ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "The recombination_class value should not be '%s'", gbqual->val);
24046       } else {
24047         /*
24048         ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_InvalidQualifierValue, "Other recombination_class value '%s'", gbqual->val);
24049         */
24050       }
24051     }
24052   } else if (StringICmp (key, "assembly_gap") == 0) {
24053     bsp = BioseqFindFromSeqLoc (sfp->location);
24054     if (! IsDeltaSeqWithFarpointers (bsp)) {
24055       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_GapFeatureProblem, "An assembly_gap feature should only be on a contig record");
24056     }
24057     AssemblyGapFeatValidate (vsp, gcp, sfp, bsp);
24058   }
24059   for (gbqual = sfp->qual; gbqual != NULL; gbqual = gbqual->next) {
24060     qvalid = IsQualValidForFeature (gbqual, sfp);
24061     if (qvalid == 0) {
24062       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_WrongQualOnImpFeat, "Wrong qualifier %s for feature %s", gbqual->qual, key);
24063     }
24064 
24065     if (StringCmp (gbqual->qual, "gsdb_id") == 0) {
24066       continue;
24067     }
24068     val = GBQualNameValid (gbqual->qual);
24069     if (val == -1) {
24070       if (gbqual->qual != NULL) {
24071         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnknownImpFeatQual, "Unknown qualifier %s", gbqual->qual);
24072       } else {
24073         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnknownImpFeatQual, "NULL qualifier");
24074       }
24075     } else if (index != -1) {
24076       if (gbqual->val != NULL) {
24077         if (val == GBQUAL_rpt_type) {
24078           failed = FALSE;
24079           tmp = StringSave (gbqual->val);
24080           str = tmp;
24081           if (*str == '(') {
24082             str++;
24083           }
24084           while (!StringHasNoText (str)) {
24085             ptr = StringChr (str, ',');
24086             if (ptr == NULL) {
24087               ptr = StringChr (str, ')');
24088             }
24089             if (ptr != NULL) {
24090               *ptr = '\0';
24091               ptr++;
24092             }
24093             found = FALSE;
24094             for (i = 0; legal_repeat_types[i] != NULL; i++) {
24095               if (StringICmp (str, legal_repeat_types[i]) == 0) {
24096                 found = TRUE;
24097                 break;
24098               }
24099             }
24100             if (!found) {
24101               failed = TRUE;
24102             }
24103             str = ptr;
24104           }
24105           MemFree (tmp);
24106           if (failed) {
24107             ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "%s is not a legal value for qualifier %s", gbqual->val, gbqual->qual);
24108           }
24109         } else if (val == GBQUAL_rpt_unit || val == GBQUAL_rpt_unit_range || val == GBQUAL_rpt_unit_seq) {
24110           ValidateRptUnit (vsp, gcp, sfp, gbqual, val, key);
24111         } else if (val == GBQUAL_pseudogene) {
24112           ValidatePseudogene (vsp, gcp, sfp, gbqual);
24113         } else if (val == GBQUAL_label) {
24114           //LCOV_EXCL_START
24115           //BasicCleanup removes label qualifier, puts contents in note
24116           no_white_space = TRUE;
24117           only_digits = TRUE;
24118           for (ptr = gbqual->val, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
24119             if (IS_WHITESP (ch)) {
24120               no_white_space = FALSE;
24121             }
24122             if (! IS_DIGIT (ch)) {
24123               only_digits = FALSE;
24124             }
24125           }
24126           if (only_digits || (! no_white_space)) {
24127             ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Illegal value for qualifier %s", gbqual->qual);
24128           }
24129           //LCOV_EXCL_STOP
24130         } else if (val == GBQUAL_replace) {
24131           bsp = BioseqFindFromSeqLoc (sfp->location);
24132           if (bsp != NULL) {
24133             if (ISA_na (bsp->mol)) {
24134               if (StringICmp (key, "variation") == 0) {
24135                 just_nuc_letters = TRUE;
24136                 for (ptr = gbqual->val, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
24137                   if (StringChr ("acgt", ch) == NULL) {
24138                     just_nuc_letters = FALSE;
24139                   }
24140                 }
24141                 if (!just_nuc_letters) {
24142                   ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue,
24143                             "%s is not a legal value for qualifier %s - should only be composed of acgt unambiguous nucleotide bases",
24144                             gbqual->val, gbqual->qual);
24145                 }
24146               } else {
24147                 just_nuc_letters = TRUE;
24148                 for (ptr = gbqual->val, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
24149                   if (StringChr ("acgtmrwsykvhdbn", ch) == NULL) {
24150                     just_nuc_letters = FALSE;
24151                   }
24152                 }
24153                 if (!just_nuc_letters) {
24154                   ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue,
24155                             "%s is not a legal value for qualifier %s - should only be composed of acgtmrwsykvhdbn nucleotide bases",
24156                             gbqual->val, gbqual->qual);
24157                 }
24158               }
24159             } else if (ISA_aa (bsp->mol)) {
24160               just_prt_letters = TRUE;
24161               for (ptr = gbqual->val, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
24162                 if (StringChr ("acdefghiklmnpqrstuvwy*", ch) == NULL) {
24163                   just_prt_letters = FALSE;
24164                 }
24165               }
24166               if (!just_prt_letters) {
24167                 ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue,
24168                           "%s is not a legal value for qualifier %s - should only be composed of acdefghiklmnpqrstuvwy* amino acids",
24169                           gbqual->val, gbqual->qual);
24170               }
24171             }
24172             slp = sfp->location;
24173             fuzz = NULL;
24174             if (slp != NULL && slp->choice == SEQLOC_PNT) {
24175               spp = (SeqPntPtr) slp->data.ptrvalue;
24176               if (spp != NULL) {
24177                 fuzz = spp->fuzz;
24178               }
24179             }
24180             if (slp != NULL && StringLen (gbqual->val) == SeqLocLen (slp) && fuzz == NULL) {
24181               tmp = GetSequenceByFeature (sfp);
24182               if (tmp != NULL) {
24183                 if (StringICmp (tmp, gbqual->val) == 0) {
24184                   ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_SuspiciousQualifierValue, "/replace already matches underlying sequence (%s)", gbqual->val);
24185                 }
24186                 MemFree (tmp);
24187               }
24188             }
24189           }
24190         } else if (val == GBQUAL_cons_splice) {
24191           found = FALSE;
24192           for (i = 0; legal_cons_splice_strings[i] != NULL; i++) {
24193             if (StringICmp (gbqual->val, legal_cons_splice_strings[i]) == 0) {
24194               found = TRUE;
24195               break;
24196             }
24197           }
24198           if (!found) {
24199             ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "%s is not a legal value for qualifier %s", gbqual->val, gbqual->qual);
24200           }
24201         } else if (val == GBQUAL_mod_base) {
24202           found = FALSE;
24203           for (i = 0; legal_modified_bases[i] != NULL; i++) {
24204             if (StringICmp (gbqual->val, legal_modified_bases[i]) == 0) {
24205               found = TRUE;
24206               break;
24207             }
24208           }
24209           if (!found) {
24210             ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "%s is not a legal value for qualifier %s", gbqual->val, gbqual->qual);
24211           }
24212         } else if (val == GBQUAL_mobile_element_type) {
24213           found = FALSE;
24214           str = NULL;
24215           for (i = 0; legal_mobile_element_strings[i] != NULL; i++) {
24216             ptr = legal_mobile_element_strings[i];
24217             len = StringLen (ptr);
24218             if (StringNICmp (gbqual->val, ptr, len) == 0) {
24219               found = TRUE;
24220               str = gbqual->val + len;
24221               break;
24222             }
24223           }
24224           if (found) {
24225             if (StringDoesHaveText (str) && (str [0] != ':' || str [1] == '\0')) {
24226               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidQualifierValue, "%s is not a legal value for qualifier %s", gbqual->val, gbqual->qual);
24227             } else if (StringNICmp (gbqual->val, "other", 5) == 0) {
24228               if (str [0] != ':' || str [1] == '\0') {
24229                 ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidQualifierValue, "%s is not a legal value for qualifier %s", gbqual->val, gbqual->qual);
24230               }
24231             }
24232           }
24233           if (!found) {
24234             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidQualifierValue, "%s is not a legal value for qualifier %s", gbqual->val, gbqual->qual);
24235           }
24236         } else if (val == GBQUAL_frequency) {
24237           if (StringCmp (gbqual->val, "1") == 0 || StringCmp (gbqual->val, "1.0") == 0 || StringCmp (gbqual->val, "1.00") == 0) {
24238             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidQualifierValue, "%s is a suspicious value for qualifier %s", gbqual->val, gbqual->qual);
24239           }
24240         } else if (val == GBQUAL_compare) {
24241           multi_compare = FALSE;
24242           ptr = gbqual->val;
24243           ch = *ptr;
24244           if (ch == '(') {
24245             multi_compare = TRUE;
24246           }
24247           if (! multi_compare) {
24248             adv = ValidateAccnDotVer (gbqual->val);
24249             if (adv == -5) {
24250               ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "%s accession missing version for qualifier %s", gbqual->val, gbqual->qual);
24251             } else if (adv == -6) {
24252               ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "%s accession has bad version for qualifier %s", gbqual->val, gbqual->qual);
24253             } else if (adv != 0) {
24254               ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "%s is not a legal accession for qualifier %s", gbqual->val, gbqual->qual);
24255             } else if (StringChr (gbqual->val, '_') != NULL) {
24256               if (vsp->is_insd_in_sep) {
24257                 ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "RefSeq accession %s cannot be used for qualifier %s", gbqual->val, gbqual->qual);
24258               }
24259             }
24260           }
24261         }
24262       }
24263     }
24264   }
24265   if (index != -1 && ParFlat_GBFeat[index].mand_num > 0) {
24266     for (i = 0; i < ParFlat_GBFeat[index].mand_num; i++) {
24267       found = FALSE;
24268       qual = ParFlat_GBFeat[index].mand_qual[i];
24269       for (gbqual = sfp->qual; gbqual != NULL; gbqual = gbqual->next) {
24270         val = GBQualNameValid (gbqual->qual);
24271         if (qual == val) {
24272           found = TRUE;
24273           break;
24274         }
24275       }
24276       if (!found) {
24277         if (qual == GBQUAL_citation) {
24278           if (sfp->cit != NULL) {
24279             found = TRUE;
24280           } else if (! StringHasNoText (sfp->comment)) {
24281             /* RefSeq allows conflict with accession in comment instead of sfp->cit */
24282             if (StringICmp (key, "conflict") == 0) {
24283               bsp = BioseqFindFromSeqLoc (sfp->location);
24284               if (bsp != NULL) {
24285                 for (sip = bsp->id; sip != NULL; sip = sip->next) {
24286                   if (sip->choice == SEQID_OTHER) {
24287                     found = TRUE;
24288                   }
24289                 }
24290               }
24291             }
24292           }
24293         }
24294       }
24295       if (!found) {
24296         if (StringICmp (key, "conflict") == 0 || StringICmp (key, "old_sequence") == 0) {
24297           /* compare qualifier can now substitute for citation qualifier for conflict and old_sequence */
24298           for (gbqual = sfp->qual; gbqual != NULL; gbqual = gbqual->next) {
24299             if (StringICmp (gbqual->qual, "compare") == 0 && StringDoesHaveText (gbqual->val)) {
24300               found = TRUE;
24301             }
24302           }
24303         }
24304       }
24305       if (!found) {
24306         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_MissingQualOnImpFeat, "Missing qualifier %s for feature %s", ParFlat_GBQual_names[qual].name, key);
24307       }
24308     }
24309   }
24310   if (!StringHasNoText (ifp->loc)) {
24311     slp = sfp->location;
24312     if (StringStr (ifp->loc, "one-of") != NULL) {
24313       ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_ImpFeatBadLoc, "ImpFeat loc %s has obsolete 'one-of' text for feature %s", ifp->loc, key);
24314     } else if (slp != NULL && slp->choice == SEQLOC_INT) {
24315       sint = (SeqIntPtr) slp->data.ptrvalue;
24316       if (sint != NULL && sint->strand != Seq_strand_minus) {
24317         sprintf (range, "%ld..%ld", (long) (sint->from + 1), (long) (sint->to + 1));
24318         if (StringCmp (ifp->loc, range) != 0) {
24319           ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_ImpFeatBadLoc, "ImpFeat loc %s does not equal feature location %s for feature %s", ifp->loc, range, key);
24320         }
24321       }
24322     }
24323   }
24324 
24325   if (StringICmp (key, "misc_feature") == 0) {
24326     tmp = StringStr (sfp->comment, "cspA");
24327     if (tmp != NULL) {
24328       ok = FALSE;
24329       if (tmp == sfp->comment) {
24330         ch = tmp[4];
24331         if (ch == '\0' || IS_WHITESP (ch)) {
24332           ok = TRUE;
24333         }
24334       } else {
24335         ptr = tmp-1;
24336         ch = *ptr;
24337         if (IS_WHITESP (ch)) {
24338           ch = tmp[4];
24339           if (ch == '\0' || IS_WHITESP (ch)) {
24340             ok = TRUE;
24341           }
24342         }
24343       }
24344       if (ok) {
24345         cds = SeqMgrGetOverlappingFeature (sfp->location, FEATDEF_CDS, NULL, 0, NULL, SIMPLE_OVERLAP, &fcontext);
24346         if (cds != NULL) {
24347           bsp = BioseqFindFromSeqLoc (cds->product);
24348           if (bsp != NULL) {
24349             prt = SeqMgrGetBestProteinFeature (bsp, NULL);
24350             if (prt != NULL) {
24351               prp = (ProtRefPtr) prt->data.value.ptrvalue;
24352               if (prp != NULL) {
24353                 vnp = prp->name;
24354                 if (vnp != NULL) {
24355                   if (StringICmp ((CharPtr) vnp->data.ptrvalue, "cold-shock protein") == 0) {
24356                     ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_ColdShockProteinProblem, "cspA misc_feature overlapped by cold-shock protein CDS");
24357                   }
24358                 }
24359               }
24360             }
24361           }
24362         }
24363       }
24364     }
24365   }
24366 
24367   MemFree (key);
24368 }
24369 
/*
 * ValidateNonImpFeat
 *
 * Checks the GenBank-style qualifiers (sfp->qual) attached to a feature.
 * Per the coverage notes kept below, "conflict" and "old_sequence" import
 * features are not expected to reach this routine.
 *
 * The work is done in three passes:
 *   1. Every qualifier is checked for being legal on this feature, for
 *      having a known name, and (for a handful of qualifiers) for having
 *      a legal value.
 *   2. Every qualifier that ParFlat_GBFeat lists as mandatory for the
 *      feature key must be present, or be satisfied by an accepted
 *      substitute (citation set, compare qualifier, RNA-gen class).
 *   3. Peptide processing features located on a nucleotide Bioseq are
 *      reported.
 *
 * Parameters:
 *   vsp - validator state; passed to ValidErr and consulted for the
 *         is_insd_in_sep / is_refseq_in_sep flags
 *   gcp - gather context; only forwarded to helper routines
 *   sfp - feature whose qualifiers are examined
 *
 * Nothing is returned; all findings are reported through ValidErr.
 * The routine does nothing if any argument is NULL.
 */
static void ValidateNonImpFeat (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPtr sfp)
{
  Int2       adv;
  BioseqPtr  bsp;
  Char       ch;
  Boolean    failed;
  Boolean    found;
  GBQualPtr  gbqual;
  Boolean    has_non_digit;
  Boolean    has_white_space;
  Int2       i;
  Int2       index;
  Boolean    is_peptide;
  Boolean    is_precursor;
  CharPtr    key;
  CharPtr    ptr;
  Int2       qual;
  Int4       qvalid;
  RNAGenPtr  rgp;
  RnaRefPtr  rrp;
  ErrSev     sev;
  SeqIdPtr   sip;
  CharPtr    str;
  CharPtr    tmp;
  Boolean    tolerated;
  Int2       val;

  if (vsp == NULL || gcp == NULL || sfp == NULL) return;

  /* private, writable copy of the feature label, adjusted to the spelling used for key lookup */
  key = StringSaveNoNull (FeatDefTypeLabel (sfp));
  if (StringCmp (key, "Gene") == 0) {
    *key = 'g';
  } else if (StringCmp (key, "preRNA") == 0) {
    key = MemFree (key);
    key = StringSave ("precursor_RNA");
  }
  index = GBFeatKeyNameValid (&key, FALSE);

  /* pass 1 - examine each qualifier on its own */
  for (gbqual = sfp->qual; gbqual != NULL; gbqual = gbqual->next) {

    qvalid = IsQualValidForFeature (gbqual, sfp);
    if (qvalid == 0) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_WrongQualOnFeature, "Wrong qualifier %s for feature %s", gbqual->qual, key);
    } else if (qvalid > 1) {
      /* the reason text is data, not a format: print it through "%s" so a '%' in it cannot be misparsed */
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_WrongQualOnFeature, "%s", sWrongQualReasons[qvalid - 2]);
    } else if (sfp->data.choice == SEQFEAT_GENE && StringCmp (gbqual->qual, "product") == 0) {
      ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_SuspiciousQualifierValue, "A product qualifier is not normally used on a gene feature");
    }

    if (StringCmp (gbqual->qual, "gsdb_id") == 0) continue;

    val = GBQualNameValid (gbqual->qual);

    if (val == -1) {
      /* name is not in the qualifier table */
      if (gbqual->qual == NULL) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnknownFeatureQual, "NULL qualifier");
        continue;
      }
      /* a few unlisted names are silently accepted, depending on feature type */
      tolerated = FALSE;
      switch (sfp->data.choice) {
        case SEQFEAT_GENE :
          if (StringCmp (gbqual->qual, "gen_map") == 0 ||
              StringCmp (gbqual->qual, "cyt_map") == 0 ||
              StringCmp (gbqual->qual, "rad_map") == 0) {
            tolerated = TRUE;
          }
          break;
        case SEQFEAT_CDREGION :
          if (StringCmp (gbqual->qual, "orig_transcript_id") == 0) {
            tolerated = TRUE;
          }
          break;
        case SEQFEAT_RNA :
          if (StringCmp (gbqual->qual, "orig_protein_id") == 0 ||
              StringCmp (gbqual->qual, "orig_transcript_id") == 0) {
            tolerated = TRUE;
          }
          break;
        default :
          break;
      }
      if (! tolerated) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnknownFeatureQual, "Unknown qualifier %s", gbqual->qual);
      }
      continue;
    }

    /* value checks only apply to a recognized feature key and a non-NULL value */
    if (index == -1 || gbqual->val == NULL) continue;

    if (val == GBQUAL_rpt_type) {

      /* value is one repeat type, or a parenthesized comma-separated list; split a scratch copy in place */
      failed = FALSE;
      tmp = StringSave (gbqual->val);
      str = tmp;
      if (*str == '(') {
        str++;
      }
      while (! StringHasNoText (str)) {
        ptr = StringChr (str, ',');
        if (ptr == NULL) {
          ptr = StringChr (str, ')');
        }
        if (ptr != NULL) {
          *ptr = '\0';
          ptr++;
        }
        for (i = 0; legal_repeat_types[i] != NULL; i++) {
          if (StringICmp (str, legal_repeat_types[i]) == 0) break;
        }
        if (legal_repeat_types[i] == NULL) {
          /* ran off the end of the table: this item is not a legal repeat type */
          failed = TRUE;
        }
        str = ptr;
      }
      MemFree (tmp);
      if (failed) {
        ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "%s is not a legal value for qualifier %s", gbqual->val, gbqual->qual);
      }

    } else if (val == GBQUAL_rpt_unit || val == GBQUAL_rpt_unit_range || val == GBQUAL_rpt_unit_seq) {

      ValidateRptUnit (vsp, gcp, sfp, gbqual, val, key);

    } else if (val == GBQUAL_pseudogene) {

      ValidatePseudogene (vsp, gcp, sfp, gbqual);

    } else if (val == GBQUAL_label) {

      /* a label is rejected if it contains white space or has no non-digit character (an empty value included) */
      has_white_space = FALSE;
      has_non_digit = FALSE;
      for (ptr = gbqual->val; *ptr != '\0'; ptr++) {
        ch = *ptr;
        if (IS_WHITESP (ch)) {
          has_white_space = TRUE;
        }
        if (! IS_DIGIT (ch)) {
          has_non_digit = TRUE;
        }
      }
      if (has_white_space || (! has_non_digit)) {
        ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Illegal value for qualifier %s", gbqual->qual);
      }

    } else if (val == GBQUAL_cons_splice) {

      for (i = 0; legal_cons_splice_strings[i] != NULL; i++) {
        if (StringICmp (gbqual->val, legal_cons_splice_strings[i]) == 0) break;
      }
      if (legal_cons_splice_strings[i] == NULL) {
        ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "%s is not a legal value for qualifier %s", gbqual->val, gbqual->qual);
      }

    } else if (val == GBQUAL_mod_base) {

      for (i = 0; legal_modified_bases[i] != NULL; i++) {
        if (StringICmp (gbqual->val, legal_modified_bases[i]) == 0) break;
      }
      if (legal_modified_bases[i] == NULL) {
        ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "%s is not a legal value for qualifier %s", gbqual->val, gbqual->qual);
      }

    } else if (val == GBQUAL_compare) {

      /* values starting with '(' (multiple accessions) are not checked here */
      if (*(gbqual->val) != '(') {
        adv = ValidateAccnDotVer (gbqual->val);
        if (adv == -5) {
          ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "%s accession missing version for qualifier %s", gbqual->val, gbqual->qual);
        } else if (adv == -6) {
          ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "%s accession has bad version for qualifier %s", gbqual->val, gbqual->qual);
        } else if (adv != 0) {
          ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "%s is not a legal accession for qualifier %s", gbqual->val, gbqual->qual);
        } else if (StringChr (gbqual->val, '_') != NULL && vsp->is_insd_in_sep) {
          /* an underscore in an otherwise valid accession is reported as RefSeq */
          ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "RefSeq accession %s cannot be used for qualifier %s", gbqual->val, gbqual->qual);
        }
      }
    }
  }

  /* pass 2 - mandatory qualifiers for this feature key */
  if (index != -1 && ParFlat_GBFeat[index].mand_num > 0) {
    for (i = 0; i < ParFlat_GBFeat[index].mand_num; i++) {
      sev = SEV_WARNING;
      found = FALSE;
      qual = ParFlat_GBFeat[index].mand_qual[i];

      for (gbqual = sfp->qual; gbqual != NULL; gbqual = gbqual->next) {
        val = GBQualNameValid (gbqual->qual);
        if (qual == val) {
          found = TRUE;
          break;
        }
      }

      if (! found && qual == GBQUAL_citation) {
        //LCOV_EXCL_START
        //citation is only mandatory for old_sequence and conflict, which are
        // import features, which are not handled here
        if (sfp->cit != NULL) {
          found = TRUE;
        } else if (! StringHasNoText (sfp->comment) && StringICmp (key, "conflict") == 0) {
          /* RefSeq allows conflict with accession in comment instead of sfp->cit */
          bsp = BioseqFindFromSeqLoc (sfp->location);
          if (bsp != NULL) {
            for (sip = bsp->id; sip != NULL; sip = sip->next) {
              if (sip->choice == SEQID_OTHER) {
                found = TRUE;
              }
            }
          }
        }
        //LCOV_EXCL_STOP
      }

      if (! found && (StringICmp (key, "conflict") == 0 || StringICmp (key, "old_sequence") == 0)) {
        //LCOV_EXCL_START
        //conflict and old_sequence are import features not handled by this function
        /* compare qualifier can now substitute for citation qualifier for conflict and old_sequence */
        for (gbqual = sfp->qual; gbqual != NULL; gbqual = gbqual->next) {
          if (StringICmp (gbqual->qual, "compare") == 0 && StringDoesHaveText (gbqual->val)) {
            found = TRUE;
          }
        }
        //LCOV_EXCL_STOP
      }

      if (! found && qual == GBQUAL_ncRNA_class) {
        /* a missing ncRNA_class is an error, unless the RNA-gen extension (ext.choice 3) carries a class */
        sev = SEV_ERROR;
        if (sfp->data.choice == SEQFEAT_RNA) {
          rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
          if (rrp != NULL && rrp->ext.choice == 3) {
            rgp = (RNAGenPtr) rrp->ext.value.ptrvalue;
            if (rgp != NULL && StringDoesHaveText (rgp->_class)) {
              found = TRUE;
            }
          }
        }
      }

      if (! found) {
        ValidErr (vsp, sev, ERR_SEQ_FEAT_MissingQualOnFeature,
                  "Missing qualifier %s for feature %s", ParFlat_GBQual_names[qual].name, key);
      }
    }
  }

  /* pass 3 - peptide processing features sitting on a nucleotide Bioseq */
  is_peptide = (Boolean) (StringICmp (key, "mat_peptide") == 0 ||
                          StringICmp (key, "sig_peptide") == 0 ||
                          StringICmp (key, "transit_peptide") == 0);
  is_precursor = (Boolean) (! is_peptide &&
                            (StringICmp (key, "preprotein") == 0 ||
                             StringICmp (key, "proprotein") == 0));
  if (is_peptide || is_precursor) {
    bsp = BioseqFindFromSeqLoc (sfp->location);
    if (bsp != NULL && ISA_na (bsp->mol)) {
      sev = SEV_WARNING;
      if (vsp->is_refseq_in_sep) {
        sev = SEV_ERROR;
      }
      ValidErr (vsp, sev, ERR_SEQ_FEAT_InvalidForType, "Peptide processing feature should be remapped to the appropriate protein bioseq");
      if (is_peptide) {
        /* the codon boundary check is only run for mat/sig/transit peptides */
        CheckPeptideOnCodonBoundary (vsp, gcp, sfp, key);
      }
    }
  }

  MemFree (key);
}
24637 
24638 /* PartialAtSpliceSiteOrGap uses code taken from SpliceCheckEx */
PartialAtSpliceSiteOrGap(ValidStructPtr vsp,SeqLocPtr head,Uint2 slpTag,BoolPtr isgapP,BoolPtr badseqP)24639 static Boolean PartialAtSpliceSiteOrGap (ValidStructPtr vsp, SeqLocPtr head, Uint2 slpTag, BoolPtr isgapP, BoolPtr badseqP)
24640 {
24641   BioseqPtr       bsp;
24642   Int2            residue1, residue2;
24643   Boolean         rsult = FALSE;
24644   SeqIdPtr        sip;
24645   SeqLocPtr       slp = NULL, first = NULL, last = NULL;
24646   /*
24647   SeqPortPtr      spp = NULL;
24648   */
24649   Uint1           strand;
24650   Int4            strt, stp, donor, acceptor, len;
24651   StreamCache     sc;
24652   SeqInt          sint;
24653   ValNode         vn;
24654 
24655   if (isgapP != NULL) {
24656     *isgapP = FALSE;
24657   }
24658   if (badseqP != NULL) {
24659     *badseqP = FALSE;
24660   }
24661   if (slpTag != SLP_NOSTART && slpTag != SLP_NOSTOP)
24662     return FALSE;
24663   while ((slp = SeqLocFindPart (head, slp, EQUIV_IS_ONE)) != NULL) {
24664     if (first == NULL) {
24665       first = slp;
24666     }
24667     last = slp;
24668   }
24669   if (first == NULL)
24670     return FALSE;
24671 
24672   strand = SeqLocStrand (first);
24673   if (SeqLocStrand (last) != strand)
24674     return FALSE;
24675 
24676   if (slpTag == SLP_NOSTART) {
24677     slp = first;
24678   } else {
24679     slp = last;
24680   }
24681   sip = SeqLocId (slp);
24682   if (sip == NULL)
24683     return FALSE;
24684 
24685   bsp = NULL;
24686   if (sip != NULL && (sip->choice != SEQID_GI || sip->data.intvalue > 0)) {
24687     bsp = BioseqLockById (sip);
24688   }
24689   if (bsp == NULL)
24690     return FALSE;
24691   len = bsp->length;
24692 
24693   acceptor = SeqLocStart (slp);
24694   donor = SeqLocStop (slp);
24695 
24696   if (acceptor < 0 || acceptor >= len || donor < 0 || donor >= len) {
24697     /*
24698     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_Range,
24699               "Unable to check splice consensus because feature outside range of sequence");
24700     */
24701     BioseqUnlock (bsp);
24702     return FALSE;
24703   }
24704 
24705   if (strand != Seq_strand_minus) {
24706     if (! StreamCacheSetup (bsp, NULL, EXPAND_GAPS_TO_DASHES, &sc)) {
24707       BioseqUnlock (bsp);
24708       return FALSE;
24709     }
24710   } else {
24711     sint.from = 0;
24712     sint.to = len - 1;
24713     sint.strand = strand;
24714     sint.id = sip;
24715     vn.choice = SEQLOC_INT;
24716     vn.data.ptrvalue = (Pointer) &sint;
24717     vn.next = NULL;
24718     if (! StreamCacheSetup (NULL, &vn, EXPAND_GAPS_TO_DASHES, &sc)) {
24719       BioseqUnlock (bsp);
24720       return FALSE;
24721     }
24722   }
24723   /* spp = SeqPortNew (bsp, 0, -1, strand, Seq_code_ncbi4na); */
24724   BioseqUnlock (bsp);
24725   /*
24726   if (spp == NULL)
24727     return FALSE;
24728   */
24729 
24730   if (strand != Seq_strand_minus) {
24731     strt = acceptor;
24732     stp = donor;
24733   } else {
24734     strt = donor;
24735     donor = acceptor;
24736     acceptor = strt;
24737     stp = len - donor - 1;
24738     strt = len - acceptor - 1;
24739   }
24740 
24741   if (slpTag == SLP_NOSTOP && stp < len - 2) {
24742     StreamCacheSetPosition (&sc, stp + 1);
24743     residue1 = StreamCacheGetResidue (&sc);
24744     residue2 = StreamCacheGetResidue (&sc);
24745     /*
24746     SeqPortSeek (spp, (stp + 1), SEEK_SET);
24747     residue1 = SeqPortGetResidue (spp);
24748     residue2 = SeqPortGetResidue (spp);
24749     */
24750     if (residue1 == '-' || residue2 == '-') {
24751       if (isgapP != NULL) {
24752         *isgapP = TRUE;
24753       }
24754       rsult = TRUE;
24755     } else if (IS_residue (residue1) && IS_residue (residue2) && IS_ALPHA ((Char) residue1) && IS_ALPHA ((Char) residue2)) {
24756       if (ConsistentWithG ((Char)residue1) && ConsistentWithT ((Char)residue2)) {
24757         rsult = TRUE;
24758       } else if ((residue1 == 'G') && (residue2 == 'C')) {
24759         rsult = TRUE;
24760       }
24761     } else if (badseqP != NULL) {
24762       *badseqP = TRUE;
24763     }
24764   } else if (slpTag == SLP_NOSTART && strt > 1) {
24765     StreamCacheSetPosition (&sc, strt - 2);
24766     residue1 = StreamCacheGetResidue (&sc);
24767     residue2 = StreamCacheGetResidue (&sc);
24768     /*
24769     SeqPortSeek (spp, (strt - 2), SEEK_SET);
24770     residue1 = SeqPortGetResidue (spp);
24771     residue2 = SeqPortGetResidue (spp);
24772     */
24773     if (residue1 == '-' || residue2 == '-') {
24774       if (isgapP != NULL) {
24775         *isgapP = TRUE;
24776       }
24777       rsult = TRUE;
24778     } else if (IS_residue (residue1) && IS_residue (residue2) && IS_ALPHA ((Char) residue1) && IS_ALPHA ((Char) residue2)) {
24779       if (ConsistentWithA ((Char)residue1) && ConsistentWithG ((Char)residue2)) {
24780         rsult = TRUE;
24781       }
24782     } else if (badseqP != NULL) {
24783       *badseqP = TRUE;
24784     }
24785   }
24786 
24787   /* spp = SeqPortFree (spp); */
24788   return rsult;
24789 }
24790 
PartialAtGapOrNs(ValidStructPtr vsp,SeqLocPtr head,Uint2 slpTag)24791 static Boolean PartialAtGapOrNs (ValidStructPtr vsp, SeqLocPtr head, Uint2 slpTag)
24792 
24793 {
24794   BioseqPtr       bsp;
24795   Int2            residue;
24796   Boolean         rsult = FALSE;
24797   SeqIdPtr        sip;
24798   SeqLocPtr       slp = NULL, first = NULL, last = NULL;
24799   Uint1           strand;
24800   Int4            strt, stp, donor, acceptor, len;
24801   StreamCache     sc;
24802   SeqInt          sint;
24803   ValNode         vn;
24804 
24805   if (slpTag != SLP_NOSTART && slpTag != SLP_NOSTOP) return FALSE;
24806 
24807   while ((slp = SeqLocFindPart (head, slp, EQUIV_IS_ONE)) != NULL) {
24808     if (first == NULL) {
24809       first = slp;
24810     }
24811     last = slp;
24812   }
24813   if (first == NULL) return FALSE;
24814 
24815   strand = SeqLocStrand (first);
24816   if (SeqLocStrand (last) != strand) return FALSE;
24817 
24818   if (slpTag == SLP_NOSTART) {
24819     slp = first;
24820   } else {
24821     slp = last;
24822   }
24823   sip = SeqLocId (slp);
24824   if (sip == NULL) return FALSE;
24825 
24826   bsp = NULL;
24827   if (sip != NULL && (sip->choice != SEQID_GI || sip->data.intvalue > 0)) {
24828     bsp = BioseqLockById (sip);
24829   }
24830   if (bsp == NULL) return FALSE;
24831   len = bsp->length;
24832 
24833   acceptor = SeqLocStart (slp);
24834   donor = SeqLocStop (slp);
24835 
24836   if (acceptor < 0 || acceptor >= len || donor < 0 || donor >= len) {
24837     BioseqUnlock (bsp);
24838     return FALSE;
24839   }
24840 
24841   if (strand != Seq_strand_minus) {
24842     if (! StreamCacheSetup (bsp, NULL, EXPAND_GAPS_TO_DASHES, &sc)) {
24843       BioseqUnlock (bsp);
24844       return FALSE;
24845     }
24846   } else {
24847     sint.from = 0;
24848     sint.to = len - 1;
24849     sint.strand = strand;
24850     sint.id = sip;
24851     vn.choice = SEQLOC_INT;
24852     vn.data.ptrvalue = (Pointer) &sint;
24853     vn.next = NULL;
24854     if (! StreamCacheSetup (NULL, &vn, EXPAND_GAPS_TO_DASHES, &sc)) {
24855       BioseqUnlock (bsp);
24856       return FALSE;
24857     }
24858   }
24859   BioseqUnlock (bsp);
24860 
24861   if (strand != Seq_strand_minus) {
24862     strt = acceptor;
24863     stp = donor;
24864   } else {
24865     strt = donor;
24866     donor = acceptor;
24867     acceptor = strt;
24868     stp = len - donor - 1;
24869     strt = len - acceptor - 1;
24870   }
24871 
24872   if (slpTag == SLP_NOSTOP && stp < len - 2) {
24873     StreamCacheSetPosition (&sc, stp + 1);
24874     residue = StreamCacheGetResidue (&sc);
24875     if (residue == '-' || residue == 'N') {
24876       rsult = TRUE;
24877     }
24878   } else if (slpTag == SLP_NOSTART && strt > 1) {
24879     StreamCacheSetPosition (&sc, strt - 1);
24880     residue = StreamCacheGetResidue (&sc);
24881     if (residue == '-' || residue == 'N') {
24882       rsult = TRUE;
24883     }
24884   }
24885 
24886   return rsult;
24887 }
24888 
24889 
/*
 * Disabled earlier version of CheckTrnaCodons.  Everything between the
 * #if 0 / #endif pair is excluded from compilation; the active definition
 * of CheckTrnaCodons appears later in this file.
 *
 * As written, this version converts the tRNA amino acid to ncbieaa, walks
 * the six codon slots of the tRNA, reports ERR_SEQ_FEAT_TrnaCodonWrong when
 * the genetic-code translation of a codon differs from the amino acid
 * (unless one of the selenocysteine / pyrrolysine / "modified codon
 * recognition" exemptions applies), reports ERR_SEQ_FEAT_BadTrnaCodon for
 * slot values from 64 through 254, and finally reports
 * ERR_SEQ_FEAT_BadTrnaAA for a missing or out-of-range amino acid on
 * non-pseudo features.
 *
 * NOTE(review): it references genetic_code_name_list and several helpers
 * that must be declared before this point if the block is ever re-enabled.
 */
#if 0
static void CheckTrnaCodons (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPtr sfp, tRNAPtr trp)
{
  Uint1           aa = 0;
  BioseqPtr       bsp;
  Int2            code = 0;
  CharPtr         codes = NULL;
  Uint1           codon [4];
  Uint1           from;
  CharPtr         gen_code_name = NULL;
  GeneticCodePtr  gncp;
  Uint2           idx;
  Int2            j;
  Int2            k;
  ErrSev          sev = SEV_ERROR;
  SeqMapTablePtr  smtp;
  Uint1           taa;
  CharPtr         three_letter_aa = NULL;
  ValNodePtr      vnp;

  if (vsp == NULL || gcp == NULL || sfp == NULL || trp == NULL)
    return;

  aa = 0;
  if (trp->aatype == 2) {
    aa = trp->aa;
  } else {
    from = 0;
    switch (trp->aatype) {
    case 0:
      from = 0;
      break;
    case 1:
      from = Seq_code_iupacaa;
      break;
    case 2:
      from = Seq_code_ncbieaa;
      break;
    case 3:
      from = Seq_code_ncbi8aa;
      break;
    case 4:
      from = Seq_code_ncbistdaa;
      break;
    default:
      break;
    }
    smtp = SeqMapTableFind (Seq_code_ncbieaa, from);
    if (smtp != NULL) {
      aa = SeqMapTableConvert (smtp, trp->aa);
    }
  }

  for (j = 0; j < 6; j++) {
    if (trp->codon[j] < 64) {
      if (codes == NULL) {
        bsp = GetBioseqGivenSeqLoc (sfp->location, gcp->entityID);
        /*
        sep = GetBestTopParentForData (gcp->entityID, bsp);
        code = SeqEntryToGeneticCode (sep, NULL, NULL, 0);
        */
        BioseqToGeneticCode (bsp, &code, NULL, NULL, NULL, 0, NULL);
        gncp = GeneticCodeFind (code, NULL);
        if (gncp == NULL) {
          gncp = GeneticCodeFind (1, NULL);
          code = 1;
        }
        if (gncp == NULL)
          return;
        for (vnp = (ValNodePtr) gncp->data.ptrvalue; vnp != NULL; vnp = vnp->next) {
          if (vnp->choice == 3) {
            codes = (CharPtr) vnp->data.ptrvalue;
          }
        }
      }
      if (codes == NULL)
        return;
      taa = codes[trp->codon[j]];
      if (aa > 0 && aa != 255) {
        if (taa != aa) {
          if (aa == 'U' || aa == 'O') {
            sev = SEV_WARNING;
          }
          if (aa == 'U' && taa == '*' && trp->codon [j] == 14) {
            /* selenocysteine normally uses TGA (14), so ignore without requiring exception in record */
          } else if (aa == 'O' && taa == '*' && trp->codon [j] == 11) {
            /* pyrrolysine normally uses TAG (11) in archaebacteria, so ignore without requiring exception in record */

            /* TAA (10) is not yet known to be used for an exceptional amino acid */
          } else if (StringISearch (sfp->except_text, "modified codon recognition") == NULL) {
            codon [0] = '\0';
            if (CodonForIndex (trp->codon [j], Seq_code_iupacna, codon)) {
              for (k = 0; k < 3; k++) {
                if (codon [k] == 'T') {
                  codon [k] = 'U';
                }
              }
              codon [3] = '\0';
            } else {
              StringCpy ((CharPtr) codon, "?");
            }
            three_letter_aa = Get3LetterSymbol (NULL, Seq_code_ncbieaa, NULL, aa);
            if (StringHasNoText (three_letter_aa)) {
              three_letter_aa = "?";
            }
            for (vnp = genetic_code_name_list; vnp != NULL; vnp = vnp->next) {
              if (vnp->choice != (Uint1) code) continue;
              gen_code_name = (CharPtr) vnp->data.ptrvalue;
              break;
            }
            if (StringHasNoText (gen_code_name)) {
              gen_code_name = "?";
            }
            ValidErr (vsp, sev, ERR_SEQ_FEAT_TrnaCodonWrong,
                      "Codon recognized by tRNA (%s) does not match amino acid (%c/%s) specified by genetic code (%d/%s)",
                      (char *) codon, (char) aa, (char *) three_letter_aa, (int) code, (char *) gen_code_name);
          }
        }
      }
    } else if (trp->codon [j] < 255) {
      ValidErr (vsp, sev, ERR_SEQ_FEAT_BadTrnaCodon, "tRNA codon value %d is greater than maximum 63", (int) trp->codon [j]);
    }
  }

  if (sfp->pseudo) return;

  if (aa > 0 && aa != 255) {
    /* - no gaps now that O and J are added
    if (aa <= 74) {
      shift = 0;
    } else if (aa > 79) {
      shift = 2;
    } else {
      shift = 1;
    }
    */
    if (aa != '*') {
      idx = aa - (64 /* + shift */);
    } else {
      idx = 25; /* termination */
    }
    if (idx > 0 && idx < 28) {
      /* valid trna amino acid */
    } else {
      ValidErr (vsp, sev, ERR_SEQ_FEAT_BadTrnaAA, "Invalid tRNA amino acid");
    }
  } else {
    ValidErr (vsp, sev, ERR_SEQ_FEAT_BadTrnaAA, "Missing tRNA amino acid");
  }
}
#endif
25041 
TwoListsHaveCommonItem(ValNodePtr list1,ValNodePtr list2)25042 static Boolean TwoListsHaveCommonItem (
25043   ValNodePtr list1,
25044   ValNodePtr list2
25045 )
25046 
25047 {
25048   CharPtr     str1;
25049   CharPtr     str2;
25050   ValNodePtr  vnp1;
25051   ValNodePtr  vnp2;
25052 
25053   for (vnp1 = list1; vnp1 != NULL; vnp1 = vnp1->next) {
25054     str1 = (CharPtr) vnp1->data.ptrvalue;
25055     if (StringHasNoText (str1)) continue;
25056     for (vnp2 = list2; vnp2 != NULL; vnp2 = vnp2->next) {
25057       str2 = (CharPtr) vnp2->data.ptrvalue;
25058       if (StringHasNoText (str2)) continue;
25059       if (StringICmp (str1, str2) == 0) return TRUE;
25060     }
25061   }
25062 
25063   return FALSE;
25064 }
25065 
25066 
/*
 * CheckTrnaCodons
 *
 * Cross-checks the codon, anticodon and amino acid information carried by a
 * tRNA feature and reports inconsistencies through ValidErr:
 *
 *   ERR_SEQ_FEAT_BadTrnaCodon       a codon slot holds a value of 64..254
 *   ERR_SEQ_FEAT_TrnaCodonWrong     a recognized codon translates (in the
 *                                   feature's genetic code) to an amino acid
 *                                   other than the one indicated
 *   ERR_SEQ_FEAT_BadAnticodonAA     no codon predicted from the anticodon
 *                                   translates to the indicated amino acid
 *   ERR_SEQ_FEAT_BadAnticodonCodon  no recognized codon is among the codons
 *                                   predicted from the anticodon
 *   ERR_SEQ_FEAT_BadTrnaAA          amino acid missing or out of range
 *                                   (skipped for pseudo features)
 *
 * Parameters:
 *   vsp  validator state, passed through to ValidErr
 *   gcp  gather context; only checked for NULL here
 *   sfp  the tRNA feature (location, except_text, pseudo flag are read)
 *   trp  the tRNA data (codon slots, anticodon location, amino acid)
 *
 * Returns nothing.  Returns early without reporting if any argument is NULL
 * or if GetCodesFortRNA yields no translation table.
 */
static void CheckTrnaCodons (
  ValidStructPtr vsp,
  GatherContextPtr gcp,
  SeqFeatPtr sfp,
  tRNAPtr trp
)

{
  Uint1           aa = 0;
  Uint1           anticodon [4];
  Char            ch;
  Int2            code = 0;
  CharPtr         codes = NULL;
  Uint1           codon [4];
  CharPtr         complementBase = " TVGH  CD  M KN   YSAABW R ";
  CharPtr         gen_code_name = NULL;
  Int2            i;
  Uint2           idx;
  Uint1           index;
  Int2            j;
  Int2            k;
  Uint1           letterToComp [256];
  Char            lttr;
  Boolean         okay;
  ValNodePtr      possibles = NULL;
  ValNodePtr      recognizes = NULL;
  StreamCache     sc;
  ErrSev          sev = SEV_ERROR;
  SeqLocPtr       slp;
  CharPtr         str;
  Uint1           taa;
  CharPtr         three_letter_aa = NULL;
  ValNodePtr      vnp;
  CharPtr         wobble = NULL;
  Boolean         rna_editing = FALSE;

  if (vsp == NULL || gcp == NULL || sfp == NULL || trp == NULL) return;

  /* clear the whole buffer, not just the first byte: the T -> U pass further
     down reads all three base positions even when no anticodon sequence
     could be read */

  for (i = 0; i < 4; i++) {
    anticodon [i] = '\0';
  }

  /* extract indicated amino acid */

  aa = GetAaFromtRNA (trp);

  three_letter_aa = Get3LetterSymbol (NULL, Seq_code_ncbieaa, NULL, aa);
  if (StringHasNoText (three_letter_aa)) {
    three_letter_aa = "?";
  }

  /* find genetic code table */
  codes = GetCodesFortRNA(sfp, &code);

  if (codes == NULL) return;

  /* look up a printable name for the genetic code, for use in messages */

  for (vnp = genetic_code_name_list; vnp != NULL; vnp = vnp->next) {
    if (vnp->choice != (Uint1) code) continue;
    gen_code_name = (CharPtr) vnp->data.ptrvalue;
    break;
  }
  if (StringHasNoText (gen_code_name)) {
    gen_code_name = "?";
  }

  /* set up nucleotide complementation lookup table; complementBase is
     indexed 1..26 for 'A'..'Z', and both upper and lower case letters map
     to the upper case complement */

  for (i = 0; i < 256; i++) {
    letterToComp [i] = '\0';
  }
  for (ch = 'A', i = 1; ch <= 'Z'; ch++, i++) {
    lttr = complementBase [i];
    if (lttr != ' ') {
      letterToComp [(int) (Uint1) ch] = lttr;
    }
  }
  for (ch = 'a', i = 1; ch <= 'z'; ch++, i++) {
    lttr = complementBase [i];
    if (lttr != ' ') {
      letterToComp [(int) (Uint1) ch] = lttr;
    }
  }

  /* exact match here; the anticodon checks below use a substring search */

  if (StringCmp (sfp->except_text, "RNA editing") == 0) {
    rna_editing = TRUE;
  }

  /* loop through codon_recognized array */

  for (j = 0; j < 6; j++) {
    index = (Uint1) trp->codon [j];

    /* 255 marks an unused slot */

    if (index == 255) continue;

    if (index >= 64) {
      ValidErr (vsp, sev, ERR_SEQ_FEAT_BadTrnaCodon,
                "tRNA codon value %d is greater than maximum 63",
                (int) (index));
      continue;
    }
    if (rna_editing) {
      continue;
    }

    taa = codes [index];

    /* codon is kept in DNA letters here so it can be compared with the
       codons predicted from the anticodon; T -> U is applied only just
       before the codon is printed */

    codon [0] = '\0';
    if (CodonForIndex (index, Seq_code_iupacna, codon)) {
      codon [3] = '\0';
    } else {
      StringCpy ((CharPtr) codon, "?");
    }

    /* save codon recognized and translated amino acid for anticodon reality check */

    ValNodeCopyStr (&recognizes, taa, (CharPtr) codon);

    if (aa == 0 || aa == 255) continue;

    /* only report if encoded amino acid does not match indicated amino acid */

    if (taa == aa) continue;

    /* once lowered, sev stays lowered for every later report in this function */

    if (aa == 'U' || aa == 'O') {
      sev = SEV_WARNING;
    }

    /* selenocysteine normally uses TGA (14), so ignore without requiring exception in record */
    if (aa == 'U' && taa == '*' && index == 14) continue;

    /* pyrrolysine normally uses TAG (11) in archaebacteria, ignore without requiring exception */
    if (aa == 'O' && taa == '*' && index == 11) continue;

    /* TAA (10) is not yet known to be used for an exceptional amino acid, but the night is young */

    /* ignore if modified codon recognition exception is present */
    if (StringISearch (sfp->except_text, "modified codon recognition") != NULL) continue;

    for (k = 0; k < 3; k++) {
      if (codon [k] == 'T') {
        codon [k] = 'U';
      }
    }
    codon [3] = '\0';

    ValidErr (vsp, sev, ERR_SEQ_FEAT_TrnaCodonWrong,
              "Codon recognized by tRNA (%s) does not match amino acid (%c/%s) specified by genetic code (%d/%s)",
              (char *) codon, (char) aa, (char *) three_letter_aa, (int) code, (char *) gen_code_name);
  }

  /* see if anticodon is compatible with codons recognized and amino acid */

  slp = trp->anticodon;
  if (slp != NULL && SeqLocLen (slp) == 3) {

    /* read sequence under anticodon */

    if (StreamCacheSetup (NULL, slp, 0, &sc)) {
      for (i = 0; i < 3; i++) {
        ch = (Char) StreamCacheGetResidue (&sc);
        anticodon [i] = ch;
      }
      anticodon [3] = '\0';

      /* reverse complement non-wobble bases */

      codon [0] = letterToComp [(int) (Uint1) anticodon [2]];
      codon [1] = letterToComp [(int) (Uint1) anticodon [1]];
      codon [3] = '\0';

      /* expand wobble base to known binding partners */

      ch = anticodon [0];
      switch (ch) {
        case 'A' :
          wobble = "ACT";
          break;
        case 'C' :
          wobble = "G";
          break;
        case 'G' :
          wobble = "CT";
          break;
        case 'T' :
          wobble = "AG";
          break;
        default :
          break;
      }

      if (wobble != NULL) {
        for (i = 0; wobble [i] != '\0'; i++) {
          codon [2] = wobble [i];
          index = IndexForCodon (codon, Seq_code_iupacna);
          if (index < 64) {
            taa = codes [index];

            /* save possible codon recognized and translated amino acid */

            ValNodeCopyStr (&possibles, taa, (CharPtr) codon);
          }
        }
      }
    }
  }

  /* convert anticodon to RNA letters for use in messages */

  for (k = 0; k < 3; k++) {
    if (anticodon [k] == 'T') {
      anticodon [k] = 'U';
    }
  }
  anticodon [3] = '\0';

  if (StringHasNoText ((CharPtr) anticodon)) {
    StringCpy ((CharPtr) anticodon, "?");
  }

  /* check that codons predicted from anticodon can transfer indicated amino acid */

  if (possibles != NULL) {
    okay = FALSE;
    for (vnp = possibles; vnp != NULL; vnp = vnp->next) {
      str = (CharPtr) vnp->data.ptrvalue;
      if (StringHasNoText (str)) continue;
      taa = vnp->choice;
      if (taa == aa) {
        okay = TRUE;
      }
    }
    if (! okay) {
      if (aa == 'U' && StringCmp ((CharPtr) anticodon, "UCA") == 0) {
        /* ignore TGA codon for selenocysteine */
      } else if (aa == 'O' && StringCmp ((CharPtr) anticodon, "CUA") == 0) {
        /* ignore TAG codon for pyrrolysine */
      } else if (StringISearch (sfp->except_text, "modified codon recognition") == NULL &&
                 StringISearch (sfp->except_text, "RNA editing") == NULL) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadAnticodonAA,
                  "Codons predicted from anticodon (%s) cannot produce amino acid (%c/%s)",
                  (char *) anticodon, (char) aa, (char *) three_letter_aa);
      }
    }
  }

  /* check that codons recognized match codons predicted from anticodon */

  if (recognizes != NULL && possibles != NULL) {
    okay = FALSE;
    if (TwoListsHaveCommonItem (recognizes, possibles)) {
      okay = TRUE;
    }
    if (! okay) {
      if (StringISearch (sfp->except_text, "RNA editing") == NULL) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadAnticodonCodon,
                  "Codon recognized cannot be produced from anticodon (%s)",
                  (char*) anticodon);
      }
    }
  }

  ValNodeFreeData (recognizes);
  ValNodeFreeData (possibles);

  if (sfp->pseudo) return;

  if (aa == 0 || aa == 255) {
    ValidErr (vsp, sev, ERR_SEQ_FEAT_BadTrnaAA, "Missing tRNA amino acid");
    return;
  }

  /* verify that legal amino acid is indicated: letters map to 1..26 by
     subtracting 64 ('A' - 1), '*' is given slot 25; values below 64 wrap
     to a large unsigned idx and are rejected by the range test */

  if (aa != '*') {
    idx = aa - 64;
  } else {
    idx = 25; /* termination */
  }
  if (idx == 0 || idx >= 28) {
    ValidErr (vsp, sev, ERR_SEQ_FEAT_BadTrnaAA, "Invalid tRNA amino acid");
  }
}
25361 
/*
 * CheckRnaProductType
 *
 * For an RNA feature of type 2 (mRNA), 3 (tRNA) or 4 (rRNA), compares the
 * feature type with the biomol field of the MolInfo descriptor found on the
 * product Bioseq, and reports ERR_SEQ_FEAT_RnaProductMismatch when they
 * disagree.  Nothing is reported for other RNA types, or when the product
 * Bioseq, its MolInfo descriptor, or the descriptor's data cannot be found.
 */
static void CheckRnaProductType (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPtr sfp, RnaRefPtr rrp)

{
  SeqMgrDescContext  dcontext;
  SeqDescrPtr        desc;
  MolInfoPtr         molinfo;
  BioseqPtr          product;

  if (vsp == NULL || gcp == NULL || sfp == NULL || rrp == NULL) return;

  product = BioseqFindFromSeqLoc (sfp->product);
  if (product == NULL) return;

  desc = SeqMgrGetNextDescriptor (product, NULL, Seq_descr_molinfo, &dcontext);
  if (desc == NULL) return;

  molinfo = (MolInfoPtr) desc->data.ptrvalue;
  if (molinfo == NULL) return;

  if (rrp->type == 2) {
    /* mRNA */
    if (molinfo->biomol == MOLECULE_TYPE_MRNA) return;
  } else if (rrp->type == 3) {
    /* tRNA */
    if (molinfo->biomol == MOLECULE_TYPE_TRNA) return;
  } else if (rrp->type == 4) {
    /* rRNA */
    if (molinfo->biomol == MOLECULE_TYPE_RRNA) return;
  } else {
    /* other RNA types are not checked */
    return;
  }

  ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_RnaProductMismatch, "Type of RNA does not match MolInfo of product Bioseq");
}
25392 
GetParentNPS(BioseqPtr bsp)25393 static BioseqSetPtr GetParentNPS (BioseqPtr bsp)
25394 {
25395   BioseqSetPtr    bssp;
25396 
25397   if (bsp == NULL)
25398     return NULL;
25399   if (bsp->idx.parenttype != OBJ_BIOSEQSET)
25400     return NULL;
25401   bssp = (BioseqSetPtr) bsp->idx.parentptr;
25402   while (bssp != NULL && bssp->_class != BioseqseqSet_class_nuc_prot && bssp->idx.parenttype == OBJ_BIOSEQSET) {
25403     bssp = (BioseqSetPtr) bssp->idx.parentptr;
25404   }
25405   if (bssp != NULL && bssp->_class == BioseqseqSet_class_nuc_prot)
25406     return bssp;
25407   return NULL;
25408 }
25409 
NucAndProtNotInNPS(BioseqPtr nuc,BioseqPtr prot)25410 static Boolean NucAndProtNotInNPS (BioseqPtr nuc, BioseqPtr prot)
25411 {
25412   BioseqSetPtr    bssp;
25413 
25414   if (nuc == NULL || prot == NULL)
25415     return FALSE;
25416   bssp = GetParentNPS (nuc);
25417   if (bssp == NULL)
25418     return TRUE;
25419   if (GetParentNPS (prot) != bssp)
25420     return TRUE;
25421   return FALSE;
25422 }
25423 
CDS5primePartialTest(SeqFeatPtr sfp)25424 static Boolean CDS5primePartialTest (
25425   SeqFeatPtr sfp
25426 )
25427 
25428 {
25429   BioseqPtr  nbsp;
25430   SeqLocPtr  slp = NULL;
25431 
25432   if (sfp == NULL) return FALSE;
25433   nbsp = BioseqFindFromSeqLoc (sfp->location);
25434   if (nbsp != NULL) {
25435     slp = SeqLocFindNext (sfp->location, NULL);
25436     if (slp != NULL) {
25437       if (SeqLocStrand (slp) == Seq_strand_minus) {
25438         if (SeqLocStop (slp) == nbsp->length - 1) {
25439           return TRUE;
25440         }
25441       } else {
25442         if (SeqLocStart (slp) == 0) {
25443           return TRUE;
25444         }
25445       }
25446     }
25447   }
25448   return FALSE;
25449 }
25450 
CDS3primePartialTest(SeqFeatPtr sfp)25451 static Boolean CDS3primePartialTest (
25452   SeqFeatPtr sfp
25453 )
25454 
25455 {
25456   BioseqPtr  nbsp;
25457   SeqLocPtr  last = NULL;
25458   SeqLocPtr  slp = NULL;
25459 
25460   if (sfp == NULL) return FALSE;
25461   nbsp = BioseqFindFromSeqLoc (sfp->location);
25462   if (nbsp != NULL) {
25463     last = NULL;
25464     slp = SeqLocFindNext (sfp->location, NULL);
25465     while (slp != NULL) {
25466       last = slp;
25467       slp = SeqLocFindNext (sfp->location, last);
25468     }
25469     if (last != NULL) {
25470       if (SeqLocStrand (last) == Seq_strand_minus) {
25471         if (SeqLocStart (last) == 0) {
25472           return TRUE;
25473         }
25474       } else {
25475         if (SeqLocStop (last) == nbsp->length - 1) {
25476           return TRUE;
25477         }
25478       }
25479     }
25480   }
25481   return FALSE;
25482 }
25483 
/*
 * Exception-text phrases that exempt a CDS from the partialness comparison
 * done in CheckCDSPartial.  When the feature's exception flag is set,
 * exceptions are not being ignored, and except_text contains any of these
 * phrases (searched with StringISearch), CheckCDSPartial returns without
 * reporting.  The array is NULL-terminated; phrases inside comments below
 * are currently not part of the list.
 */
static CharPtr bypass_cds_partial_check [] = {
  "RNA editing",
  "reasons given in citation",
  "artificial frameshift",
  "rearrangement required for product",
  "translated product replaced",
  "unclassified translation discrepancy",
  "mismatches in translation",
  /*
  "adjusted for low-quality genome",
  */
  "annotated by transcript or proteomic data",
  /*
  "heterogeneous population sequenced",
  "low-quality sequence region",
  "artificial location",
  */
  NULL
};
25503 
/*
 * CheckCDSPartial
 *
 * Compares the 5' / 3' partial state of a CDS location (as reported by
 * CheckSeqLocForPartial) with the completeness value in the MolInfo
 * descriptor of the protein product, and reports
 * ERR_SEQ_FEAT_PartialProblem for each disagreement.
 *
 * Nothing is checked when vsp or sfp is NULL, the feature has no product,
 * vsp->useSeqMgrIndexes is off, the feature carries one of the exception
 * phrases in bypass_cds_partial_check (and exceptions are not ignored), or
 * the product Bioseq / its MolInfo cannot be obtained.  When the product is
 * not found locally and vsp->farFetchCDSproducts is set, the product is
 * locked by id and unlocked again before returning.
 *
 * Where the partial end of the CDS coincides with the end of the nucleotide
 * (CDS5primePartialTest / CDS3primePartialTest), the report is a warning
 * rather than an error.
 */
static void CheckCDSPartial (ValidStructPtr vsp, SeqFeatPtr sfp)

{
  SeqMgrDescContext  dcontext;
  SeqDescrPtr        desc;
  MolInfoPtr         molinfo = NULL;
  BioseqPtr          prot;
  Boolean            fivePartial;
  Boolean            threePartial;
  Boolean            locked = FALSE;
  ErrSev             level;
  Int4               n;

  if (vsp == NULL || sfp == NULL) return;
  if (sfp->product == NULL) return;
  if (! vsp->useSeqMgrIndexes) return;

  if (sfp->excpt && (! vsp->ignoreExceptions) && (! StringHasNoText (sfp->except_text))) {
    n = 0;
    while (bypass_cds_partial_check [n] != NULL) {
      if (StringISearch (sfp->except_text,  bypass_cds_partial_check [n]) != NULL) {
        return;  /* biological exception */
      }
      n++;
    }
  }

  prot = BioseqFindFromSeqLoc (sfp->product);
  if (prot == NULL && vsp->farFetchCDSproducts) {
    prot = BioseqLockById (SeqLocId(sfp->product));
    if (prot != NULL) {
      locked = TRUE;
    }
  }
  if (prot == NULL) return;

  desc = SeqMgrGetNextDescriptor (prot, NULL, Seq_descr_molinfo, &dcontext);
  if (desc != NULL) {
    molinfo = (MolInfoPtr) desc->data.ptrvalue;
  }

  if (molinfo != NULL) {
    CheckSeqLocForPartial (sfp->location, &fivePartial, &threePartial);

    /* completeness values 0 (unknown), 2 (partial), 6 (has-left),
       7 (has-right) and anything unrecognized produce no report */

    switch (molinfo->completeness) {
      case 1 : /* complete */
        if (fivePartial || threePartial) {
          ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_PartialProblem, "CDS is partial but protein is complete");
        }
        break;

      case 3 : /* no-left */
        if (! fivePartial) {
          ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_PartialProblem, "CDS is 5' complete but protein is NH2 partial");
        }
        if (threePartial) {
          level = CDS3primePartialTest (sfp) ? SEV_WARNING : SEV_ERROR;
          ValidErr (vsp, level, ERR_SEQ_FEAT_PartialProblem, "CDS is 3' partial but protein is NH2 partial");
        }
        break;

      case 4 : /* no-right */
        if (! threePartial) {
          ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_PartialProblem, "CDS is 3' complete but protein is CO2 partial");
        }
        if (fivePartial) {
          level = CDS5primePartialTest (sfp) ? SEV_WARNING : SEV_ERROR;
          ValidErr (vsp, level, ERR_SEQ_FEAT_PartialProblem, "CDS is 5' partial but protein is CO2 partial");
        }
        break;

      case 5 : /* no-ends */
        if ((! fivePartial) && (! threePartial)) {
          ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_PartialProblem, "CDS is complete but protein has neither end");
        } else if (! threePartial) {
          /* only the 5' end is partial */
          level = CDS5primePartialTest (sfp) ? SEV_WARNING : SEV_ERROR;
          ValidErr (vsp, level, ERR_SEQ_FEAT_PartialProblem, "CDS is 5' partial but protein has neither end");
        } else if (! fivePartial) {
          /* only the 3' end is partial */
          level = CDS3primePartialTest (sfp) ? SEV_WARNING : SEV_ERROR;
          ValidErr (vsp, level, ERR_SEQ_FEAT_PartialProblem, "CDS is 3' partial but protein has neither end");
        }
        /* both ends partial agrees with no-ends: nothing to report */
        break;

      default :
        break;
    }
  }

  if (locked) {
    BioseqUnlock (prot);
  }
}
25616 
/*
 * CheckForCommonCDSProduct
 *
 * Checks the product of a CDS feature and reports:
 *
 *   ERR_SEQ_FEAT_MissingCDSproduct           the product Bioseq cannot be
 *                                            found (with exemptions below)
 *   ERR_SEQ_FEAT_CDSproductPackagingProblem  nucleotide and protein are not
 *                                            in the same nuc-prot set
 *   ERR_SEQ_FEAT_MultipleCDSproducts         another CDS already claims the
 *                                            same product Bioseq
 *
 * Parameters:
 *   vsp  validator state (useSeqMgrIndexes, sep, is_small_genome_set are
 *        read); passed through to ValidErr
 *   sfp  the CDS feature
 *
 * Nothing is checked when vsp or sfp is NULL, the feature is pseudo,
 * indexing is off, the coding region is flagged orf, the overlapping gene
 * is pseudo, or the feature has no product.
 */
static void CheckForCommonCDSProduct (ValidStructPtr vsp, SeqFeatPtr sfp)
{
  BioseqPtr       bsp;
  BioseqSetPtr    bssp;
  SeqFeatPtr      cds;
  CdRegionPtr     crp;
  SeqFeatPtr      gene;
  GeneRefPtr      grp;
  Boolean         is_nc_gps = FALSE;
  Boolean         is_nt = FALSE;
  BioseqPtr       nuc;
  SeqEntryPtr     sep;
  SeqIdPtr        sip;
  TextSeqIdPtr    tsip;

  /* vsp is dereferenced immediately below, so guard it along with sfp */
  if (vsp == NULL || sfp == NULL || sfp->pseudo)
    return;
  if (!vsp->useSeqMgrIndexes)
    return;
  crp = (CdRegionPtr) sfp->data.value.ptrvalue;
  if (crp != NULL && crp->orf)
    return;

  /* skip CDS features whose overlapping gene is pseudo, unless the gene
     xref on the feature is a suppressing one */
  grp = SeqMgrGetGeneXref (sfp);
  if (grp == NULL || (!SeqMgrGeneIsSuppressed (grp))) {
    gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
    if (gene != NULL) {
      if (gene->pseudo) return;
      grp = (GeneRefPtr) gene->data.value.ptrvalue;
      if (grp != NULL && grp->pseudo) return;
    }
  }
  if (sfp->product == NULL) return;
  bsp = BioseqFindFromSeqLoc (sfp->product);
  if (bsp == NULL) {
    sip = SeqLocId (sfp->product);
    /* okay to have far RefSeq product... */
    if (sip == NULL || sip->choice != SEQID_OTHER) {
      sep = vsp->sep;
      if (sep != NULL && IS_Bioseq_set (sep)) {
        bssp = (BioseqSetPtr) sep->data.ptrvalue;
        /* but only if genomic product set */
        if (bssp != NULL && bssp->_class == BioseqseqSet_class_gen_prod_set)
          return;
        if (bssp != NULL && bssp->_class == BioseqseqSet_class_genbank) {
          /* look one level down: first member of the genbank set */
          sep = bssp->seq_set;
          if (sep != NULL && IS_Bioseq_set (sep)) {
            bssp = (BioseqSetPtr) sep->data.ptrvalue;
            if (bssp != NULL && bssp->_class == BioseqseqSet_class_gen_prod_set)
              return;
          }
        }
      }
      /* or just a bioseq (note: sep may have been moved to the first
         member of a genbank set above) */
      if (sep != NULL && IS_Bioseq (sep))
        return;
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_MissingCDSproduct, "Unable to find product Bioseq from CDS feature");
    }
    return;
  }
  nuc = BioseqFindFromSeqLoc (sfp->location);
  if (nuc != NULL) {
    /* only the NT_ prefix affects the packaging test; NC_ and NW_ were
       formerly tracked as well but their flags were never read */
    for (sip = nuc->id; sip != NULL; sip = sip->next) {
      if (sip->choice == SEQID_OTHER) {
        tsip = (TextSeqIdPtr) sip->data.ptrvalue;
        if (tsip != NULL && tsip->accession != NULL) {
          if (StringNICmp (tsip->accession, "NT_", 3) == 0) {
            is_nt = TRUE;
          }
        }
      }
    }
    if (nuc->idx.parenttype == OBJ_BIOSEQSET) {
      bssp = (BioseqSetPtr) nuc->idx.parentptr;
      if (bssp != NULL) {
        if (bssp->_class == BioseqseqSet_class_gen_prod_set) {
          is_nc_gps = TRUE;
        }
      }
    }
    if (NucAndProtNotInNPS (nuc, bsp) && (! is_nt) && (! is_nc_gps)) {
      if (vsp->is_small_genome_set) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_CDSproductPackagingProblem, "Protein product not packaged in nuc-prot set with nucleotide in small genome set");
      } else {
        ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_CDSproductPackagingProblem, "Protein product not packaged in nuc-prot set with nucleotide");
      }
    }
  }
  cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
  if (cds == NULL) return;
  if (cds != sfp) {
    /* if genomic product set, with one cds on contig and one on cdna, do not report */
    sep = vsp->sep;
    if (sep != NULL && IS_Bioseq_set (sep)) {
      bssp = (BioseqSetPtr) sep->data.ptrvalue;
      if (bssp != NULL && bssp->_class == BioseqseqSet_class_gen_prod_set) {
        /* feature packaging test will do final contig vs. cdna check */
        if (BioseqFindFromSeqLoc (cds->location) != BioseqFindFromSeqLoc (sfp->location))
          return;
      }
      if (bssp != NULL && bssp->_class == BioseqseqSet_class_genbank) {
        sep = bssp->seq_set;
        if (sep != NULL && IS_Bioseq_set (sep)) {
          bssp = (BioseqSetPtr) sep->data.ptrvalue;
          if (bssp != NULL && bssp->_class == BioseqseqSet_class_gen_prod_set) {
            if (BioseqFindFromSeqLoc (cds->location) != BioseqFindFromSeqLoc (sfp->location))
              return;
          }
        }
      }
    }

    ValidErr (vsp, SEV_REJECT, ERR_SEQ_FEAT_MultipleCDSproducts, "Same product Bioseq from multiple CDS features");
  }
}
25737 
/*
 * CheckForCommonMRNAProduct
 *
 * Checks the product of an mRNA feature and reports:
 *
 *   ERR_SEQ_FEAT_MissingMRNAproduct    the product has a local id, cannot be
 *                                      found within the record, and the top
 *                                      level set is of class gen_prod_set or
 *                                      other
 *   ERR_SEQ_FEAT_MultipleMRNAproducts  another mRNA already claims the same
 *                                      product Bioseq
 *
 * Parameters:
 *   vsp  validator state (useSeqMgrIndexes and sep are read); passed
 *        through to ValidErr
 *   sfp  the mRNA feature
 *
 * Nothing is checked when vsp or sfp is NULL, the feature is pseudo,
 * indexing is off, or the feature has no product.  Unless the gene xref on
 * the feature is a suppressing one, the check is also skipped when no
 * overlapping gene is found or that gene is pseudo.
 */
static void CheckForCommonMRNAProduct (ValidStructPtr vsp, SeqFeatPtr sfp)
{
  BioseqPtr       bsp;
  BioseqSetPtr    bssp;
  SeqFeatPtr      gene;
  GeneRefPtr      grp;
  SeqFeatPtr      mrna;
  SeqEntryPtr     oldscope;
  SeqEntryPtr     sep;
  SeqIdPtr        sip;

  /* vsp is dereferenced immediately below, so guard it along with sfp */
  if (vsp == NULL || sfp == NULL || sfp->pseudo)
    return;
  if (!vsp->useSeqMgrIndexes)
    return;
  grp = SeqMgrGetGeneXref (sfp);
  if (grp == NULL || (!SeqMgrGeneIsSuppressed (grp))) {
    gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
    if (gene == NULL || gene->pseudo)
      return;
    grp = (GeneRefPtr) gene->data.value.ptrvalue;
    if (grp != NULL && grp->pseudo)
      return;
  }
  if (sfp->product == NULL)
    return;

  /* restrict the product lookup to the record being validated, then put
     the previous scope back */
  oldscope = SeqEntrySetScope (vsp->sep);
  bsp = BioseqFindFromSeqLoc (sfp->product);
  SeqEntrySetScope (oldscope);
  if (bsp == NULL) {
    sip = SeqLocId (sfp->product);
    if (sip != NULL && sip->choice == SEQID_LOCAL) {
      sep = vsp->sep;
      if (sep != NULL && IS_Bioseq_set (sep)) {
        bssp = (BioseqSetPtr) sep->data.ptrvalue;
        if (bssp != NULL) {
          if (bssp->_class == BioseqseqSet_class_gen_prod_set ||
              bssp->_class == BioseqseqSet_class_other) {
            ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_MissingMRNAproduct,
            "Product Bioseq of mRNA feature is not packaged in the record");
          }
        }
      }
    }
    return;
  }

  mrna = SeqMgrGetRNAgivenProduct (bsp, NULL);
  if (mrna == NULL)
    return;
  if (mrna != sfp) {
    ValidErr (vsp, SEV_REJECT, ERR_SEQ_FEAT_MultipleMRNAproducts, "Same product Bioseq from multiple mRNA features");
  }
}
25793 
/*
 * CheckForBadGeneOverlap
 *
 * Reports a feature that is overlapped by a gene (SIMPLE_OVERLAP) without
 * being matched by SeqMgrGetOverlappingGene:
 *
 *   ERR_SEQ_FEAT_CDSgeneRange   for a coding region feature
 *   ERR_SEQ_FEAT_mRNAgeneRange  for an RNA feature that is not covered by an
 *                               overlapping operon
 *
 * Parameters:
 *   vsp  validator state (sep is read); passed through to ValidErr
 *   sfp  the feature to check
 *
 * Nothing is checked when vsp or sfp is NULL or when the feature carries a
 * gene xref.  Other feature kinds produce no report.
 */
static void CheckForBadGeneOverlap (ValidStructPtr vsp, SeqFeatPtr sfp)
{
  SeqMgrFeatContext fcontext;
  SeqFeatPtr      gene, operon;
  GeneRefPtr      grp;
  ErrSev          sev = /* SEV_ERROR */ SEV_WARNING;

  /* vsp->sep is read further down, so guard vsp along with sfp */
  if (vsp == NULL || sfp == NULL)
    return;
  grp = SeqMgrGetGeneXref (sfp);
  if (grp != NULL)
    return;
  gene = SeqMgrGetOverlappingGene (sfp->location, &fcontext);
  if (gene != NULL)
    return;
  gene = SeqMgrGetOverlappingFeature (sfp->location, FEATDEF_GENE, NULL, 0, NULL, SIMPLE_OVERLAP, &fcontext);
  if (gene == NULL)
    return;
  /* the default severity is already SEV_WARNING, so this assignment does
     not currently change the outcome; the call is kept as written */
  if (IsNCorNT (vsp->sep, sfp->location)) {
    sev = SEV_WARNING;
  }
  if (sfp->data.choice == SEQFEAT_CDREGION) {
    ValidErr (vsp, sev, ERR_SEQ_FEAT_CDSgeneRange, "gene overlaps CDS but does not completely contain it");
  } else if (sfp->data.choice == SEQFEAT_RNA) {
    operon = SeqMgrGetOverlappingOperon (sfp->location, &fcontext);
    if (operon != NULL)
      return;
    ValidErr (vsp, sev, ERR_SEQ_FEAT_mRNAgeneRange, "gene overlaps mRNA but does not completely contain it");
  }
}
25824 
/*
 * CheckForBadMRNAOverlap
 *
 * Reports a feature (the messages call it a CDS) that is overlapped by an
 * mRNA feature whose intervals do not properly contain it.
 *
 *   vsp  validator state; vsp->sep feeds the NC/NT/NW test, ValidErr posts
 *        through it
 *   sfp  feature being checked; NULL is ignored
 *
 * The mRNA lookups narrow the situation step by step:
 *   SIMPLE_OVERLAP    none found -> return, no mRNA to compare against
 *   CHECK_INTERVALS   found      -> return, intervals are acceptable
 *   INTERVAL_OVERLAP  none found -> return, nothing to report
 *   LOCATION_SUBSET   found      -> "internal intron-exon boundaries do not
 *                                   match" (skipped for ribosomal slippage
 *                                   and trans-splicing exceptions)
 *                     none found -> "does not completely contain intervals"
 *
 * When the feature, or the gene feature located through its gene xref, is
 * pseudo, PseudoCDSmRNArange is posted at SEV_INFO instead of CDSmRNArange.
 * If the (non-suppressed) gene xref itself is flagged pseudo the function
 * returns without reporting anything.
 */
static void CheckForBadMRNAOverlap (ValidStructPtr vsp, SeqFeatPtr sfp)

{
  BioseqPtr          bsp;
  SeqMgrFeatContext  fcontext;
  SeqFeatPtr         gene = NULL;
  GeneRefPtr         grp;
  SeqFeatPtr         mrna;
  Boolean            pseudo = FALSE;
  ErrSev             sev = /* SEV_ERROR */ SEV_WARNING;

  if (sfp == NULL)
    return;

  if (sfp->pseudo) {
    pseudo = TRUE;
  }

  grp = SeqMgrGetGeneXref (sfp);
  if (grp != NULL && ! SeqMgrGeneIsSuppressed (grp)) {
    if (grp->pseudo) return;
    bsp = BioseqFindFromSeqLoc (sfp->location);
    if (bsp != NULL) {
      /* resolve the xref to the actual gene feature: locus_tag first, then locus */
      if (StringDoesHaveText (grp->locus_tag)) {
        gene = SeqMgrGetGeneByLocusTag (bsp, grp->locus_tag, &fcontext);
      } else if (StringDoesHaveText (grp->locus)) {
        /* locus_tag has no text on this branch, so the label must be the locus */
        gene = SeqMgrGetFeatureByLabel (bsp, grp->locus, SEQFEAT_GENE, 0, &fcontext);
      }
      if (gene != NULL) {
        grp = (GeneRefPtr) gene->data.value.ptrvalue;
        if (grp != NULL && grp->pseudo) {
          pseudo = TRUE;
        }
      }
    }
  }

  mrna = SeqMgrGetOverlappingFeature (sfp->location, FEATDEF_mRNA, NULL, 0, NULL, SIMPLE_OVERLAP, &fcontext);
  if (mrna == NULL)
    return;
  mrna = SeqMgrGetOverlappingFeature (sfp->location, FEATDEF_mRNA, NULL, 0, NULL, CHECK_INTERVALS, &fcontext);
  if (mrna != NULL)
    return;
  mrna = SeqMgrGetOverlappingFeature (sfp->location, FEATDEF_mRNA, NULL, 0, NULL, INTERVAL_OVERLAP, &fcontext);
  if (mrna == NULL)
    return;

  /* both adjustments are currently no-ops: sev already defaults to warning */
  if (IsNCorNTorNW (vsp->sep, sfp->location)) {
    sev = SEV_WARNING;
  }
  if (sfp->excpt) {
    sev = SEV_WARNING;
  }

  mrna = SeqMgrGetOverlappingFeature (sfp->location, FEATDEF_mRNA, NULL, 0, NULL, LOCATION_SUBSET, &fcontext);
  if (mrna != NULL) {
    if (StringISearch (sfp->except_text, "ribosomal slippage") == NULL && StringISearch (sfp->except_text, "trans-splicing") == NULL) {
      if (pseudo) {
        //LCOV_EXCL_START
        //excluded from coverage: the original author notes this branch is not
        //reached in practice (NOTE(review): presumably pseudo features are
        //filtered out by the caller - not verifiable from this function)
        ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_PseudoCDSmRNArange, "mRNA contains CDS but internal intron-exon boundaries do not match");
        //LCOV_EXCL_STOP
      } else {
        ValidErr (vsp, sev, ERR_SEQ_FEAT_CDSmRNArange, "mRNA contains CDS but internal intron-exon boundaries do not match");
      }
    }
  } else {
    if (pseudo) {
      //LCOV_EXCL_START
      //excluded from coverage for the same reason as the pseudo branch above
      ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_PseudoCDSmRNArange, "mRNA overlaps or contains CDS but does not completely contain intervals");
      //LCOV_EXCL_STOP
    } else {
      ValidErr (vsp, sev, ERR_SEQ_FEAT_CDSmRNArange, "mRNA overlaps or contains CDS but does not completely contain intervals");
    }
  }
}
25902 
25903 /*
25904 static void CheckForBothStrands (ValidStructPtr vsp, SeqFeatPtr sfp)
25905 {
25906   Boolean    bothstrands = FALSE, bothreverse = FALSE;
25907   SeqLocPtr  location, slp = NULL;
25908   Uint1      strand;
25909 
25910   if (sfp == NULL)
25911     return;
25912   location = sfp->location;
25913   if (location == NULL)
25914     return;
25915   while ((slp = SeqLocFindNext (location, slp)) != NULL) {
25916     strand = SeqLocStrand (slp);
25917     if (strand == Seq_strand_both) {
25918       bothstrands = TRUE;
25919     } else if (strand == Seq_strand_both_rev) {
25920       bothreverse = TRUE;
25921     }
25922   }
25923   if (bothstrands && bothreverse) {
25924     ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_BothStrands, "mRNA or CDS may not be on both (forward and reverse) strands");
25925   } else if (bothstrands) {
25926     ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_BothStrands, "mRNA or CDS may not be on both (forward) strands");
25927   } else if (bothreverse) {
25928     ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_BothStrands, "mRNA or CDS may not be on both (reverse) strands");
25929   }
25930 }
25931 */
25932 
/*
 * CheckForBothOrBothRev
 *
 * Walks every piece of the feature location (SeqLocFindNext) and reports:
 *   - BothStrands    if any non-whole piece has strand Seq_strand_both or
 *                    Seq_strand_both_rev (SEV_ERROR for CDS and mRNA
 *                    subtypes, SEV_WARNING for everything else)
 *   - WholeLocation  if any piece is a SEQLOC_WHOLE (always SEV_WARNING)
 *
 *   vsp  validator state used by ValidErr
 *   sfp  feature to check; NULL or a NULL location is ignored
 */
static void CheckForBothOrBothRev (ValidStructPtr vsp, SeqFeatPtr sfp)

{
  CharPtr    label = "Feature";
  SeqLocPtr  loc;
  SeqLocPtr  piece;
  Boolean    sawBoth = FALSE;
  Boolean    sawBothRev = FALSE;
  Boolean    sawWhole = FALSE;
  ErrSev     severity = SEV_WARNING;
  Uint1      strand;
  CharPtr    which = "";

  if (sfp == NULL) return;
  loc = sfp->location;
  if (loc == NULL) return;

  /* CDS and mRNA get their own label and the stronger severity */
  if (sfp->idx.subtype == FEATDEF_CDS) {
    label = "CDS";
    severity = SEV_ERROR;
  } else if (sfp->idx.subtype == FEATDEF_mRNA) {
    label = "mRNA";
    severity = SEV_ERROR;
  }

  for (piece = SeqLocFindNext (loc, NULL); piece != NULL; piece = SeqLocFindNext (loc, piece)) {
    if (piece->choice == SEQLOC_WHOLE) {
      /* strand is not examined for whole locations */
      sawWhole = TRUE;
      continue;
    }
    strand = SeqLocStrand (piece);
    if (strand == Seq_strand_both) {
      sawBoth = TRUE;
    } else if (strand == Seq_strand_both_rev) {
      sawBothRev = TRUE;
    }
  }

  if (sawBoth) {
    which = sawBothRev ? "(forward and reverse)" : "(forward)";
  } else if (sawBothRev) {
    which = "(reverse)";
  }

  //LCOV_EXCL_START
  //cannot test with regression because basic cleanup corrects problems
  if (sawBoth || sawBothRev) {
    ValidErr (vsp, severity, ERR_SEQ_FEAT_BothStrands, "%s may not be on both %s strands", label, which);
  }
  if (sawWhole) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_WholeLocation, "%s may not have whole location", label);
  }
  //LCOV_EXCL_STOP
}
25984 
OverlappingGeneIsPseudo(SeqFeatPtr sfp)25985 static Boolean OverlappingGeneIsPseudo (SeqFeatPtr sfp)
25986 {
25987   SeqFeatPtr      gene;
25988   GeneRefPtr      grp;
25989 
25990   if (sfp == NULL)
25991     return FALSE;
25992   grp = SeqMgrGetGeneXref (sfp);
25993   if (grp != NULL) {
25994     if (grp->pseudo)
25995       return TRUE;
25996     return FALSE;
25997   }
25998   gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
25999   if (gene != NULL) {
26000     if (gene->pseudo)
26001       return TRUE;
26002     grp = (GeneRefPtr) gene->data.value.ptrvalue;
26003     if (grp != NULL) {
26004       if (grp->pseudo)
26005         return TRUE;
26006     }
26007   }
26008   return FALSE;
26009 }
26010 
/*
 * CheckForIllegalDbxref
 *
 * Validates every Dbtag in a db_xref list.
 *
 *   vsp     validator state (RefSeq / gen-prod-set flags, ValidErr target)
 *   gcp     not used by this function
 *   dbxref  head of the ValNode list; each node's data.ptrvalue is a Dbtag.
 *           Nodes with a NULL Dbtag or NULL database name are skipped.
 *
 * For each entry DbxrefIsValid classifies the database name and one
 * IllegalDbXref warning is posted when the name is unknown, has the wrong
 * capitalization, is RefSeq-only outside a RefSeq / gen-prod-set record, or
 * is "taxon" flagged as source-only.  Independently of that, the database
 * name and the tag string are each checked with StringHasSgml.
 */
static void CheckForIllegalDbxref (ValidStructPtr vsp, GatherContextPtr gcp, ValNodePtr dbxref)

{
  Char         buf [32];
  DbtagPtr     db;
  CharPtr      good;
  Boolean      is_bc;
  Boolean      is_rf;
  Boolean      is_sc;
  ObjectIdPtr  oip;
  ValNodePtr   vnp;

  for (vnp = dbxref; vnp != NULL; vnp = vnp->next) {
    db = (DbtagPtr) vnp->data.ptrvalue;
    if (db == NULL || db->db == NULL) continue;

    /* start from known values so nothing stale carries over from the
       previous entry (DbxrefIsValid is expected to fill all four) */
    is_bc = FALSE;
    is_rf = FALSE;
    is_sc = FALSE;
    good = NULL;

    if (! DbxrefIsValid (db->db, &is_rf, &is_sc, &is_bc, &good)) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref, "Illegal db_xref type %s (%s)", db->db, ValGetDbtagStr (db, buf));
    } else if (is_bc) {
      /* known database, wrong capitalization */
      if (StringHasNoText (good)) {
        good = "?";
      }
      if (is_sc && StringICmp (db->db, "taxon") == 0) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref,
                  "Illegal db_xref type %s (%s), legal capitalization is %s, but should only be used on an OrgRef",
                  db->db, ValGetDbtagStr (db, buf), good);
      } else {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref,
                  "Illegal db_xref type %s (%s), legal capitalization is %s",
                  db->db, ValGetDbtagStr (db, buf), good);
      }
    } else if (is_rf) {
      /* RefSeq-only database: fine inside RefSeq or gen-prod-set records */
      if (! vsp->is_refseq_in_sep && ! vsp->is_gps_in_sep) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref,
                  "db_xref type %s (%s) is only legal for RefSeq", db->db, ValGetDbtagStr (db, buf));
      }
    } else if (is_sc && StringICmp (db->db, "taxon") == 0) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref,
                "db_xref type %s (%s) should only be used on an OrgRef", db->db, ValGetDbtagStr (db, buf));
    }

    if (StringDoesHaveText (db->db) && StringHasSgml (vsp, db->db)) {
      ValidErr (vsp, SEV_WARNING, ERR_GENERIC_SgmlPresentInText, "dbxref database %s has SGML", db->db);
    }

    oip = db->tag;
    if (oip != NULL && StringDoesHaveText (oip->str) && StringHasSgml (vsp, oip->str)) {
      ValidErr (vsp, SEV_WARNING, ERR_GENERIC_SgmlPresentInText, "dbxref value %s has SGML", oip->str);
    }

    /*
    dbxerr = NULL;
    valid = IsDbxrefValid (db->db, sfp, NULL,
                           GPSorRefSeq (vsp->sep, sfp->location),
                           &dbxerr);
    if (dbxerr != NULL) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref, dbxerr);
      dbxerr = MemFree (dbxerr);
    }
    */
  }
}
26084 
/*
 * Positional table of plastid names.  Only slots 2, 3, 6, 12, 16, 17 and 18
 * hold a name; every other slot is the empty string.
 * NOTE(review): the populated positions look like BioSource genome codes, so
 * the table is presumably indexed by biop->genome - confirm against the code
 * that reads it (not visible in this part of the file).
 */
static CharPtr plastidtxt [] = {
  "",
  "",
  "chloroplast",
  "chromoplast",
  "",
  "",
  "plastid",
  "",
  "",
  "",
  "",
  "",
  "cyanelle",
  "",
  "",
  "",
  "apicoplast",
  "leucoplast",
  "proplastid",
  "",
  ""
};
26108 
/*
 * Exception explanations accepted for any record.  ValidateExceptText splits
 * a feature's except_text on commas and compares each trimmed piece against
 * this list with StringICmp (case-insensitive).  The list is NULL-terminated.
 */
static CharPtr legal_exception_strings [] = {
  "RNA editing",
  "reasons given in citation",
  "rearrangement required for product",
  "ribosomal slippage",
  "trans-splicing",
  "alternative processing",
  "artificial frameshift",
  "nonconsensus splice site",
  "modified codon recognition",
  "alternative start codon",
  "dicistronic gene",
  "transcribed product replaced",
  "translated product replaced",
  "transcribed pseudogene",
  "annotated by transcript or proteomic data",
  "heterogeneous population sequenced",
  "low-quality sequence region",
  "unextendable partial coding region",
  "artificial location",
  "gene split at contig boundary",
  "gene split at sequence boundary",
  kAllowManualGenCodeException,
  NULL
};
26134 
/*
 * Additional exception explanations that ValidateExceptText accepts only
 * when GPSorRefSeq () is true for the feature.  Compared case-insensitively
 * (StringICmp); the list is NULL-terminated.
 */
static CharPtr refseq_exception_strings [] = {
  "unclassified transcription discrepancy",
  "unclassified translation discrepancy",
  "mismatches in transcription",
  "mismatches in translation",
  "adjusted for low-quality genome",
  "translation initiation by tRNA-Leu at CUG codon",
  "16S ribosomal RNA and 23S ribosomal RNA overlap",
  "16S ribosomal RNA and 5S ribosomal RNA overlap",
  "23S ribosomal RNA and 16S ribosomal RNA overlap",
  "23S ribosomal RNA and 5S ribosomal RNA overlap",
  "5S ribosomal RNA and 16S ribosomal RNA overlap",
  "5S ribosomal RNA and 23S ribosomal RNA overlap",
  NULL
};
26150 
/*
 * ValidateExceptText
 *
 * Checks the comma-separated explanations in sfp->except_text.
 *
 *   vsp  validator state (record-type flags, ValidErr target)
 *   gcp  not used by this function
 *   sfp  feature whose except_text is examined; a NULL except_text
 *        (StringSave returns NULL) ends the check immediately
 *
 * Each trimmed explanation must appear in legal_exception_strings, or in
 * refseq_exception_strings when GPSorRefSeq () is true; otherwise
 * ExceptionProblem is posted (SEV_ERROR, lowered to SEV_WARNING from the
 * first time IsNCorNT () is true).  Further ExceptionProblem reports:
 *   - explanation text repeated in the feature comment
 *   - a RefSeq-only explanation where the whole except_text is not exactly
 *     one RefSeq-only string (suppressed when vsp->is_gpipe_in_sep)
 *   - "reasons given in citation" without a citation
 *   - "annotated by transcript or proteomic data" without an inference qual
 *   - "artificial location" when vsp->is_embl_ddbj_in_sep is not set
 */
static void ValidateExceptText (ValidStructPtr vsp, GatherContextPtr gcp, SeqFeatPtr sfp)

{
  Boolean    commentMayRepeat;
  CharPtr    copy;
  Boolean    echoedInComment = FALSE;
  Boolean    hasInference = FALSE;
  Int2       idx;
  Boolean    matched;
  GBQualPtr  qual;
  CharPtr    rest;
  Boolean    sawArtificialLoc = FALSE;
  Boolean    sawReasonsGiven = FALSE;
  Boolean    sawRefSeqOnly = FALSE;
  Boolean    sawTranscriptData = FALSE;
  ErrSev     severity = SEV_ERROR;
  CharPtr    token;

  /* work on a private copy because the commas are overwritten with NULs */
  copy = StringSave (sfp->except_text);
  if (copy == NULL) return;

  for (token = copy; ! StringHasNoText (token); token = rest) {
    rest = StringChr (token, ',');
    if (rest != NULL) {
      *rest = '\0';
      rest++;
    }
    TrimSpacesAroundString (token);

    /* universally legal explanations */
    matched = FALSE;
    for (idx = 0; legal_exception_strings [idx] != NULL; idx++) {
      if (StringICmp (token, legal_exception_strings [idx]) != 0) continue;
      matched = TRUE;
      break;
    }
    if (matched) {
      /* a few explanations carry extra requirements checked after the loop */
      if (StringICmp (token, "reasons given in citation") == 0) {
        sawReasonsGiven = TRUE;
      } else if (StringICmp (token, "annotated by transcript or proteomic data") == 0) {
        sawTranscriptData = TRUE;
      } else if (StringICmp (token, "artificial location") == 0) {
        sawArtificialLoc = TRUE;
      }
    } else {
      /* RefSeq-only explanations */
      if (GPSorRefSeq (vsp->sep, sfp->location)) {
        for (idx = 0; refseq_exception_strings [idx] != NULL; idx++) {
          if (StringICmp (token, refseq_exception_strings [idx]) != 0) continue;
          matched = TRUE;
          sawRefSeqOnly = TRUE;
          break;
        }
      }
      if (! matched) {
        /* once lowered, the severity stays lowered for later tokens */
        if (IsNCorNT (vsp->sep, sfp->location)) {
          severity = SEV_WARNING;
        }
        ValidErr (vsp, severity, ERR_SEQ_FEAT_ExceptionProblem, "%s is not a legal exception explanation", tmp_is_token_placeholder);
      }
    }

    if (sfp->comment != NULL && StringISearch (sfp->comment, token) != NULL) {
      /* these four may appear inside a longer comment, but the comment
         must not consist of the explanation alone */
      commentMayRepeat = (Boolean) (StringICmp (token, "ribosomal slippage") == 0 ||
                                    StringICmp (token, "trans-splicing") == 0 ||
                                    StringICmp (token, "RNA editing") == 0 ||
                                    StringICmp (token, "artificial location") == 0);
      if (! commentMayRepeat || StringICmp (sfp->comment, token) == 0) {
        echoedInComment = TRUE;
      }
    }
  }
  MemFree (copy);

  if (echoedInComment) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ExceptionProblem, "Exception explanation text is also found in feature comment");
  }

  if (sawRefSeqOnly) {
    /* acceptable only if the complete, unsplit text is one RefSeq-only string */
    matched = FALSE;
    for (idx = 0; refseq_exception_strings [idx] != NULL && ! matched; idx++) {
      if (StringICmp (sfp->except_text, refseq_exception_strings [idx]) == 0) {
        matched = TRUE;
      }
    }
    if (! matched && ! vsp->is_gpipe_in_sep) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ExceptionProblem, "Genome processing exception should not be combined with other explanations");
    }
  }

  if (sawReasonsGiven && sfp->cit == NULL) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ExceptionProblem, "Reasons given in citation exception does not have the required citation");
  }

  if (sawTranscriptData) {
    for (qual = sfp->qual; qual != NULL && ! hasInference; qual = qual->next) {
      if (StringICmp (qual->qual, "inference") == 0) {
        hasInference = TRUE;
      }
    }
    if (! hasInference) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ExceptionProblem,
                "Annotated by transcript or proteomic data exception does not have the required inference qualifier");
    }
  }

  if (sawArtificialLoc && ! vsp->is_embl_ddbj_in_sep) {
    ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_ExceptionProblem,
              "Artificial location should only be used directly by EMBL or DDBJ records");
  }
}
26261 
/*
 * Search state shared between SameAsCDS / SameAsMRNA and their
 * FindSameCDS / FindSameMRNA callbacks (passed through context->userdata).
 */
typedef struct samecds {
  Boolean               found;          /* set TRUE by a callback when a matching feature is seen */
  SeqMgrFeatContextPtr  gcontext;       /* context of the feature being tested, filled by SeqMgrGetDesiredFeature */
  Uint2                 slpTag;         /* caller-supplied tag; callbacks compare it with SLP_NOSTART / SLP_NOSTOP */
  Uint1                 subtype;        /* idx.subtype of the feature being tested */
  Boolean               bypassGeneTest; /* set TRUE by FindSameCDS; handed back through SameAsCDS's bypassGeneTestP */
} SameCds, PNTR SameCdsPtr;
26269 
FindSameCDS(SeqFeatPtr sfp,SeqMgrFeatContextPtr ccontext)26270 static Boolean LIBCALLBACK FindSameCDS (SeqFeatPtr sfp, SeqMgrFeatContextPtr ccontext)
26271 
26272 {
26273   SeqMgrFeatContextPtr  gcontext;
26274   Int2                  i;
26275   SameCdsPtr            same;
26276 
26277   if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return TRUE;
26278   same = (SameCdsPtr) ccontext->userdata;
26279   gcontext = same->gcontext;
26280   if (gcontext == NULL || gcontext->sfp == NULL ||
26281       gcontext->ivals == NULL || ccontext->ivals == NULL) return TRUE;
26282   if (gcontext->strand == ccontext->strand ||
26283       (ccontext->strand == Seq_strand_unknown && gcontext->strand != Seq_strand_minus) ||
26284       (gcontext->strand == Seq_strand_unknown && ccontext->strand != Seq_strand_minus) ||
26285       ccontext->strand == Seq_strand_both) {
26286     /* test for strands from SeqMgrGetBestOverlappingFeat, keep going if okay */
26287   } else {
26288     return TRUE;
26289   }
26290   if (same->subtype == FEATDEF_GENE) {
26291     if (gcontext->left == ccontext->left &&
26292         gcontext->right == ccontext->right) {
26293       same->found = TRUE;
26294       return FALSE;
26295     }
26296   } else if (same->subtype == FEATDEF_mRNA) {
26297     if (gcontext->left == ccontext->left &&
26298         gcontext->right == ccontext->right &&
26299         gcontext->numivals == ccontext->numivals) {
26300       if (SeqLocAinB (sfp->location, gcontext->sfp->location) >= 0) {
26301         if (gcontext->numivals == 1) {
26302           same->found = TRUE;
26303           return FALSE;
26304         } else {
26305           for (i = 0; i < gcontext->numivals; i++) {
26306             if (gcontext->ivals [2 * i] != ccontext->ivals [2 * i]) return TRUE;
26307             if (gcontext->ivals [2 * i + 1] != ccontext->ivals [2 * i + 1]) return TRUE;
26308           }
26309           same->found = TRUE;
26310           return FALSE;
26311         }
26312       }
26313     } else if (SeqLocAinB (sfp->location, gcontext->sfp->location) > 0) {
26314 
26315       if (ccontext->strand == Seq_strand_minus || gcontext->strand == Seq_strand_minus) {
26316         if (same->slpTag == SLP_NOSTART && gcontext->partialL) {
26317           if (gcontext->right == ccontext->right) {
26318             same->found = TRUE;
26319             return FALSE;
26320           }
26321           if (gcontext->right > ccontext->right) {
26322             same->bypassGeneTest = TRUE;
26323             return FALSE;
26324           }
26325         } else if (same->slpTag == SLP_NOSTOP && gcontext->partialR) {
26326           if (gcontext->left == ccontext->left) {
26327             same->found = TRUE;
26328             return FALSE;
26329           }
26330           if (gcontext->left < ccontext->left) {
26331             same->bypassGeneTest = TRUE;
26332             return FALSE;
26333           }
26334         }
26335 
26336       } else {
26337 
26338         if (same->slpTag == SLP_NOSTART && gcontext->partialL) {
26339           if (gcontext->left == ccontext->left) {
26340             same->found = TRUE;
26341             return FALSE;
26342           }
26343           if (gcontext->left < ccontext->left) {
26344             same->bypassGeneTest = TRUE;
26345             return FALSE;
26346           }
26347         } else if (same->slpTag == SLP_NOSTOP && gcontext->partialR) {
26348           if (gcontext->right == ccontext->right) {
26349             same->found = TRUE;
26350             return FALSE;
26351           }
26352           if (gcontext->right > ccontext->right) {
26353             same->bypassGeneTest = TRUE;
26354             return FALSE;
26355           }
26356         }
26357       }
26358     }
26359   }
26360   return TRUE;
26361 }
26362 
SameAsCDS(SeqFeatPtr sfp,Uint2 slpTag,BoolPtr bypassGeneTestP)26363 static Boolean SameAsCDS (SeqFeatPtr sfp, Uint2 slpTag, BoolPtr bypassGeneTestP)
26364 
26365 {
26366   BioseqPtr          bsp;
26367   SeqMgrFeatContext  ccontext;
26368   SeqFeatPtr         cds;
26369   Boolean            cdsFilt [SEQFEAT_MAX];
26370   SeqMgrFeatContext  gcontext;
26371   SameCds            same;
26372   VvmDataPtr         vdp;
26373   SeqFeatXrefPtr     xref;
26374 
26375   bsp = BioseqFindFromSeqLoc (sfp->location);
26376   if (bsp == NULL) return FALSE;
26377   if (SeqMgrGetDesiredFeature (0, bsp, 0, 0, sfp, &gcontext) != sfp) return FALSE;
26378   same.found = FALSE;
26379   same.gcontext = &gcontext;
26380   same.slpTag = slpTag;
26381   same.subtype = sfp->idx.subtype;
26382   same.bypassGeneTest = FALSE;
26383 
26384   vdp = (VvmDataPtr) sfp->idx.scratch;
26385   if (vdp != NULL && vdp->nearbycds != NULL) {
26386     cds = SeqMgrGetDesiredFeature (0, bsp, 0, 0, vdp->nearbycds, &ccontext);
26387     if (cds != NULL && cds->idx.subtype == FEATDEF_CDS && cds == vdp->nearbycds) {
26388       ccontext.userdata = (Pointer) &same;
26389       FindSameCDS (cds, &ccontext);
26390       if (same.found) {
26391         if (bypassGeneTestP != NULL) {
26392           *bypassGeneTestP = same.bypassGeneTest;
26393         }
26394         return same.found;
26395       }
26396       same.bypassGeneTest = FALSE;
26397     }
26398   }
26399 
26400   for (xref = sfp->xref; xref != NULL; xref = xref->next) {
26401     if (xref->id.choice == 0) continue;
26402     cds = SeqMgrGetFeatureByFeatID (sfp->idx.entityID, NULL, NULL, xref, &ccontext);
26403     if (cds == NULL || cds->idx.subtype != FEATDEF_CDS) continue;
26404     ccontext.userdata = (Pointer) &same;
26405     FindSameCDS (cds, &ccontext);
26406     if (same.found) {
26407       if (bypassGeneTestP != NULL) {
26408         *bypassGeneTestP = same.bypassGeneTest;
26409       }
26410       return same.found;
26411     }
26412     same.bypassGeneTest = FALSE;
26413   }
26414 
26415   MemSet ((Pointer) &cdsFilt, 0, sizeof (cdsFilt));
26416   cdsFilt [SEQFEAT_CDREGION] = TRUE;
26417   SeqMgrExploreFeatures (bsp, (Pointer) &same, FindSameCDS, sfp->location, cdsFilt, NULL);
26418   if (bypassGeneTestP != NULL) {
26419     *bypassGeneTestP = same.bypassGeneTest;
26420   }
26421   return same.found;
26422 }
26423 
FindSameMRNA(SeqFeatPtr sfp,SeqMgrFeatContextPtr ccontext)26424 static Boolean LIBCALLBACK FindSameMRNA (SeqFeatPtr sfp, SeqMgrFeatContextPtr ccontext)
26425 
26426 {
26427   SeqMgrFeatContextPtr  gcontext;
26428   Int2                  i;
26429   SameCdsPtr            same;
26430 
26431   if (sfp == NULL || sfp->idx.subtype != FEATDEF_mRNA) return TRUE;
26432   same = (SameCdsPtr) ccontext->userdata;
26433   gcontext = same->gcontext;
26434   if (gcontext == NULL || gcontext->sfp == NULL ||
26435       gcontext->ivals == NULL || ccontext->ivals == NULL) return TRUE;
26436   if (gcontext->strand == ccontext->strand ||
26437       (ccontext->strand == Seq_strand_unknown && gcontext->strand != Seq_strand_minus) ||
26438       (gcontext->strand == Seq_strand_unknown && ccontext->strand != Seq_strand_minus) ||
26439       ccontext->strand == Seq_strand_both) {
26440     /* test for strands from SeqMgrGetBestOverlappingFeat, keep going if okay */
26441   } else {
26442     return TRUE;
26443   }
26444   if (same->subtype == FEATDEF_GENE) {
26445     if (gcontext->left == ccontext->left &&
26446         gcontext->right == ccontext->right) {
26447       same->found = TRUE;
26448       return FALSE;
26449     }
26450   } else if (same->subtype == FEATDEF_CDS) {
26451     if (gcontext->left == ccontext->left &&
26452         gcontext->right == ccontext->right &&
26453         gcontext->numivals == ccontext->numivals) {
26454       if (SeqLocAinB (gcontext->sfp->location, sfp->location) >= 0) {
26455         if (gcontext->numivals == 1) {
26456           same->found = TRUE;
26457           return FALSE;
26458         } else {
26459           for (i = 0; i < gcontext->numivals; i++) {
26460             if (gcontext->ivals [2 * i] != ccontext->ivals [2 * i]) return TRUE;
26461             if (gcontext->ivals [2 * i + 1] != ccontext->ivals [2 * i + 1]) return TRUE;
26462           }
26463           same->found = TRUE;
26464           return FALSE;
26465         }
26466       }
26467     }
26468   } else if (same->subtype == FEATDEF_exon) {
26469     if (ccontext->strand == Seq_strand_minus || gcontext->strand == Seq_strand_minus) {
26470       if (same->slpTag == SLP_NOSTART && ccontext->partialL) {
26471         if (gcontext->right == ccontext->right) {
26472           same->found = TRUE;
26473           return FALSE;
26474         }
26475       } else if (same->slpTag == SLP_NOSTOP && ccontext->partialR) {
26476         if (gcontext->left == ccontext->left) {
26477           same->found = TRUE;
26478           return FALSE;
26479         }
26480       }
26481 
26482     } else {
26483 
26484       if (same->slpTag == SLP_NOSTART && ccontext->partialL) {
26485         if (gcontext->left == ccontext->left) {
26486           same->found = TRUE;
26487           return FALSE;
26488         }
26489       } else if (same->slpTag == SLP_NOSTOP && ccontext->partialR) {
26490         if (gcontext->right == ccontext->right) {
26491           same->found = TRUE;
26492           return FALSE;
26493         }
26494       }
26495     }
26496   }
26497   return TRUE;
26498 }
26499 
SameAsMRNA(SeqFeatPtr sfp,Uint2 slpTag)26500 static Boolean SameAsMRNA (SeqFeatPtr sfp, Uint2 slpTag)
26501 
26502 {
26503   BioseqPtr          bsp;
26504   SeqMgrFeatContext  mcontext;
26505   SeqFeatPtr         mrna;
26506   Boolean            mrnaFilt [FEATDEF_MAX];
26507   SeqMgrFeatContext  gcontext;
26508   SameCds            same;
26509   VvmDataPtr         vdp;
26510   SeqFeatXrefPtr     xref;
26511 
26512   bsp = BioseqFindFromSeqLoc (sfp->location);
26513   if (bsp == NULL) return FALSE;
26514   if (SeqMgrGetDesiredFeature (0, bsp, 0, 0, sfp, &gcontext) != sfp) return FALSE;
26515   same.found = FALSE;
26516   same.gcontext = &gcontext;
26517   same.slpTag = slpTag;
26518   same.subtype = sfp->idx.subtype;
26519 
26520   vdp = (VvmDataPtr) sfp->idx.scratch;
26521   if (vdp != NULL && vdp->nearbymrna != NULL) {
26522     mrna = SeqMgrGetDesiredFeature (0, bsp, 0, 0, vdp->nearbymrna, &mcontext);
26523     if (mrna != NULL && mrna->idx.subtype == FEATDEF_mRNA && mrna == vdp->nearbymrna) {
26524       mcontext.userdata = (Pointer) &same;
26525       FindSameMRNA (mrna, &mcontext);
26526       if (same.found) {
26527         return same.found;
26528       }
26529     }
26530   }
26531 
26532   for (xref = sfp->xref; xref != NULL; xref = xref->next) {
26533     if (xref->id.choice == 0) continue;
26534     mrna = SeqMgrGetFeatureByFeatID (sfp->idx.entityID, NULL, NULL, xref, &mcontext);
26535     if (mrna == NULL || mrna->idx.subtype != FEATDEF_mRNA) continue;
26536     mcontext.userdata = (Pointer) &same;
26537     FindSameMRNA (mrna, &mcontext);
26538     if (same.found) {
26539       return same.found;
26540     }
26541   }
26542 
26543   MemSet ((Pointer) &mrnaFilt, 0, sizeof (mrnaFilt));
26544   mrnaFilt [FEATDEF_mRNA] = TRUE;
26545   SeqMgrExploreFeatures (bsp, (Pointer) &same, FindSameMRNA, sfp->location, NULL, mrnaFilt);
26546   return same.found;
26547 }
26548 
AdjacentToIntron(SeqFeatPtr trna)26549 static Boolean AdjacentToIntron (SeqFeatPtr trna)
26550 
26551 {
26552   BioseqPtr          bsp;
26553   SeqMgrFeatContext  fcontext;
26554   SeqMgrFeatContext  rcontext;
26555   SeqFeatPtr         sfp;
26556 
26557   if (trna == NULL) return FALSE;
26558 
26559   bsp = BioseqFindFromSeqLoc (trna->location);
26560   if (bsp == NULL) return FALSE;
26561 
26562   if (SeqMgrGetDesiredFeature (0, bsp, 0, 0, trna, &rcontext) != trna) return FALSE;
26563 
26564   if (rcontext.strand == Seq_strand_minus) {
26565     if (rcontext.partialL) {
26566       sfp = SeqMgrGetDesiredFeature (0, bsp, 0, rcontext.index + 1, NULL, &fcontext);
26567       if (sfp == NULL) return FALSE;
26568       if (sfp->idx.subtype != FEATDEF_intron) return FALSE;
26569       if (fcontext.strand != Seq_strand_minus) return FALSE;
26570       if (fcontext.left != rcontext.right + 1) return FALSE;
26571     }
26572     if (rcontext.partialR) {
26573       sfp = SeqMgrGetDesiredFeature (0, bsp, 0, rcontext.index - 1, NULL, &fcontext);
26574       if (sfp == NULL) return FALSE;
26575       if (sfp->idx.subtype != FEATDEF_intron) return FALSE;
26576       if (fcontext.strand != Seq_strand_minus) return FALSE;
26577       if (fcontext.right != rcontext.left - 1) return FALSE;
26578     }
26579   } else {
26580     if (rcontext.partialL) {
26581       sfp = SeqMgrGetDesiredFeature (0, bsp, 0, rcontext.index - 1, NULL, &fcontext);
26582       if (sfp == NULL) return FALSE;
26583       if (sfp->idx.subtype != FEATDEF_intron) return FALSE;
26584       if (fcontext.strand == Seq_strand_minus) return FALSE;
26585       if (fcontext.right != rcontext.left - 1) return FALSE;
26586     }
26587     if (rcontext.partialR) {
26588       sfp = SeqMgrGetDesiredFeature (0, bsp, 0, rcontext.index + 1, NULL, &fcontext);
26589       if (sfp == NULL) return FALSE;
26590       if (sfp->idx.subtype != FEATDEF_intron) return FALSE;
26591       if (fcontext.strand == Seq_strand_minus) return FALSE;
26592       if (fcontext.left != rcontext.right + 1) return FALSE;
26593     }
26594   }
26595 
26596   return TRUE;
26597 }
26598 
TestSameGene(SeqMgrFeatContextPtr ccontext,SeqMgrFeatContextPtr gcontext)26599 static Boolean TestSameGene (SeqMgrFeatContextPtr ccontext, SeqMgrFeatContextPtr gcontext)
26600 
26601 {
26602   if (ccontext == NULL || ccontext->sfp == NULL ||
26603       gcontext == NULL || gcontext->sfp == NULL ||
26604       gcontext->ivals == NULL || ccontext->ivals == NULL) return FALSE;
26605   if (gcontext->strand == ccontext->strand ||
26606       (ccontext->strand == Seq_strand_unknown && gcontext->strand != Seq_strand_minus) ||
26607       (gcontext->strand == Seq_strand_unknown && ccontext->strand != Seq_strand_minus) ||
26608       ccontext->strand == Seq_strand_both) {
26609     /* test for strands from SeqMgrGetBestOverlappingFeat, keep going if okay */
26610   } else {
26611     return FALSE;
26612   }
26613   if (gcontext->left == ccontext->left &&
26614       gcontext->right == ccontext->right) {
26615     return TRUE;
26616   }
26617   return FALSE;
26618 }
26619 
SameAsGene(SeqFeatPtr sfp)26620 static Boolean SameAsGene (SeqFeatPtr sfp)
26621 
26622 {
26623   BioseqPtr          bsp;
26624   SeqMgrFeatContext  ccontext;
26625   SeqFeatPtr         gene;
26626   SeqMgrFeatContext  gcontext;
26627   GeneRefPtr         grp;
26628 
26629   grp = SeqMgrGetGeneXref (sfp);
26630   if (grp != NULL) return FALSE;
26631   gene = SeqMgrGetOverlappingGene (sfp->location, &gcontext);
26632   if (gene == NULL) return FALSE;
26633   bsp = BioseqFindFromSeqLoc (sfp->location);
26634   if (bsp == NULL) return FALSE;
26635   if (SeqMgrGetDesiredFeature (0, bsp, 0, 0, sfp, &ccontext) != sfp) return FALSE;
26636   return TestSameGene (&gcontext, &ccontext);
26637 }
26638 
SplicingNotExpected(SeqFeatPtr sfp)26639 static Boolean SplicingNotExpected (SeqFeatPtr sfp)
26640 
26641 {
26642   BioSourcePtr       biop;
26643   BioseqPtr          bsp;
26644   SeqMgrDescContext  dcontext;
26645   OrgNamePtr         onp;
26646   OrgRefPtr          orp;
26647   SeqDescrPtr        sdp;
26648 
26649   bsp = BioseqFindFromSeqLoc (sfp->location);
26650   if (bsp == NULL) return FALSE;
26651 
26652   sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
26653   if (sdp == NULL) return FALSE;
26654   biop = (BioSourcePtr) sdp->data.ptrvalue;
26655   if (biop == NULL) return FALSE;
26656   orp = biop->org;
26657   if (orp == NULL) return FALSE;
26658   onp = orp->orgname;
26659   if (onp == NULL) return FALSE;
26660 
26661   if (StringCmp (onp->div, "BCT") == 0) return TRUE;
26662   if (StringCmp (onp->div, "VRL") == 0) return TRUE;
26663   if (StringNICmp (onp->lineage, "Bacteria; ", 10) == 0) return TRUE;
26664   if (StringNICmp (onp->lineage, "Archaea; ", 9) == 0) return TRUE;
26665 
26666   return FALSE;
26667 }
26668 
FeatureOnOrganelle(SeqFeatPtr sfp)26669 static Boolean FeatureOnOrganelle (SeqFeatPtr sfp)
26670 
26671 {
26672   BioSourcePtr       biop;
26673   BioseqPtr          bsp;
26674   SeqMgrDescContext  dcontext;
26675   SeqDescrPtr        sdp;
26676 
26677   bsp = BioseqFindFromSeqLoc (sfp->location);
26678   if (bsp == NULL) return FALSE;
26679 
26680   sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
26681   if (sdp == NULL) return FALSE;
26682   biop = (BioSourcePtr) sdp->data.ptrvalue;
26683   if (biop == NULL) return FALSE;
26684 
26685   return IsLocationOrganelle (biop->genome);;
26686 }
26687 
RareConsensusNotExpected(SeqFeatPtr sfp)26688 static Boolean RareConsensusNotExpected (SeqFeatPtr sfp)
26689 
26690 {
26691   BioSourcePtr       biop;
26692   BioseqPtr          bsp;
26693   SeqMgrDescContext  dcontext;
26694   OrgNamePtr         onp;
26695   OrgRefPtr          orp;
26696   SeqDescrPtr        sdp;
26697 
26698   bsp = BioseqFindFromSeqLoc (sfp->location);
26699   if (bsp == NULL) return FALSE;
26700 
26701   sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
26702   if (sdp == NULL) return FALSE;
26703   biop = (BioSourcePtr) sdp->data.ptrvalue;
26704   if (biop == NULL) return FALSE;
26705   orp = biop->org;
26706   if (orp == NULL) return FALSE;
26707   onp = orp->orgname;
26708   if (onp == NULL) return FALSE;
26709 
26710   if (StringCmp (onp->div, "PLN") != 0) return TRUE;
26711   if (StringNICmp (onp->lineage, "Eukaryota; Viridiplantae; ", 26) != 0) return TRUE;
26712 
26713   return FALSE;
26714 }
26715 
HasUnderscore(CharPtr str)26716 static Boolean HasUnderscore (CharPtr str)
26717 {
26718   if (StringChr(str, '_') != NULL)
26719     return TRUE;
26720   else
26721     return FALSE;
26722 }
26723 
IsUpperCaseChar(Char ch)26724 static Boolean IsUpperCaseChar (Char ch)
26725 {
26726   if (StringChr("ABCDEFGHIJKLMNOPQRSTUVWXYZ",ch) != NULL)
26727     return TRUE;
26728   else
26729     return FALSE;
26730 }
26731 
26732 /*
26733 static Boolean IsNumericChar (Char ch)
26734 {
26735   if (StringChr("0123456789",ch) != NULL)
26736     return TRUE;
26737   else
26738     return FALSE;
26739 }
26740 */
26741 
26742 //LCOV_EXCL_START
26743 // Not used for validation
IsNuclAcc(CharPtr name)26744 NLM_EXTERN Boolean IsNuclAcc (CharPtr name)
26745 
26746 {
26747   if (!IsUpperCaseChar (name[0]))
26748     return FALSE;
26749 
26750   if (!HasUnderscore (name))
26751     return FALSE;
26752 
26753   return TRUE;
26754 }
26755 //LCOV_EXCL_STOP
26756 
IsCddFeat(SeqFeatPtr sfp)26757 static Boolean IsCddFeat (
26758   SeqFeatPtr sfp
26759 )
26760 
26761 {
26762   DbtagPtr    dbt;
26763   ValNodePtr  vnp;
26764 
26765   if (sfp == NULL || sfp->data.choice != SEQFEAT_REGION) return FALSE;
26766 
26767   for (vnp = sfp->dbxref; vnp != NULL; vnp = vnp->next) {
26768     dbt = (DbtagPtr) vnp->data.ptrvalue;
26769     if (dbt == NULL) continue;
26770     if (StringCmp (dbt->db, "CDD") == 0 || StringCmp (dbt->db, "cdd") == 0) return TRUE;
26771   }
26772 
26773   return FALSE;
26774 }
26775 
26776 /* derived from ValidateSeqLoc */
ValidateAnticodon(ValidStructPtr vsp,SeqLocPtr slp)26777 static void ValidateAnticodon (ValidStructPtr vsp, SeqLocPtr slp)
26778 {
26779   SeqLocPtr       tmp;
26780   Boolean         retval = TRUE, tmpval, mixed_strand = FALSE, unmarked_strand = FALSE,
26781                   ordered = TRUE, adjacent = FALSE, circular = FALSE, exception = FALSE;
26782   CharPtr         ctmp;
26783   Uint1           strand1 = Seq_strand_other, strand2 = Seq_strand_other;
26784   Int4            from1 = -1, from2 = -1, to1 = -1, to2 = -1;
26785   SeqIntPtr       sip;
26786   SeqPntPtr       spp;
26787   SeqIdPtr        id1 = NULL, id2 = NULL;
26788   BioseqPtr       bsp;
26789   SeqFeatPtr      sfp = NULL;
26790 
26791   if (slp == NULL)
26792     return;
26793 
26794   sfp = vsp->sfp;
26795 
26796   bsp = BioseqFindFromSeqLoc (slp);
26797   if (bsp != NULL && bsp->topology == 2) {
26798     circular = TRUE;
26799   }
26800 
26801   tmp = NULL;
26802 
26803   for (tmp = SeqLocFindNext (slp, NULL); tmp != NULL; tmp = SeqLocFindNext (slp, tmp)) {
26804     tmpval = TRUE;
26805     switch (tmp->choice) {
26806     case SEQLOC_INT:
26807       sip = (SeqIntPtr) (tmp->data.ptrvalue);
26808       if (sip == NULL) continue;
26809       strand2 = sip->strand;
26810       id2 = sip->id;
26811       from2 = sip->from;
26812       to2 = sip->to;
26813       tmpval = SeqIntCheck (sip);
26814       break;
26815     case SEQLOC_PNT:
26816       spp = (SeqPntPtr) (tmp->data.ptrvalue);
26817       if (spp == NULL) continue;
26818       strand2 = spp->strand;
26819       id2 = spp->id;
26820       from2 = spp->point;
26821       to2 = spp->point;
26822       tmpval = SeqPntCheck (spp);
26823       break;
26824     case SEQLOC_NULL:
26825       continue;
26826     default:
26827       break;
26828     }
26829 
26830     if (id1 != NULL && id2 != NULL) {
26831       if (SeqIdForSameBioseq (id1, id2)) {
26832         if ((ordered) /* && (! circular) */) {
26833           if (strand2 == Seq_strand_minus) {
26834             if (to1 < to2)
26835               ordered = FALSE;
26836             if (to2 + 1 == from1)
26837               adjacent = TRUE;
26838           } else {
26839             if (to1 > to2)
26840               ordered = FALSE;
26841             if (to1 + 1 == from2)
26842               adjacent = TRUE;
26843           }
26844         }
26845         if (strand1 == strand2 && from1 == from2 && to1 == to2) {
26846           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_DuplicateInterval, "Duplicate anticodon exons in location");
26847         }
26848       }
26849     }
26850 
26851     if (!tmpval) {
26852       retval = FALSE;
26853       ctmp = SeqLocPrint (tmp);
26854       if (ctmp != NULL && StringLen (ctmp) > 800) {
26855         StringCpy (ctmp + 797, "...");
26856       }
26857       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_Range, "Anticodon location [%s] out of range", ctmp);
26858       MemFree (ctmp);
26859     }
26860 
26861     if ((strand1 != Seq_strand_other) && (strand2 != Seq_strand_other)) {
26862       if (SeqIdForSameBioseq (id1, id2)) {
26863         if (strand1 != strand2) {
26864           if (strand1 == Seq_strand_plus && strand2 == Seq_strand_unknown) {
26865             unmarked_strand = TRUE;
26866           } else if (strand1 == Seq_strand_unknown && strand2 == Seq_strand_plus) {
26867             unmarked_strand = TRUE;
26868           } else {
26869             mixed_strand = TRUE;
26870           }
26871         }
26872       }
26873     }
26874 
26875     from1 = from2;
26876     to1 = to2;
26877     id1 = id2;
26878     strand1 = strand2;
26879   }
26880 
26881   if (sfp != NULL && sfp->excpt) {
26882     exception = TRUE;
26883   }
26884 
26885   if (adjacent) {
26886     ctmp = SeqLocPrint (slp);
26887     /*
26888     if (exception) {
26889       sev = SEV_WARNING;
26890     } else {
26891       sev = SEV_ERROR;
26892     }
26893     */
26894     if (ctmp != NULL && StringLen (ctmp) > 800) {
26895       StringCpy (ctmp + 797, "...");
26896     }
26897     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_AbuttingIntervals, "Adjacent intervals in Anticodon [%s]", ctmp);
26898     MemFree (ctmp);
26899   }
26900 
26901   if (sfp != NULL) {
26902     strand1 = SeqLocStrand (sfp->location);
26903     strand2 = SeqLocStrand (slp);
26904     if (strand1 == Seq_strand_minus && strand2 != Seq_strand_minus) {
26905       ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_BadAnticodonStrand, "Anticodon should be on minus strand");
26906     } else if (strand1 != Seq_strand_minus && strand2 == Seq_strand_minus) {
26907       ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_BadAnticodonStrand, "Anticodon should be on plus strand");
26908     }
26909   }
26910 
26911   if (exception) {
26912     /* trans splicing exception turns off both mixed_strand and out_of_order messages */
26913     if (StringISearch (sfp->except_text, "trans-splicing") != NULL) {
26914       return;
26915     }
26916   }
26917 
26918   if (mixed_strand || unmarked_strand || (!ordered)) {
26919     ctmp = SeqLocPrint (slp);
26920     if (ctmp != NULL && StringLen (ctmp) > 800) {
26921       StringCpy (ctmp + 797, "...");
26922     }
26923     if (mixed_strand) {
26924       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_MixedStrand, "Mixed strands in Anticodon [%s]", ctmp);
26925     } else if (unmarked_strand) {
26926       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_MixedStrand, "Mixed plus and unknown strands in Anticodon [%s]", ctmp);
26927     }
26928     if (!ordered)
26929       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_SeqLocOrder, "Intervals out of order in Anticodon [%s]", ctmp);
26930     MemFree (ctmp);
26931     return;
26932   }
26933 
26934   /* newer check for intervals out of order on segmented bioseq */
26935 
26936   if (bsp == NULL || bsp->repr != Seq_repr_seg) return;
26937 //LCOV_EXCL_START
26938 // Only for SegSets
26939   if (SeqLocBadSortOrder (bsp, slp)) {
26940     ctmp = SeqLocPrint (slp);
26941     if (ctmp != NULL && StringLen (ctmp) > 800) {
26942       StringCpy (ctmp + 797, "...");
26943     }
26944     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_SeqLocOrder, "Intervals out of order in Anticodon [%s]", ctmp);
26945     MemFree (ctmp);
26946   }
26947 
26948   /* newer check for mixed strand on segmented bioseq */
26949 
26950   if (SeqLocMixedStrands (bsp, slp)) {
26951     ctmp = SeqLocPrint (slp);
26952     if (ctmp != NULL && StringLen (ctmp) > 800) {
26953       StringCpy (ctmp + 797, "...");
26954     }
26955     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_MixedStrand, "Mixed strands in Anticodon [%s]", ctmp);
26956     MemFree (ctmp);
26957   }
26958 //LCOV_EXCL_STOP
26959 }
26960 
JustQuotes(CharPtr str)26961 static Boolean JustQuotes (CharPtr str)
26962 
26963 {
26964   Char  ch;
26965 
26966   if (str == NULL) return FALSE;
26967 
26968   ch = *str;
26969   while (ch != '\0') {
26970     if (ch != '"' && ch != ' ') return FALSE;
26971     str++;
26972     ch = *str;
26973   }
26974 
26975   return TRUE;
26976 }
26977 
/*
 * Accumulator passed (through the feature context's userdata) to
 * DummySMFEProc, which updates it for every gene feature it visits.
 */
typedef struct dummysmfedata {
  Int4        max;                /* shortest gene location length seen so far (DummySMFEProc replaces it when it finds a smaller length) */
  Int4        num_at_max;         /* number of genes whose length equals max */
  Int4        num_trans_spliced;  /* how many of those genes have "trans-splicing" in their exception text */
  Boolean     equivalent_genes;   /* set when a later gene of that length matches grp_at_max by locus_tag, locus or first synonym */
  GeneRefPtr  grp_at_max;         /* GeneRef of the first gene recorded at length max */
} DummySmfeData, PNTR DummySmfePtr;
26985 
DummySMFEProc(SeqFeatPtr sfp,SeqMgrFeatContextPtr context)26986 static Boolean LIBCALLBACK DummySMFEProc (
26987   SeqFeatPtr sfp,
26988   SeqMgrFeatContextPtr context
26989 )
26990 
26991 
26992 {
26993   DummySmfePtr  dsp;
26994   GeneRefPtr    grp, grpx;
26995   Int4          len;
26996   Boolean       redundantgenexref = FALSE;
26997   CharPtr       syn1, syn2;
26998 
26999   if (sfp == NULL || context == NULL) return TRUE;
27000   dsp = context->userdata;
27001   if (dsp == NULL) return TRUE;
27002   if (sfp->data.choice != SEQFEAT_GENE) return TRUE;
27003   grp = (GeneRefPtr) sfp->data.value.ptrvalue;
27004   if (grp == NULL) return TRUE;
27005 
27006   len = SeqLocLen (sfp->location);
27007   if (len < dsp->max) {
27008     dsp->max = len;
27009     dsp->num_at_max = 1;
27010     dsp->num_trans_spliced = 0;
27011     if (StringISearch (sfp->except_text, "trans-splicing") != NULL) {
27012       (dsp->num_trans_spliced)++;
27013     }
27014     dsp->equivalent_genes = FALSE;
27015     dsp->grp_at_max = grp;
27016   } else if (len == dsp->max) {
27017     (dsp->num_at_max)++;
27018     if (StringISearch (sfp->except_text, "trans-splicing") != NULL) {
27019       (dsp->num_trans_spliced)++;
27020     }
27021     grpx = dsp->grp_at_max;
27022     if (grpx != NULL) {
27023       redundantgenexref = FALSE;
27024       if (StringDoesHaveText (grp->locus_tag) && StringDoesHaveText (grpx->locus_tag)) {
27025         if (StringICmp (grp->locus_tag, grpx->locus_tag) == 0) {
27026           redundantgenexref = TRUE;
27027         }
27028       } else if (StringDoesHaveText (grp->locus) && StringDoesHaveText (grpx->locus)) {
27029         if (StringICmp (grp->locus, grpx->locus) == 0) {
27030           redundantgenexref = TRUE;
27031         }
27032       } else if (grp->syn != NULL && grpx->syn != NULL) {
27033         syn1 = (CharPtr) grp->syn->data.ptrvalue;
27034         syn2 = (CharPtr) grpx->syn->data.ptrvalue;
27035         if (StringDoesHaveText (syn1) && StringDoesHaveText (syn2)) {
27036           if (StringICmp (syn1, syn2) == 0) {
27037             redundantgenexref = TRUE;
27038           }
27039         }
27040       }
27041     }
27042     if (redundantgenexref) {
27043       dsp->equivalent_genes = TRUE;
27044     }
27045   }
27046 
27047   return TRUE;
27048 }
27049 
/*
 * One GO term collected by ValidateGoTermQualifier.  The three strings are
 * StringSave copies made there and are released in ValidateGoTermUserObject.
 */
typedef struct govstruc {
  CharPtr  term;      /* text of the GO term, or NULL if none was given */
  CharPtr  goid;      /* GO identifier as text (integer IDs are printed with "%ld"), or NULL */
  CharPtr  evidence;  /* evidence string, or NULL */
  Int4     pmid;      /* PMID value; 0 when the term carried none */
} GovStruc, PNTR GovStrucPtr;
27056 
SortVnpByGvspTermFirst(VoidPtr ptr1,VoidPtr ptr2)27057 static int LIBCALLBACK SortVnpByGvspTermFirst (VoidPtr ptr1, VoidPtr ptr2)
27058 
27059 {
27060   int           compare;
27061   GovStrucPtr   gsp1, gsp2;
27062   ValNodePtr    vnp1, vnp2;
27063 
27064   if (ptr1 == NULL || ptr2 == NULL) return 0;
27065   vnp1 = *((ValNodePtr PNTR) ptr1);
27066   vnp2 = *((ValNodePtr PNTR) ptr2);
27067   if (vnp1 == NULL || vnp2 == NULL) return 0;
27068   gsp1 = (GovStrucPtr) vnp1->data.ptrvalue;
27069   gsp2 = (GovStrucPtr) vnp2->data.ptrvalue;
27070   if (gsp1 == NULL || gsp2 == NULL) return 0;
27071 
27072   compare = StringICmp (gsp1->term, gsp2->term);
27073   if (compare > 0) {
27074     return 1;
27075   } else if (compare < 0) {
27076     return -1;
27077   }
27078 
27079   compare = StringICmp (gsp1->goid, gsp2->goid);
27080   if (compare > 0) {
27081     return 1;
27082   } else if (compare < 0) {
27083     return -1;
27084   }
27085 
27086   compare = StringICmp (gsp1->evidence, gsp2->evidence);
27087   if (compare > 0) {
27088     return 1;
27089   } else if (compare < 0) {
27090     return -1;
27091   }
27092 
27093   if (gsp1->pmid == 0) return 1;
27094   if (gsp2->pmid == 0) return -1;
27095   if (gsp1->pmid > gsp2->pmid) {
27096     return 1;
27097   } else if (gsp1->pmid < gsp2->pmid) {
27098     return -1;
27099   }
27100 
27101   return 0;
27102 }
27103 
SortVnpByGvspGoIDFirst(VoidPtr ptr1,VoidPtr ptr2)27104 static int LIBCALLBACK SortVnpByGvspGoIDFirst (VoidPtr ptr1, VoidPtr ptr2)
27105 
27106 {
27107   int           compare;
27108   GovStrucPtr   gsp1, gsp2;
27109   ValNodePtr    vnp1, vnp2;
27110 
27111   if (ptr1 == NULL || ptr2 == NULL) return 0;
27112   vnp1 = *((ValNodePtr PNTR) ptr1);
27113   vnp2 = *((ValNodePtr PNTR) ptr2);
27114   if (vnp1 == NULL || vnp2 == NULL) return 0;
27115   gsp1 = (GovStrucPtr) vnp1->data.ptrvalue;
27116   gsp2 = (GovStrucPtr) vnp2->data.ptrvalue;
27117   if (gsp1 == NULL || gsp2 == NULL) return 0;
27118 
27119   compare = StringICmp (gsp1->goid, gsp2->goid);
27120   if (compare > 0) {
27121     return 1;
27122   } else if (compare < 0) {
27123     return -1;
27124   }
27125 
27126   compare = StringICmp (gsp1->term, gsp2->term);
27127   if (compare > 0) {
27128     return 1;
27129   } else if (compare < 0) {
27130     return -1;
27131   }
27132 
27133   compare = StringICmp (gsp1->evidence, gsp2->evidence);
27134   if (compare > 0) {
27135     return 1;
27136   } else if (compare < 0) {
27137     return -1;
27138   }
27139 
27140   if (gsp1->pmid == 0) return 1;
27141   if (gsp2->pmid == 0) return -1;
27142   if (gsp1->pmid > gsp2->pmid) {
27143     return 1;
27144   } else if (gsp1->pmid < gsp2->pmid) {
27145     return -1;
27146   }
27147 
27148   return 0;
27149 }
27150 
27151 
ValidateGoTermQualifier(ValidStructPtr vsp,UserFieldPtr field_list)27152 static ValNodePtr ValidateGoTermQualifier (
27153   ValidStructPtr vsp,
27154   UserFieldPtr   field_list
27155 )
27156 
27157 {
27158   UserFieldPtr term, ufp;
27159   CharPtr      textstr, evidence, goid;
27160   Char         gid[255];
27161   Int4         pmid, j;
27162   ObjectIdPtr  oip;
27163   ValNodePtr   head = NULL, vnp;
27164   GovStrucPtr  gsp, lastgsp;
27165 
27166   for (term = field_list; term != NULL; term = term->next) {
27167     if (term->choice != 11 || term->data.ptrvalue == NULL) {
27168       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadGeneOntologyFormat, "Bad GO term format");
27169     } else {
27170       textstr = NULL;
27171       evidence = NULL;
27172       goid = NULL;
27173       pmid = 0;
27174       for (ufp = (UserFieldPtr) term->data.ptrvalue; ufp != NULL; ufp = ufp->next) {
27175         oip = ufp->label;
27176         if (oip == NULL) {
27177           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadGeneOntologyFormat, "No label on GO term qualifier field");
27178           continue;
27179         }
27180         for (j = 0; goFieldType [j] != NULL; j++) {
27181           if (StringICmp (oip->str, goFieldType [j]) == 0) break;
27182         }
27183         if (goFieldType [j] == NULL) {
27184           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadGeneOntologyFormat, "Unrecognized label on GO term qualifier field %s", oip->str == NULL ? "[blank]" : oip->str);
27185           continue;
27186         }
27187         switch (j) {
27188           case 1 :
27189             if (ufp->choice == 1) {
27190               textstr = (CharPtr) ufp->data.ptrvalue;
27191             } else {
27192               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadGeneOntologyFormat, "Bad data format for GO term qualifier term");
27193             }
27194             break;
27195           case 2 :
27196             if (ufp->choice == 1) {
27197               goid = (CharPtr) ufp->data.ptrvalue;
27198             } else if (ufp->choice == 2) {
27199               sprintf (gid, "%ld", (long) (Int4) ufp->data.intvalue);
27200               goid = (CharPtr) gid;
27201             } else {
27202               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadGeneOntologyFormat, "Bad data format for GO term qualifier GO ID");
27203             }
27204             break;
27205           case 3 :
27206             if (ufp->choice == 2) {
27207               pmid = (Int4) ufp->data.intvalue;
27208             } else {
27209               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadGeneOntologyFormat, "Bad data format for GO term qualifier PMID");
27210             }
27211             break;
27212           case 5 :
27213             if (ufp->choice == 1) {
27214               evidence = (CharPtr) ufp->data.ptrvalue;
27215             } else {
27216               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadGeneOntologyFormat, "Bad data format for GO term qualifier evidence");
27217             }
27218             break;
27219           default :
27220             break;
27221         }
27222       }
27223       if (StringHasNoText (goid)) {
27224         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_GeneOntologyTermMissingGOID, "GO term does not have GO identifier");
27225       }
27226 
27227       gsp = (GovStrucPtr) MemNew (sizeof (GovStruc));
27228       if (gsp != NULL) {
27229         gsp->term = StringSave (textstr);
27230         gsp->goid = StringSave (goid);
27231         gsp->evidence = StringSave (evidence);
27232         gsp->pmid = pmid;
27233         ValNodeAddPointer (&head, 0, (Pointer) gsp);
27234       }
27235     }
27236   }
27237 
27238   if (head == NULL || head->next == NULL) {
27239     return head;
27240   }
27241   head = ValNodeSort (head, SortVnpByGvspTermFirst);
27242   lastgsp = head->data.ptrvalue;
27243   for (vnp = head->next; vnp != NULL; vnp = vnp->next) {
27244     gsp = vnp->data.ptrvalue;
27245     if (StringICmp (gsp->term, lastgsp->term) == 0 || StringICmp (gsp->goid, lastgsp->goid) == 0) {
27246       if (gsp->pmid == lastgsp->pmid && StringICmp (gsp->evidence, lastgsp->evidence) == 0) {
27247         ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_DuplicateGeneOntologyTerm, "Duplicate GO term on feature");
27248       }
27249     }
27250     lastgsp = gsp;
27251   }
27252   return head;
27253 }
27254 
27255 
/*
 * ValidateGoTermUserObject
 *
 * Validates a user object whose type string is exactly "GeneOntology";
 * any other user object is ignored, as are NULL arguments.
 *
 * Every top-level field must have choice 11 and a label that appears
 * (case-insensitively) in goQualType; its contents are validated by
 * ValidateGoTermQualifier and the resulting term records are pooled.
 * The pooled records are then sorted with SortVnpByGvspGoIDFirst and a
 * warning is issued whenever two neighbouring records share a GO ID but
 * have different term text.
 *
 * All records collected here are released before returning: the strings,
 * the GovStruc structures and the ValNode chain itself.
 */
static void ValidateGoTermUserObject (ValidStructPtr vsp, UserObjectPtr uop)
{
  ObjectIdPtr oip;
  UserFieldPtr ufp;
  ValNodePtr   term_list = NULL, vnp;
  GovStrucPtr  gsp, lastgsp;
  Int4 j;

  if (uop == NULL || vsp == NULL) return;
  oip = uop->type;
  if (oip == NULL) return;
  if (StringCmp (oip->str, "GeneOntology") == 0) {
    for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
      if (ufp->choice != 11 || ufp->label == NULL) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadGeneOntologyFormat, "Bad data format for GO term");
      } else {
        /* look the label up in the NULL-terminated goQualType table */
        for (j = 0; goQualType [j] != NULL; j++) {
          if (StringICmp (ufp->label->str, goQualType [j]) == 0) break;
        }
        if (goQualType [j] == NULL) {
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadGeneOntologyFormat, "Unrecognized GO term label %s", ufp->label->str == NULL ? "[blank]" : ufp->label->str);
        } else {
          ValNodeLink (&term_list, ValidateGoTermQualifier (vsp, ufp->data.ptrvalue));
        }
      }
    }
    if (term_list != NULL) {
      /* sorting by GO ID puts records with the same ID next to each other */
      term_list = ValNodeSort (term_list, SortVnpByGvspGoIDFirst);
      lastgsp = term_list->data.ptrvalue;
      for (vnp = term_list->next; vnp != NULL; vnp = vnp->next) {
        gsp = vnp->data.ptrvalue;
        if (gsp->goid != NULL
            && StringCmp (lastgsp->goid, gsp->goid) == 0
            && StringCmp (lastgsp->term, gsp->term) != 0) {
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InconsistentGeneOntologyTermAndId, "Inconsistent GO terms for GO ID %s", gsp->goid);
        }
        lastgsp = gsp;
      }
      /* free term list */
      for (vnp = term_list; vnp != NULL; vnp = vnp->next) {
        gsp = vnp->data.ptrvalue;
        gsp->goid = MemFree (gsp->goid);
        gsp->term = MemFree (gsp->term);
        gsp->evidence = MemFree (gsp->evidence);
      }
      /* release the GovStruc records and the list nodes themselves */
      term_list = ValNodeFreeData (term_list);
    }
  }
}
27304 
/*
 * LookForAccnLocs
 *
 * Seq-id callback.  userdata points to a Boolean flag that is set to TRUE
 * when sip is a GenBank, EMBL, DDBJ, TPG, TPE, TPD or "other" Seq-id whose
 * text id has a non-blank accession.  The flag is never cleared here.
 */
static void LookForAccnLocs (SeqIdPtr sip, Pointer userdata)

{
  BoolPtr       found;
  TextSeqIdPtr  textid;

  if (sip == NULL || userdata == NULL) return;
  found = (BoolPtr) userdata;

  /* only the id types listed below are examined */
  switch (sip->choice) {
    case SEQID_GENBANK :
    case SEQID_EMBL :
    case SEQID_DDBJ :
    case SEQID_TPG :
    case SEQID_TPE :
    case SEQID_TPD :
    case SEQID_OTHER :
      break;
    default :
      return;
  }

  textid = (TextSeqIdPtr) sip->data.ptrvalue;
  if (textid == NULL) return;

  if (StringDoesHaveText (textid->accession)) {
    *found = TRUE;
  }
}
27333 
/*
 * Message texts for problems found in inference strings; the table is
 * NULL-terminated.
 * NOTE(review): presumably indexed by an inference error code defined
 * elsewhere, so the order of entries must match that code - confirm
 * before inserting or reordering.
 */
static CharPtr infMessage [] = {
  "unknown error",
  "empty inference string",
  "bad inference prefix",
  "bad inference body",
  "single inference field",
  "spaces in inference",
  "possible comment in inference",
  "same species misused",
  "bad inference accession",
  "bad inference accession version",
  "accession.version not public",
  "bad accession type",
  "unrecognized database",
  NULL
};
27350 
/*
 * Display names for RNA types; the table is NULL-terminated.
 * NOTE(review): presumably indexed by the numeric RNA type of an RNA
 * feature, so entry order matters - confirm against the code that
 * reads this table.
 */
static CharPtr rnaNameByType [] = {
  "unknown",
  "premsg",
  "mRNA",
  "tRNA",
  "rRNA",
  "snRNA",
  "scRNA",
  "snoRNA",
  "otherRNA",
  NULL
};
27363 
ValStrandsMatch(Uint1 featstrand,Uint1 locstrand)27364 static Boolean ValStrandsMatch (Uint1 featstrand, Uint1 locstrand)
27365 
27366 {
27367   if (featstrand == locstrand) return TRUE;
27368   if (locstrand == Seq_strand_unknown && featstrand != Seq_strand_minus) return TRUE;
27369   if (featstrand == Seq_strand_unknown && locstrand != Seq_strand_minus) return TRUE;
27370   if (featstrand == Seq_strand_both && locstrand != Seq_strand_minus) return TRUE;
27371   if (locstrand == Seq_strand_both) return TRUE;
27372   return FALSE;
27373 }
27374 
/*
 * Gene synonym strings considered uninformative; NULL-terminated.
 * NOTE(review): the entries are in case-insensitive alphabetical order,
 * which a binary search such as NameInList requires - presumably this
 * table is searched that way, so keep it sorted when adding entries.
 */
static CharPtr badGeneSyn [] = {
  "alpha",
  "alternative",
  "beta",
  "cellular",
  "cytokine",
  "delta",
  "drosophila",
  "epsilon",
  "gamma",
  "HLA",
  "homolog",
  "mouse",
  "orf",
  "partial",
  "plasma",
  "precursor",
  "pseudogene",
  "putative",
  "rearranged",
  "small",
  "trna",
  "unknown",
  "unknown function",
  "unknown protein",
  "unnamed",
  NULL
};
27403 
/*
 * Protein name strings considered uninformative, including many observed
 * misspellings of "conserved hypothetical protein"; NULL-terminated.
 * NOTE(review): the entries are in case-insensitive alphabetical order
 * (punctuation and spaces sort before letters), which a binary search
 * such as NameInList requires - presumably this table is searched that
 * way, so keep it sorted when adding entries.
 */
static CharPtr badProtName [] = {
  "'hypothetical protein",
  "alpha",
  "alternative",
  "alternatively spliced",
  "bacteriophage hypothetical protein",
  "beta",
  "cellular",
  "cnserved hypothetical protein",
  "conesrved hypothetical protein",
  "conserevd hypothetical protein",
  "conserved archaeal protein",
  "conserved domain protein",
  "conserved hypohetical protein",
  "conserved hypotehtical protein",
  "conserved hypotheical protein",
  "conserved hypothertical protein",
  "conserved hypothetcial protein",
  "conserved hypothetical",
  "conserved hypothetical exported protein",
  "conserved hypothetical integral membrane protein",
  "conserved hypothetical membrane protein",
  "conserved hypothetical phage protein",
  "conserved hypothetical prophage protein",
  "conserved hypothetical protein",
  "conserved hypothetical protein - phage associated",
  "conserved hypothetical protein fragment 3",
  "conserved hypothetical protein, fragment",
  "conserved hypothetical protein, putative",
  "conserved hypothetical protein, truncated",
  "conserved hypothetical protein, truncation",
  "conserved hypothetical protein.",
  "conserved hypothetical protein; possible membrane protein",
  "conserved hypothetical protein; putative membrane protein",
  "conserved hypothetical proteins",
  "conserved hypothetical protien",
  "conserved hypothetical transmembrane protein",
  "conserved hypotheticcal protein",
  "conserved hypthetical protein",
  "conserved in bacteria",
  "conserved membrane protein",
  "conserved protein",
  "conserved protein of unknown function",
  "conserved protein of unknown function ; putative membrane protein",
  "conserved unknown protein",
  "conservedhypothetical protein",
  "conserverd hypothetical protein",
  "conservered hypothetical protein",
  "consrved hypothetical protein",
  "converved hypothetical protein",
  "cytokine",
  "delta",
  "drosophila",
  "duplicated hypothetical protein",
  "epsilon",
  "gamma",
  "HLA",
  "homeodomain",
  "homeodomain protein",
  "homolog",
  "hyopthetical protein",
  "hypotethical",
  "hypotheical protein",
  "hypothertical protein",
  "hypothetcical protein",
  "hypothetical",
  "hypothetical  protein",
  "hypothetical conserved protein",
  "hypothetical exported protein",
  "hypothetical novel protein",
  "hypothetical orf",
  "hypothetical phage protein",
  "hypothetical prophage protein",
  "hypothetical protein",
  "hypothetical protein (fragment)",
  "hypothetical protein (multi-domain)",
  "hypothetical protein (phage associated)",
  "hypothetical protein - phage associated",
  "hypothetical protein fragment",
  "hypothetical protein fragment 1",
  "hypothetical protein predicted by genemark",
  "hypothetical protein predicted by glimmer",
  "hypothetical protein predicted by glimmer/critica",
  "hypothetical protein, conserved",
  "hypothetical protein, phage associated",
  "hypothetical protein, truncated",
  "hypothetical protein-putative conserved hypothetical protein",
  "hypothetical protein.",
  "hypothetical proteins",
  "hypothetical protien",
  "hypothetical transmembrane protein",
  "hypothetoical protein",
  "hypothteical protein",
  "identified by sequence similarity; putative; ORF located~using Blastx/FrameD",
  "identified by sequence similarity; putative; ORF located~using Blastx/Glimmer/Genemark",
  "ion channel",
  "membrane protein, putative",
  "mouse",
  "narrowly conserved hypothetical protein",
  "novel protein",
  "orf",
  "orf, conserved hypothetical protein",
  "orf, hypothetical",
  "orf, hypothetical protein",
  "orf, hypothetical, fragment",
  "orf, partial conserved hypothetical protein",
  "orf; hypothetical protein",
  "orf; unknown function",
  "partial",
  "partial cds, hypothetical",
  "partially conserved hypothetical protein",
  "phage hypothetical protein",
  "phage-related conserved hypothetical protein",
  "phage-related protein",
  "plasma",
  "possible hypothetical protein",
  "precursor",
  "predicted coding region",
  "predicted protein",
  "predicted protein (pseudogene)",
  "predicted protein family",
  "product uncharacterised protein family",
  "protein family",
  "protein of unknown function",
  "pseudogene",
  "putative",
  "putative conserved protein",
  "putative exported protein",
  "putative hypothetical protein",
  "putative membrane protein",
  "putative orf; unknown function",
  "putative phage protein",
  "putative protein",
  "rearranged",
  "repeats containing protein",
  "reserved",
  "ribosomal protein",
  "similar to",
  "small",
  "small hypothetical protein",
  "transmembrane protein",
  "trna",
  "trp repeat",
  "trp-repeat protein",
  "truncated conserved hypothetical protein",
  "truncated hypothetical protein",
  "uncharacterized conserved membrane protein",
  "uncharacterized conserved protein",
  "uncharacterized conserved secreted protein",
  "uncharacterized protein",
  "uncharacterized protein conserved in archaea",
  "uncharacterized protein conserved in bacteria",
  "uniprot",
  "unique hypothetical",
  "unique hypothetical protein",
  "unknown",
  "unknown CDS",
  "unknown function",
  "unknown gene",
  "unknown protein",
  "unknown, conserved protein",
  "unknown, hypothetical",
  "unknown-related protein",
  "unknown; predicted coding region",
  "unnamed",
  "unnamed protein product",
  "very hypothetical protein",
  NULL
};
27573 
NameInList(CharPtr name,CharPtr PNTR list,size_t numelements)27574 static Boolean NameInList (CharPtr name, CharPtr PNTR list, size_t numelements)
27575 
27576 {
27577   Int2  L, R, mid;
27578 
27579   if (StringHasNoText (name) || list == NULL || numelements < 1) return FALSE;
27580 
27581   L = 0;
27582   R = numelements - 1; /* -1 because now NULL terminated */
27583 
27584   while (L < R) {
27585     mid = (L + R) / 2;
27586     if (StringICmp (list [mid], name) < 0) {
27587       L = mid + 1;
27588     } else {
27589       R = mid;
27590     }
27591   }
27592 
27593   if (StringICmp (list [R], name) == 0) return TRUE;
27594 
27595   return FALSE;
27596 }
27597 
HasBadCharacter(CharPtr str)27598 static Boolean HasBadCharacter (CharPtr str)
27599 
27600 {
27601   Char  ch;
27602 
27603   if (StringHasNoText (str)) return FALSE;
27604 
27605   ch = *str;
27606   while (ch != '\0') {
27607     if (ch == '?' || ch == '!' || ch == '~') return TRUE;
27608     str++;
27609     ch = *str;
27610   }
27611 
27612   return FALSE;
27613 }
27614 
EndsWithBadCharacter(CharPtr str)27615 static Boolean EndsWithBadCharacter (CharPtr str)
27616 
27617 {
27618   Char    ch;
27619   size_t  len;
27620 
27621   if (StringHasNoText (str)) return FALSE;
27622 
27623   len = StringLen (str);
27624   if (len < 1) return FALSE;
27625 
27626   ch = str [len - 1];
27627   if (ch == '_' || ch == '.' || ch == ',' || ch == ':' || ch == ';') return TRUE;
27628 
27629   return FALSE;
27630 }
27631 
EndsWithHyphen(CharPtr str)27632 static Boolean EndsWithHyphen (CharPtr str)
27633 
27634 {
27635   Char    ch;
27636   size_t  len;
27637 
27638   if (StringHasNoText (str)) return FALSE;
27639 
27640   len = StringLen (str);
27641   if (len < 1) return FALSE;
27642 
27643   ch = str [len - 1];
27644   if (ch == '-') return TRUE;
27645 
27646   return FALSE;
27647 }
27648 
27649 
CouldExtendPartial(SeqLocPtr slp,Boolean partial5)27650 static Boolean CouldExtendPartial (SeqLocPtr slp, Boolean partial5)
27651 {
27652   BioseqPtr bsp;
27653   Int4      pos;
27654   Uint1     strand;
27655   Char      str[4];
27656   Boolean   rval = FALSE;
27657 
27658   if (slp == NULL) {
27659     return FALSE;
27660   }
27661 
27662   bsp = BioseqFindFromSeqLoc (slp);
27663   if (bsp == NULL) {
27664     return FALSE;
27665   }
27666   strand = SeqLocStrand (slp);
27667 
27668   if ((strand != Seq_strand_minus && partial5) || (strand == Seq_strand_minus && !partial5)) {
27669     pos = SeqLocStart (slp);
27670     if (pos < 2) {
27671       rval = TRUE;
27672     } else if (bsp->repr == Seq_repr_delta) {
27673       /* wasn't close to the sequence end, but perhaps it is close to a gap */
27674       SeqPortStreamInt (bsp, pos - 3, pos - 1, Seq_strand_plus, EXPAND_GAPS_TO_DASHES, (Pointer) str, NULL);
27675       if (str[0] == '-' || str[1] == '-' || str[2] == '-') {
27676         rval = TRUE;
27677       }
27678     }
27679   } else {
27680     pos = SeqLocStop (slp);
27681     if (pos > bsp->length - 2) {
27682       rval = TRUE;
27683     } else {
27684       /* wasn't close to the sequence end, but perhaps it is close to a gap */
27685       SeqPortStreamInt (bsp, pos + 1, pos + 3, Seq_strand_plus, EXPAND_GAPS_TO_DASHES, (Pointer) str, NULL);
27686       if (str[0] == '-' || str[1] == '-' || str[2] == '-') {
27687         rval = TRUE;
27688       }
27689     }
27690   }
27691 
27692   return rval;
27693 }
27694 
LocationStrandsIncompatible(SeqLocPtr slp1,SeqLocPtr slp2)27695 static Boolean LocationStrandsIncompatible (
27696   SeqLocPtr slp1,
27697   SeqLocPtr slp2
27698 )
27699 
27700 {
27701   Uint1  strand1, strand2;
27702 
27703   if (slp1 == NULL || slp2 == NULL) return FALSE;
27704   strand1 = SeqLocStrand (slp1);
27705   strand2 = SeqLocStrand (slp2);
27706   if (strand1 != strand2) {
27707     if ((strand1 == Seq_strand_unknown || strand1 == Seq_strand_plus) &&
27708         (strand2 == Seq_strand_unknown || strand2 == Seq_strand_plus)) return FALSE;
27709     /* if strands are mixed, need to check if interval is contained */
27710     if (strand1 == Seq_strand_other) {
27711       if (SeqLocCompareEx (slp1, slp2, TRUE) == SLC_B_IN_A) {
27712         return FALSE;
27713       }
27714     } else if (strand2 == Seq_strand_other) {
27715       if (SeqLocCompareEx (slp1, slp2, TRUE) == SLC_A_IN_B) {
27716         return FALSE;
27717       }
27718     }
27719     return TRUE;
27720   }
27721   return FALSE;
27722 }
27723 
27724 
GetGeneXrefLabel(GeneRefPtr grp)27725 static CharPtr GetGeneXrefLabel (GeneRefPtr grp)
27726 {
27727   SeqFeat sf;
27728   Char    buf[255];
27729 
27730   MemSet (&sf, 0, sizeof (SeqFeat));
27731   sf.data.choice = SEQFEAT_GENE;
27732   sf.data.value.ptrvalue = grp;
27733 
27734   FeatDefLabel (&sf, buf, sizeof (buf) - 1, OM_LABEL_CONTENT);
27735   return StringSave (buf);
27736 }
27737 
27738 
/*
 * TestForBracketsInProductName - warn when a product name ends in ']'.
 *
 *   str  product name to examine; NULL, blank or one-character names are ignored
 *   vsp  validator state handed to ValidErr
 *
 * A trailing ']' is tolerated only when the name is at least five
 * characters long and its last '[' begins the text "[NAD".  In every other
 * case ERR_SEQ_FEAT_ProteinNameEndsInBracket is reported as a warning.
 */
static void TestForBracketsInProductName (CharPtr str, ValidStructPtr vsp)
{
  size_t  length;
  CharPtr open_bracket;

  if (StringHasNoText (str)) return;

  length = StringLen (str);
  if (length <= 1) return;

  /* doesn't end with bracket */
  if (str [length - 1] != ']') return;

  /* shorter names are too short to contain the special text */
  if (length >= 5) {
    open_bracket = StringRChr (str, '[');
    if (open_bracket != NULL && StringNCmp (open_bracket, "[NAD", 4) == 0) {
      /* contains special text */
      return;
    }
  }

  ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ProteinNameEndsInBracket, "Protein name ends with bracket and may contain organism name");
}
27769 
27770 
/*
 * ValidateRna - run the RNA-specific checks on an RNA feature, reporting
 * each problem through ValidErr.
 *
 *   sfp  RNA feature; sfp->data.value.ptrvalue is read as an RnaRefPtr
 *   vsp  validator state handed to ValidErr and to the helper checks
 *   gcp  gather context, forwarded to CheckTrnaCodons and CheckRnaProductType
 *
 * Returns nothing, and does nothing when the feature carries no RnaRef.
 *
 * As used below, rrp->type 2 is mRNA, 3 is tRNA and 4 through 7 are rRNA,
 * snRNA, scRNA and snoRNA; type 8 is presumably ncRNA (see the final
 * check).  rrp->ext.choice 1 means the extension is a name string, 2 a
 * tRNA structure and 3 an RNA-gen structure, so ext.value.ptrvalue is only
 * interpreted as text when ext.choice is 1.
 */
static void ValidateRna (SeqFeatPtr sfp, ValidStructPtr vsp, GatherContextPtr gcp)
{
  Uint1     aa;
  RnaRefPtr rrp;
  Boolean   pseudo, ovgenepseudo = FALSE;
  Boolean   protidqual = FALSE, transidqual = FALSE;
  GBQualPtr gbq;
  tRNAPtr   trp = NULL;
  Boolean   badanticodon, anticodonqual, productqual, mustbemethionine;
  Int4      anticodonlen;
  SeqLocPtr slp;
  RNAGenPtr rgp;
  Int2      i;
  CharPtr   str, three_letter_aa;

    rrp = (RnaRefPtr) (sfp->data.value.ptrvalue);
    if (rrp == NULL) return;

    if (rrp->type != RNA_TYPE_tRNA && rrp->ext.choice == 2) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidForType, "tRNA data structure on non-tRNA feature");
    }
    /*
    if (rrp->type == RNA_TYPE_misc_RNA && rrp->ext.choice == 3) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidForType, "RNA-gen data structure on miscRNA feature");
    }
    */

    /* the feature counts as pseudo if it is flagged itself or its overlapping gene is */
    pseudo = sfp->pseudo;
    ovgenepseudo = FALSE;
    if (OverlappingGeneIsPseudo (sfp)) {
      pseudo = TRUE;
      ovgenepseudo = TRUE;
    }

    if (rrp->type == 2) {       /* mRNA */
      if (!pseudo) {
        MrnaTransCheck (vsp, sfp);      /* transcription check */
        SpliceCheck (vsp, sfp);
      }
      /* CheckForBothStrands (vsp, sfp); */
      CheckForBadGeneOverlap (vsp, sfp);
      CheckForCommonMRNAProduct (vsp, sfp);
      protidqual = FALSE;
      transidqual = FALSE;
      gbq = sfp->qual;
      while (gbq != NULL) {
        if (StringICmp (gbq->qual, "protein_id") == 0) {
          protidqual = TRUE;
        }
        if (StringICmp (gbq->qual, "transcript_id") == 0) {
          transidqual = TRUE;
        }
        gbq = gbq->next;
      }
      if (protidqual) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_WrongQualOnFeature, "protein_id should not be a gbqual on an mRNA feature");
      }
      if (transidqual) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_WrongQualOnFeature, "transcript_id should not be a gbqual on an mRNA feature");
      }
      /* only a string extension may be read as the product name */
      if (rrp->ext.choice == 1) {
        str = (CharPtr) rrp->ext.value.ptrvalue;
        if (StringDoesHaveText (str) && StringNICmp (str, "transfer RNA ", 13) == 0 &&
            StringICmp (str, "transfer RNA nucleotidyltransferase") != 0 &&
            StringICmp (str, "transfer RNA methyltransferase") != 0) {
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_tRNAmRNAmixup, "mRNA feature product indicates it should be a tRNA feature");
        }
      }
    }
    if (rrp->ext.choice == 2) { /* tRNA */
      trp = (tRNAPtr) (rrp->ext.value.ptrvalue);
      if (trp != NULL && trp->anticodon != NULL) {
        badanticodon = FALSE;
        anticodonlen = 0;
        /* every anticodon interval must lie inside (or equal) the feature location */
        slp = SeqLocFindNext (trp->anticodon, NULL);
        while (slp != NULL) {
          anticodonlen += SeqLocLen (slp);
          i = SeqLocCompare (slp, sfp->location);
          if ((i != SLC_A_IN_B) && (i != SLC_A_EQ_B)) {
            badanticodon = TRUE;
          }
          slp = SeqLocFindNext (trp->anticodon, slp);
        }
        if (badanticodon) {
          ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_Range, "Anticodon location not in tRNA");
        }
        if (anticodonlen != 3) {
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_Range, "Anticodon is not 3 bases in length");
        }
        ValidateAnticodon (vsp, trp->anticodon);
      }
      CheckTrnaCodons (vsp, gcp, sfp, trp);
    }
    if (rrp->type == 3) {       /* tRNA */
      anticodonqual = FALSE;
      productqual = FALSE;
      mustbemethionine = FALSE;
      gbq = sfp->qual;
      while (gbq != NULL) {
        if (StringICmp (gbq->qual, "anticodon") == 0) {
          anticodonqual = TRUE;
        } else if (StringICmp (gbq->qual, "product") == 0) {
          /* fMet / iMet product qualifiers are accepted but must agree with the encoded amino acid */
          if (StringICmp (gbq->val, "tRNA-fMet") != 0 && StringICmp (gbq->val, "tRNA-iMet") != 0) {
            productqual = TRUE;
          } else {
            mustbemethionine = TRUE;
          }
        }
        gbq = gbq->next;
      }
      if (anticodonqual) {
        ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Unparsed anticodon qualifier in tRNA");
      }
      if (productqual) {
        ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Unparsed product qualifier in tRNA");
      }
      if (mustbemethionine) {
        /* trp is only set when the extension is a tRNA structure */
        if (trp != NULL) {
          aa = GetAaFromtRNA (trp);
          if (aa != 'M') {
            three_letter_aa = Get3LetterSymbol (NULL, Seq_code_ncbieaa, NULL, aa);
            if (StringHasNoText (three_letter_aa)) {
              three_letter_aa = "?";
            }
            ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Initiation tRNA claims to be tRNA-%s, but should be tRNA-Met", three_letter_aa);
          }
        }
      }
    }
    if (rrp->type == 3 && rrp->ext.choice == 1) { /* tRNA with string extension */
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Unparsed product qualifier in tRNA");
    }
    if (rrp->type == 3 && rrp->ext.choice == 0) { /* tRNA with no extension */
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_MissingTrnaAA, "Missing encoded amino acid qualifier in tRNA");
    }
    if (rrp->type == 0) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_RNAtype0, "RNA type 0 (unknown) not supported");
    }
    if (rrp->type == 4 || rrp->type == 5 || rrp->type == 6 || rrp->type == 7) { /* rRNA, snRNA, scRNA, snoRNA */
      if (rrp->ext.choice != 1 || StringHasNoText ((CharPtr) rrp->ext.value.ptrvalue)) {
        if (! pseudo) {
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidQualifierValue, "%s has no name", rnaNameByType [(int) rrp->type]);
        }
      }
    }
    /*
    if (rrp->type == 255 && rrp->ext.choice == 1) {
      str = (CharPtr) rrp->ext.value.ptrvalue;
      if (StringICmp (str, "ncRNA") == 0) {
        for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
          if (StringICmp (gbq->qual, "ncRNA_class") != 0) continue;
          if (StringHasNoText (gbq->val)) continue;
          if (IsStringInNcRNAClassList (gbq->val)) continue;
          ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_InvalidQualifierValue, "Other ncRNA_class value '%s'", gbq->val);
        }
      }
    }
    */
    /* name hygiene checks for mRNA names */
    if (rrp->type == 2) {
      if (rrp->ext.choice == 1) {
        str = (CharPtr) rrp->ext.value.ptrvalue;
        if (StringDoesHaveText (str)) {
          if (HasBadCharacter (str)) {
            ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadInternalCharacter, "mRNA name contains undesired character");
          }
          if (EndsWithBadCharacter (str)) {
            ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadTrailingCharacter, "mRNA name ends with undesired character");
          }
          if (EndsWithHyphen (str)) {
            ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadTrailingHyphen, "mRNA name ends with hyphen");
          }
          if (StringHasSgml (vsp, str)) {
            ValidErr (vsp, SEV_WARNING, ERR_GENERIC_SgmlPresentInText, "mRNA name %s has SGML", str);
          }
        }
      }
    }
    /* the same name hygiene checks for rRNA names */
    if (rrp->type == 4) {
      if (rrp->ext.choice == 1) {
        str = (CharPtr) rrp->ext.value.ptrvalue;
        if (StringDoesHaveText (str)) {
          if (HasBadCharacter (str)) {
            ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadInternalCharacter, "rRNA name contains undesired character");
          }
          if (EndsWithBadCharacter (str)) {
            ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadTrailingCharacter, "rRNA name ends with undesired character");
          }
          if (EndsWithHyphen (str)) {
            ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadTrailingHyphen, "rRNA name ends with hyphen");
          }
          if (StringHasSgml (vsp, str)) {
            ValidErr (vsp, SEV_WARNING, ERR_GENERIC_SgmlPresentInText, "rRNA name %s has SGML", str);
          }
        }
      }
    }
    if (sfp->product != NULL) {
      CheckRnaProductType (vsp, gcp, sfp, rrp);
    }

    /* a pseudo RNA with a product is reported unless the exception text mentions a transcribed pseudogene */
    if (pseudo && sfp->product != NULL && StringISearch (sfp->except_text, "transcribed pseudogene") == NULL) {
      if (ovgenepseudo) {
        if (sfp->pseudo) {
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PseudoRnaHasProduct, "A pseudo RNA should not have a product");
        } else {
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PseudoRnaViaGeneHasProduct, "An RNA overlapped by a pseudogene should not have a product");
        }
      } else {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PseudoRnaHasProduct, "A pseudo RNA should not have a product");
      }
    }

    if (rrp->ext.choice == 3
        && (rgp = (RNAGenPtr) rrp->ext.value.ptrvalue) != NULL
        && !StringHasNoText (rgp->_class)
        && rrp->type != 8) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_WrongQualOnFeature, "Only ncRNA should have ncRNA-class");
    }
}
27988 
27989 
27990 //LCOV_EXCL_START
27991 // Not part of validation
IsGeneXrefRedundant(SeqFeatPtr sfp)27992 NLM_EXTERN Boolean IsGeneXrefRedundant (SeqFeatPtr sfp)
27993 {
27994   GeneRefPtr grp;
27995   SeqFeatPtr sfpx;
27996   GeneRefPtr grpx;
27997   Boolean    redundantgenexref = FALSE;
27998   CharPtr    syn1, syn2;
27999   DummySmfeData dsd;
28000   Int2          count;
28001   SeqMgrFeatContext fcontext;
28002 
28003   grp = SeqMgrGetGeneXref (sfp);
28004   if (grp == NULL) {
28005     return FALSE;
28006   }
28007   if (grp != NULL && SeqMgrGeneIsSuppressed (grp)) return FALSE;
28008 
28009   sfpx = SeqMgrGetOverlappingGene (sfp->location, &fcontext);
28010   if (sfpx == NULL || sfpx->data.choice != SEQFEAT_GENE)
28011     return FALSE;
28012   grpx = (GeneRefPtr) sfpx->data.value.ptrvalue;
28013   if (grpx == NULL)
28014     return FALSE;
28015 
28016   if (StringDoesHaveText (grp->locus_tag) && StringDoesHaveText (grp->locus_tag)) {
28017     if (StringICmp (grp->locus_tag, grpx->locus_tag) == 0) {
28018       redundantgenexref = TRUE;
28019     }
28020   } else if (StringDoesHaveText (grp->locus) && StringDoesHaveText (grp->locus)) {
28021     if (StringICmp (grp->locus, grpx->locus) == 0) {
28022       redundantgenexref = TRUE;
28023     }
28024   } else if (grp->syn != NULL && grpx->syn != NULL) {
28025     syn1 = (CharPtr) grp->syn->data.ptrvalue;
28026     syn2 = (CharPtr) grpx->syn->data.ptrvalue;
28027     if ((StringDoesHaveText (syn1)) && StringDoesHaveText (syn2)) {
28028       if (StringICmp (syn1, syn2) == 0) {
28029         redundantgenexref = TRUE;
28030       }
28031     }
28032   }
28033   if (redundantgenexref) {
28034     MemSet ((Pointer) &dsd, 0, sizeof (DummySmfeData));
28035     dsd.max = INT4_MAX;
28036     dsd.num_at_max = 0;
28037     dsd.num_trans_spliced = 0;
28038     dsd.equivalent_genes = FALSE;
28039     dsd.grp_at_max = NULL;
28040     count = SeqMgrGetAllOverlappingFeatures (sfp->location, FEATDEF_GENE, NULL, 0,
28041                                              LOCATION_SUBSET, (Pointer) &dsd, DummySMFEProc);
28042     if (dsd.num_at_max > 1) {
28043       redundantgenexref = FALSE;
28044     }
28045   }
28046   return redundantgenexref;
28047 }
28048 //LCOV_EXCL_STOP
28049 
28050 
/*
 * CheckCodingRegionAndProteinFeaturePartials - warn when partial flags
 * disagree between related records.
 *
 *   sfp  feature to check; only coding region and protein features are examined
 *   vsp  validator state handed to ValidErr
 *
 * Coding region: the 5' and 3' partial state of the CDS location is
 * compared with that of the first protein feature on the product Bioseq.
 *
 * Protein: only when no CDS is found for the protein Bioseq, the partial
 * state of the first protein feature is compared with the MolInfo
 * completeness value from the Bioseq's molinfo descriptor.
 *
 * Either mismatch is reported as ERR_SEQ_FEAT_PartialsInconsistent.  The
 * function returns silently whenever a needed Bioseq, feature or
 * descriptor cannot be found.
 */
static void CheckCodingRegionAndProteinFeaturePartials (SeqFeatPtr sfp, ValidStructPtr vsp)
{
  BioseqPtr protbsp;
  SeqFeatPtr prot;
  SeqMgrFeatContext context;
  Boolean cds5, cds3, prot5, prot3;
  Boolean mismatch = FALSE;
  SeqDescrPtr sdp;
  MolInfoPtr mip;
  Uint1 level;

  if (sfp == NULL || vsp == NULL) return;

  if (sfp->data.choice == SEQFEAT_CDREGION) {
    protbsp = BioseqFindFromSeqLoc (sfp->product);
    if (protbsp == NULL) return;
    prot = SeqMgrGetNextFeature (protbsp, NULL, 0, FEATDEF_PROT, &context);
    if (prot == NULL) return;
    CheckSeqLocForPartial (sfp->location, &cds5, &cds3);
    CheckSeqLocForPartial (prot->location, &prot5, &prot3);
    /* conflict when exactly one side of a pair is partial */
    if (((! cds5) != (! prot5)) || ((! cds3) != (! prot3))) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PartialsInconsistent, "Coding region and protein feature partials conflict");
    }
    return;
  }

  if (sfp->data.choice != SEQFEAT_PROT) return;

  protbsp = BioseqFindFromSeqLoc (sfp->location);
  if (protbsp == NULL) return;
  if (SeqMgrGetCDSgivenProduct (protbsp, &context) != NULL) return;
  prot = SeqMgrGetNextFeature (protbsp, NULL, 0, FEATDEF_PROT, &context);
  if (prot == NULL) return;
  sdp = GetNextDescriptorUnindexed (protbsp, Seq_descr_molinfo, NULL);
  if (sdp == NULL) return;
  mip = (MolInfoPtr) sdp->data.ptrvalue;
  if (mip == NULL) return;
  CheckSeqLocForPartial (prot->location, &prot5, &prot3);

  level = mip->completeness;
  switch (level) {
    case 2:
      /* needs at least one partial end */
      if ((! prot5) && (! prot3)) mismatch = TRUE;
      break;
    case 3:
      /* needs a partial 5' end and a complete 3' end */
      if ((! prot5) || prot3) mismatch = TRUE;
      break;
    case 4:
      /* needs a complete 5' end and a partial 3' end */
      if (prot5 || (! prot3)) mismatch = TRUE;
      break;
    case 5:
      /* needs both ends partial */
      if ((! prot5) || (! prot3)) mismatch = TRUE;
      break;
    default:
      /* every other completeness value needs both ends complete */
      if (prot5 || prot3) mismatch = TRUE;
      break;
  }

  if (mismatch) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PartialsInconsistent, "Molinfo completeness and protein feature partials conflict");
  }
}
28102 
28103 
/*
 * CheckForShortExons - warn about internal intervals shorter than 16.
 *
 *   vsp  validator state handed to ValidErr
 *   loc  location whose intervals are walked with SeqLocFindNext
 *
 * Only internal intervals are considered: the first is never measured and
 * the last is never counted.  One short internal interval produces the
 * single-exon warning; two or more produce the multiple-exon warning.
 */
static void CheckForShortExons (ValidStructPtr vsp, SeqLocPtr loc)
{
  SeqLocPtr first, current, following;
  Int4 current_len;
  Int4 short_count = 0;

  first = SeqLocFindNext (loc, NULL);
  if (first == NULL) return;

  /* start at the second interval; an interval counts only if another one follows it */
  for (current = SeqLocFindNext (loc, first); current != NULL; current = following) {
    current_len = SeqLocLen (current);
    following = SeqLocFindNext (loc, current);
    if (following != NULL && current_len < 16) {
      short_count++;
    }
  }

  if (short_count > 1) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ShortExon, "Coding region has multiple internal exons that are too short");
  } else if (short_count > 0) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ShortExon,
              "Internal coding region exon is too short");
  }
}
28128 
28129 
FeaturePairIsTwoTypes(SeqFeatPtr sfp1,SeqFeatPtr sfp2,Uint1 ftype1,Uint1 ftype2)28130 static Boolean FeaturePairIsTwoTypes (SeqFeatPtr sfp1, SeqFeatPtr sfp2, Uint1 ftype1, Uint1 ftype2)
28131 {
28132     if (sfp1 == NULL || sfp2 == NULL) {
28133         return FALSE;
28134     }
28135     if (sfp1->idx.subtype == ftype1 && sfp2->idx.subtype == ftype2) {
28136         return TRUE;
28137     } else if (sfp1->idx.subtype == ftype2 && sfp2->idx.subtype == ftype1) {
28138         return TRUE;
28139     } else {
28140         return FALSE;
28141     }
28142 }
28143 
28144 
s_GeneRefsAreEquivalent(GeneRefPtr grp,GeneRefPtr grpx,CharPtr PNTR label)28145 static Boolean s_GeneRefsAreEquivalent (GeneRefPtr grp, GeneRefPtr grpx, CharPtr PNTR label)
28146 {
28147     Boolean equivalent = FALSE;
28148     CharPtr syn1, syn2;
28149 
28150     if (grp == NULL || grpx == NULL) {
28151         return FALSE;
28152     }
28153 
28154     if (StringDoesHaveText (grp->locus_tag) && StringDoesHaveText (grpx->locus_tag)) {
28155       if (StringICmp (grp->locus_tag, grpx->locus_tag) == 0) {
28156         equivalent = TRUE;
28157         if (label != NULL) {
28158           *label = grp->locus_tag;
28159         }
28160       }
28161     } else if (StringDoesHaveText (grp->locus) && StringDoesHaveText (grpx->locus)) {
28162       if (StringICmp (grp->locus, grpx->locus) == 0) {
28163         equivalent = TRUE;
28164         if (label != NULL) {
28165           *label = grp->locus;
28166         }
28167       }
28168     } else if (grp->syn != NULL && grpx->syn != NULL) {
28169       syn1 = (CharPtr) grp->syn->data.ptrvalue;
28170       syn2 = (CharPtr) grpx->syn->data.ptrvalue;
28171       if ((StringDoesHaveText (syn1)) && StringDoesHaveText (syn2)) {
28172         if (StringICmp (syn1, syn2) == 0) {
28173           equivalent = TRUE;
28174           if (label != NULL) {
28175             *label = syn1;
28176           }
28177         }
28178       }
28179     }
28180     return equivalent;
28181 }
28182 
28183 
s_GeneXrefConflictsWithFeatureXref(SeqFeatPtr sfp,SeqFeatPtr gene)28184 static Boolean s_GeneXrefConflictsWithFeatureXref(SeqFeatPtr sfp, SeqFeatPtr gene)
28185 {
28186   SeqFeatXrefPtr  xref;
28187 
28188   for (xref = sfp->xref; xref != NULL; xref = xref->next) {
28189     if (xref->data.choice == SEQFEAT_GENE
28190         && !s_GeneRefsAreEquivalent(xref->data.value.ptrvalue,
28191                                     gene->data.value.ptrvalue,
28192                                     NULL)) {
28193       return TRUE;
28194     }
28195   }
28196   return FALSE;
28197 }
28198 
28199 
/*
 * ValidateSeqFeatXref - check every cross-reference attached to a feature.
 *
 *   sfp  feature whose xref chain is checked
 *   vsp  validator state handed to ValidErr
 *
 * For each xref:
 *   - with neither an id nor data, a warning is issued;
 *   - with data but no id, nothing is checked here;
 *   - with an id, the referenced feature is looked up.  It must exist,
 *     carry id xrefs of its own, and one of them must resolve back to sfp.
 *     A reciprocal link must join a CDS/mRNA, CDS/gene or mRNA/gene pair,
 *     and when the referenced feature is a gene, sfp's gene xrefs must not
 *     conflict with it.
 */
static void ValidateSeqFeatXref (SeqFeatPtr sfp, ValidStructPtr vsp)
{
  SeqFeatXrefPtr  link, backlink;
  SeqFeatPtr      target, source;
  Boolean         target_has_id_xref, links_back, allowed_pair;

  for (link = sfp->xref; link != NULL; link = link->next) {
    if (link->id.choice == 0) {
      if (link->data.choice == 0) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_SeqFeatXrefProblem, "SeqFeatXref with no id or data field");
      }
      continue;
    }

    target = SeqMgrGetFeatureByFeatID (sfp->idx.entityID, NULL, NULL, link, NULL);
    if (target == NULL) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_SeqFeatXrefFeatureMissing, "Cross-referenced feature cannot be found");
      continue;
    }

    target_has_id_xref = FALSE;
    links_back = FALSE;
    for (backlink = target->xref; backlink != NULL; backlink = backlink->next) {
      if (backlink->id.choice == 0) continue;

      target_has_id_xref = TRUE;
      source = SeqMgrGetFeatureByFeatID (target->idx.entityID, NULL, NULL, backlink, NULL);
      if (source != sfp) continue;

      links_back = TRUE;
      allowed_pair = (Boolean) (FeaturePairIsTwoTypes(sfp, target, FEATDEF_CDS, FEATDEF_mRNA)
                                || FeaturePairIsTwoTypes(sfp, target, FEATDEF_CDS, FEATDEF_GENE)
                                || FeaturePairIsTwoTypes(sfp, target, FEATDEF_mRNA, FEATDEF_GENE));
      if (! allowed_pair) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_SeqFeatXrefProblem, "Cross-references are not between CDS and mRNA pair or between a gene and a CDS or mRNA");
      } else if (target->data.choice == SEQFEAT_GENE
                 && s_GeneXrefConflictsWithFeatureXref(sfp, target)) {
        /* pair types are okay, but the gene xref names a different gene */
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_SeqFeatXrefProblem, "Feature gene xref does not match Feature ID cross-referenced gene feature");
      }
    }

    if (! target_has_id_xref) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_SeqFeatXrefProblem, "Cross-referenced feature does not have its own cross-reference");
    } else if (! links_back) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_SeqFeatXrefNotReciprocal, "Cross-referenced feature does not link reciprocally");
    }
  }
}
28245 
28246 
ValidateSeqFeat(GatherContextPtr gcp)28247 NLM_EXTERN void ValidateSeqFeat (GatherContextPtr gcp)
28248 {
28249   Int2            type, i, j;
28250   static char    *parterrs[4] = {
28251     "Start does not include first/last residue of sequence",
28252     "Stop does not include first/last residue of sequence",
28253     "Internal partial intervals do not include first/last residue of sequence",
28254     "Improper use of partial (greater than or less than)"
28255   };
28256   Uint2           partials[2], errtype;
28257   Char            buf[128];
28258   CharPtr         tmp;
28259   ValidStructPtr  vsp;
28260   SeqFeatPtr      sfp;
28261   SeqFeatPtr      cds;
28262   CloneRefPtr     clrp;
28263   CdRegionPtr     crp;
28264   CodeBreakPtr    cbp, prevcbp;
28265   CharPtr         ctmp;
28266   GBQualPtr       gbq;
28267   Boolean         pseudo, excpt, conflict, codonqual,
28268                   protidqual,
28269                   transidqual, ovgenepseudo, gene_synonym_on_cds;
28270   ImpFeatPtr      ifp;
28271   GeneRefPtr      grp;
28272   SeqFeatPtr      gene;
28273   ProtRefPtr      prp;
28274   ValNodePtr      vnp;
28275   BioseqPtr       bsp, nbsp;
28276   BioseqContextPtr bcp = NULL;
28277   BioSourcePtr    biop, dbiop;
28278   OrgNamePtr      onp;
28279   OrgRefPtr       orp, dorp;
28280   SubSourcePtr    ssp;
28281   Boolean         transgenic;
28282   Int2            biopgencode;
28283   Int2            cdsgencode;
28284   Boolean         plastid;
28285   GeneticCodePtr  gc;
28286   PubdescPtr      pdp;
28287   /*
28288   DbtagPtr        db = NULL;
28289   Int4            id = -1;
28290   */
28291   SeqMgrDescContext context;
28292   GeneRefPtr      grpx;
28293   SeqFeatPtr      sfpx = NULL, sfpy = NULL, prt;
28294   SeqFeatPtr      operon;
28295   Boolean         redundantgenexref;
28296   SeqMgrFeatContext fcontext, gcontext;
28297   CharPtr         label = NULL, genexref_label;
28298   Uint2           oldEntityID;
28299   Uint4           oldItemID;
28300   SeqIdPtr        sip;
28301   TextSeqIdPtr    tsip;
28302   BioseqPtr       protBsp;
28303   ErrSev          sev;
28304   Boolean         multitoken;
28305   Char            ch;
28306   CharPtr         ptr;
28307   SeqLocPtr       slp;
28308   Int2            count;
28309   DummySmfeData   dsd;
28310   CharPtr         str;
28311   Boolean         isgap;
28312   Boolean         badseq;
28313   Boolean         is_seqloc_bond;
28314   SeqBondPtr      sbp;
28315   CharPtr         sfp_old_locus_tag;
28316   CharPtr         gene_old_locus_tag;
28317   Boolean         bypassGeneTest;
28318   Boolean         dicistronic = FALSE;
28319   Int2            inferenceCode;
28320   Boolean         hasInference = FALSE;
28321   Boolean         hasExperiment = FALSE;
28322   Boolean         accn_seqid;
28323   SeqDescrPtr     sdp;
28324   SeqMgrDescContext dcontext;
28325   MolInfoPtr      mip;
28326   Boolean         farFetchProd;
28327   Boolean         skip;
28328   Boolean         is_nc = FALSE;
28329   VariationRefPtr  vrfp;
28330 
28331   vsp = (ValidStructPtr) (gcp->userdata);
28332   sfp = (SeqFeatPtr) (gcp->thisitem);
28333   vsp->descr = NULL;
28334   vsp->sfp = sfp;
28335   type = (Int2) (sfp->data.choice);
28336 
28337   ValidateSeqLoc (vsp, sfp->location, (sfp->data.choice == SEQFEAT_GENE || !IsGenomicPipeline(vsp)), "Location");
28338 
28339   ValidateSeqLoc (vsp, sfp->product, TRUE, "Product");
28340 
28341   CheckForBothOrBothRev (vsp, sfp);
28342 
28343   if (vsp->feat_loc_has_gi) {
28344     accn_seqid = FALSE;
28345     VisitSeqIdsInSeqLoc (sfp->location, (Pointer) &accn_seqid, LookForAccnLocs);
28346     if (accn_seqid) {
28347       if (! vsp->is_smupd_in_sep && !vsp->is_gpipe_in_sep) {
28348         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureRefersToAccession, "Feature location refers to accession");
28349       }
28350     }
28351   }
28352 
28353   if (vsp->feat_prod_has_gi) {
28354     accn_seqid = FALSE;
28355     VisitSeqIdsInSeqLoc (sfp->product, (Pointer) &accn_seqid, LookForAccnLocs);
28356     if (accn_seqid) {
28357       if (! vsp->is_smupd_in_sep && !vsp->is_gpipe_in_sep) {
28358         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_FeatureRefersToAccession, "Feature product refers to accession");
28359       }
28360     }
28361   }
28362 
28363   farFetchProd = (Boolean) (vsp->farFetchCDSproducts || vsp->farFetchMRNAproducts);
28364   partials[0] = SeqLocPartialCheckEx (sfp->product, farFetchProd);
28365   partials[1] = SeqLocPartialCheck (sfp->location);
28366 
28367   CheckCodingRegionAndProteinFeaturePartials (sfp, vsp);
28368 
28369   if ((partials[0] != SLP_COMPLETE) || (partials[1] != SLP_COMPLETE) || (sfp->partial)) {       /* partialness */
    /* a feature on a partial sequence should be partial -- it often isn't */
28371     if ((!sfp->partial) && (partials[1] != SLP_COMPLETE) && (sfp->location->choice == SEQLOC_WHOLE)) {
28372       //LCOV_EXCL_START
28373       //BasicCleanup changes whole locations to ints
28374       ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_PartialProblem, "On partial Bioseq, SeqFeat.partial should be TRUE");
28375       //LCOV_EXCL_STOP
28376     }
28377     /* a partial feature, with complete location, but partial product */
28378     else if ((sfp->partial) && (sfp->product != NULL) && (partials[1] == SLP_COMPLETE) && (sfp->product->choice == SEQLOC_WHOLE)
28379              && (partials[0] != SLP_COMPLETE)) {
28380       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PartialProblem, "When SeqFeat.product is a partial Bioseq, SeqFeat.location should also be partial");
28381     }
28382     /* gene on segmented set is now 'order', should also be partial */
28383     else if (type == SEQFEAT_GENE && sfp->product == NULL && partials[1] == SLP_INTERNAL) {
28384       if (!sfp->partial) {
28385         //LCOV_EXCL_START
28386         //BasicCleanup sets partial flag for ordered locations
28387         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PartialProblem, "Gene of 'order' with otherwise complete location should have partial flag set");
28388         //LCOV_EXCL_STOP
28389       }
28390     }
28391     /* inconsistent combination of partial/complete product,location,partial flag - part 1 */
28392     else if (((partials[0] == SLP_COMPLETE) && (sfp->product != NULL))) {
28393       sev = SEV_WARNING;
28394       bsp = GetBioseqGivenSeqLoc (sfp->product, gcp->entityID);
28395       /* if not local bioseq product, lower severity */
28396       if (bsp == NULL) {
28397         sev = SEV_INFO;
28398       }
28399       tmp = StringMove (buf, "Inconsistent: ");
28400       if (sfp->product != NULL) {
28401         tmp = StringMove (tmp, "Product= ");
28402         if (partials[0])
28403           tmp = StringMove (tmp, "partial, ");
28404         else
28405           tmp = StringMove (tmp, "complete, ");
28406       }
28407       tmp = StringMove (tmp, "Location= ");
28408       if (partials[1])
28409         tmp = StringMove (tmp, "partial, ");
28410       else
28411         tmp = StringMove (tmp, "complete, ");
28412       tmp = StringMove (tmp, "Feature.partial= ");
28413       if (sfp->partial)
28414         tmp = StringMove (tmp, "TRUE");
28415       else
28416         tmp = StringMove (tmp, "FALSE");
28417       if (bsp == NULL && LocationIsFar (sfp->product) && NoFetchFunctions ()) {
28418         vsp->far_fetch_failure = TRUE;
28419       } else {
28420         ValidErr (vsp, sev, ERR_SEQ_FEAT_PartialsInconsistent, buf);
28421       }
28422     /* inconsistent combination of partial/complete product,location,partial flag - part 2 */
28423     } else if ((partials[1] == SLP_COMPLETE) || (!sfp->partial)) {
28424       tmp = StringMove (buf, "Inconsistent: ");
28425       if (sfp->product != NULL) {
28426         tmp = StringMove (tmp, "Product= ");
28427         if (partials[0])
28428           tmp = StringMove (tmp, "partial, ");
28429         else
28430           tmp = StringMove (tmp, "complete, ");
28431       }
28432       tmp = StringMove (tmp, "Location= ");
28433       if (partials[1])
28434         tmp = StringMove (tmp, "partial, ");
28435       else
28436         tmp = StringMove (tmp, "complete, ");
28437       tmp = StringMove (tmp, "Feature.partial= ");
28438       if (sfp->partial)
28439         tmp = StringMove (tmp, "TRUE");
28440       else
28441         tmp = StringMove (tmp, "FALSE");
28442       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PartialsInconsistent, buf);
28443     }
28444     /* 5' or 3' partial location giving unclassified partial product */
28445     else if (((partials [1] & SLP_START) != 0 || ((partials [1] & SLP_STOP) != 0)) && ((partials [0] & SLP_OTHER) != 0) && sfp->partial) {
28446       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PartialProblem, "5' or 3' partial location should not have unclassified partial in product molinfo descriptor");
28447     }
28448 
28449     /* may have other error bits set as well */
28450 
28451     /* PartialProduct */
28452     errtype = SLP_NOSTART;
28453     for (j = 0; j < 4; j++) {
28454       bypassGeneTest = FALSE;
28455       if (partials[0] & errtype) {
28456         if (sfp->data.choice == SEQFEAT_CDREGION && sfp->excpt &&
28457                    StringStr (sfp->except_text, "rearrangement required for product") != NULL) {
28458         } else if (sfp->data.choice == SEQFEAT_CDREGION && j == 0) {
28459           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PartialProblem,
28460                 "PartialProduct: 5' partial is not at start AND is not at consensus splice site");
28461         } else if (sfp->data.choice == SEQFEAT_CDREGION && j == 1) {
28462           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PartialProblem,
28463                 "PartialProduct: 3' partial is not at stop AND is not at consensus splice site");
28464         } else {
28465           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PartialProblem,
28466             "PartialProduct: %s", parterrs[j]);
28467         }
28468       }
28469       errtype <<= 1;
28470     }
28471 
28472     /* PartialLocation */
28473     errtype = SLP_NOSTART;
28474     for (j = 0; j < 4; j++) {
28475       bypassGeneTest = FALSE;
28476       badseq = FALSE;
28477       if (partials[1] & errtype) {
28478         if (j == 3) {
28479           if (LocationIsFar (sfp->location) && NoFetchFunctions ()) {
28480             vsp->far_fetch_failure = TRUE;
28481           } else if (sfp->data.choice == SEQFEAT_CDREGION && sfp->excpt &&
28482                      StringStr (sfp->except_text, "rearrangement required for product") != NULL) {
28483           } else {
28484             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PartialProblem,
28485               "PartialLocation: Improper use of partial (greater than or less than)");
28486           }
28487         } else if (j == 2) {
28488           if (LocationIsFar (sfp->location) && NoFetchFunctions ()) {
28489             vsp->far_fetch_failure = TRUE;
28490           } else {
28491             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PartialProblem,
28492               "PartialLocation: Internal partial intervals do not include first/last residue of sequence");
28493           }
28494         } else {
28495           if (IsCddFeat (sfp)) {
28496             /* suppresses  warning */
28497           } else if (sfp->data.choice == SEQFEAT_GENE && SameAsCDS (sfp, errtype, NULL)) {
28498           } else if (sfp->data.choice == SEQFEAT_GENE && SameAsMRNA (sfp, errtype)) {
28499           } else if (sfp->idx.subtype == FEATDEF_mRNA && SameAsCDS (sfp, errtype, &bypassGeneTest)) {
28500           } else if (sfp->idx.subtype == FEATDEF_mRNA && (! bypassGeneTest) && SameAsGene (sfp)) {
28501           } else if (sfp->idx.subtype == FEATDEF_exon && SameAsMRNA (sfp, errtype)) {
28502           } else if (LocationIsFar (sfp->location) && NoFetchFunctions ()) {
28503             vsp->far_fetch_failure = TRUE;
28504           } else if (sfp->data.choice == SEQFEAT_CDREGION && SameAsMRNA (sfp, errtype) &&
28505                      PartialAtSpliceSiteOrGap (vsp, sfp->location, errtype, &isgap, &badseq)) {
28506           } else if (PartialAtSpliceSiteOrGap (vsp, sfp->location, errtype, &isgap, &badseq)) {
28507             if (! isgap) {
28508               if (sfp->idx.subtype == FEATDEF_tRNA && j == 0 && AdjacentToIntron (sfp)) {
28509               } else if (sfp->idx.subtype == FEATDEF_CDS && FeatureOnOrganelle (sfp)) {
28510                 if (AdjacentToIntron (sfp)) {
28511                 } else {
28512                   ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_PartialProblem,
28513                               "PartialLocation: %s (organelle does not use standard splice site convention)",
28514                               parterrs[j]);
28515                 }
28516               } else if (sfp->idx.subtype != FEATDEF_CDS || SplicingNotExpected (sfp)) {
28517                 if ( ! sfp->pseudo) {
28518                   ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_PartialProblem,
28519                             "PartialLocation: %s (but is at consensus splice site)",
28520                             parterrs[j]);
28521                 }
28522               } else if (sfp->idx.subtype == FEATDEF_CDS) {
28523                 bsp = BioseqFindFromSeqLoc (sfp->location);
28524                 if (bsp != NULL) {
28525                   sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
28526                   if (sdp != NULL) {
28527                     mip = (MolInfoPtr) sdp->data.ptrvalue;
28528                     if (mip != NULL) {
28529                       if (mip->biomol == MOLECULE_TYPE_MRNA) {
28530                         if ( ! sfp->pseudo) {
28531                           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PartialProblem,
28532                                     "PartialLocation: %s (but is at consensus splice site, but is on an mRNA that is already spliced)",
28533                                     parterrs[j]);
28534                         }
28535                       }
28536                     }
28537                   }
28538                 }
28539               }
28540             }
28541           } else if (badseq) {
28542             ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_PartialProblem,
28543               "PartialLocation: %s (and is at bad sequence)",
28544               parterrs[j]);
28545           } else if (sfp->data.choice == SEQFEAT_CDREGION && sfp->excpt &&
28546                      StringStr (sfp->except_text, "rearrangement required for product") != NULL) {
28547           } else if (sfp->data.choice == SEQFEAT_CDREGION && j == 0) {
28548             if (PartialAtGapOrNs (vsp, sfp->location, errtype) || StringStr (sfp->comment, "coding region disrupted by sequencing gap") != NULL) {
28549             } else {
28550               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PartialProblem,
28551                     "PartialLocation: 5' partial is not at start AND is not at consensus splice site");
28552             }
28553           } else if (sfp->data.choice == SEQFEAT_CDREGION && j == 1) {
28554             if (PartialAtGapOrNs (vsp, sfp->location, errtype) || StringStr (sfp->comment, "coding region disrupted by sequencing gap") != NULL) {
28555             } else {
28556               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_PartialProblem,
28557                     "PartialLocation: 3' partial is not at stop AND is not at consensus splice site");
28558             }
28559           } else if (sfp->idx.subtype == FEATDEF_tRNA && j == 0 && AdjacentToIntron (sfp)) {
28560           } else if (j == 0) {
28561             sev = SEV_WARNING;
28562             if (vsp->genomeSubmission && sfp->idx.subtype == FEATDEF_rRNA) {
28563               sev = SEV_ERROR;
28564             }
28565             ValidErr (vsp, sev, ERR_SEQ_FEAT_PartialProblem,
28566               "PartialLocation: Start does not include first/last residue of sequence");
28567           } else if (j == 1) {
28568             sev = SEV_WARNING;
28569             if (vsp->genomeSubmission && sfp->idx.subtype == FEATDEF_rRNA) {
28570               sev = SEV_ERROR;
28571             }
28572             ValidErr (vsp, sev, ERR_SEQ_FEAT_PartialProblem,
28573               "PartialLocation: Stop does not include first/last residue of sequence");
28574           }
28575         }
28576       }
28577       errtype <<= 1;
28578     }
28579 
28580   }
28581 
28582   CheckForIllegalDbxref (vsp, gcp, sfp->dbxref);
28583 
28584   switch (type) {
28585   case 1:                      /* Gene-ref */
28586     grp = (GeneRefPtr) (sfp->data.value.ptrvalue);
28587     if (grp != NULL) {
28588       if (EmptyOrNullString (grp->locus) &&
28589           EmptyOrNullString (grp->allele) && EmptyOrNullString (grp->desc) &&
28590           EmptyOrNullString (grp->maploc) && EmptyOrNullString (grp->locus_tag) &&
28591           grp->db == NULL && grp->syn == NULL) {
28592         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_GeneRefHasNoData, "There is a gene feature where all fields are empty");
28593       }
28594       if (StringDoesHaveText (grp->locus_tag)) {
28595         multitoken = FALSE;
28596         for (ptr = grp->locus_tag, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
28597           if (IS_WHITESP (ch)) {
28598             multitoken = TRUE;
28599           }
28600         }
28601         if (multitoken) {
28602           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_LocusTagProblem, "Gene locus_tag '%s' should be a single word without any spaces", grp->locus_tag);
28603         }
28604         /* check for matching old_locus_tag */
28605         for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
28606           if (StringCmp (gbq->qual, "old_locus_tag") == 0 && StringCmp (grp->locus_tag, gbq->val) == 0) {
28607             ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_LocusTagProblem, "Gene locus_tag and old_locus_tag '%s' match", grp->locus_tag);
28608           }
28609         }
28610         if (StringDoesHaveText (grp->locus) && StringICmp (grp->locus, grp->locus_tag) == 0) {
28611           ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_LocusTagProblem, "Gene locus and locus_tag '%s' match", grp->locus);
28612         }
28613       }
28614       CheckForIllegalDbxref (vsp, gcp, grp->db);
28615       if (StringDoesHaveText (grp->allele)) {
28616         for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
28617           if (StringCmp (gbq->qual, "allele") == 0 && StringDoesHaveText (gbq->val)) {
28618             if (StringICmp (gbq->val, grp->allele) == 0) {
28619               //LCOV_EXCL_START
28620               //BasicCleanup removes redundant allele qualifier
28621               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_WrongQualOnFeature, "Redundant allele qualifier (%s) on gene", gbq->val);
28622               //LCOV_EXCL_STOP
28623             } else if (sfp->idx.subtype != FEATDEF_variation) {
28624               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_WrongQualOnFeature, "Hidden allele qualifier (%s) on gene", gbq->val);
28625             }
28626           }
28627         }
28628       }
28629       /*
28630       for (vnp = grp->db; vnp != NULL; vnp = vnp->next) {
28631         id = -1;
28632         db = vnp->data.ptrvalue;
28633         if (db && db->db) {
28634           for (i = 0; i < DBNUM; i++) {
28635             if (StringCmp (db->db, dbtag[i]) == 0) {
28636               id = i;
28637               break;
28638             }
28639           }
28640           if (id == -1 || (type != SEQFEAT_CDREGION && id < 4)) {
28641             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref, "Illegal db_xref type %s", db->db);
28642           }
28643         }
28644       }
28645       */
28646       if (grp->locus != NULL && sfp->comment != NULL) {
28647         if (StringCmp (grp->locus, sfp->comment) == 0) {
28648           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_RedundantFields, "Comment has same value as gene locus");
28649         }
28650       }
28651       if (grp->locus != NULL) {
28652         if (HasBadCharacter (grp->locus)) {
28653           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadInternalCharacter, "Gene locus contains undesired character");
28654         }
28655         if (EndsWithBadCharacter (grp->locus)) {
28656           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadTrailingCharacter, "Gene locus ends with undesired character");
28657         }
28658         if (EndsWithHyphen (grp->locus)) {
28659           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadTrailingHyphen, "Gene locus ends with hyphen");
28660         }
28661       }
28662       if (grp->locus_tag != NULL && sfp->comment != NULL) {
28663         if (StringCmp (grp->locus_tag, sfp->comment) == 0) {
28664           //LCOV_EXCL_START
28665           //BasicCleanup removes redundant comment
28666           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_RedundantFields, "Comment has same value as gene locus_tag");
28667           //LCOV_EXCL_STOP
28668         }
28669       }
28670       if (StringDoesHaveText (grp->locus_tag)) {
28671         for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
28672           if (StringCmp (gbq->qual, "old_locus_tag") == 0 && StringDoesHaveText (gbq->val)) {
28673             if (StringICmp (gbq->val, grp->locus_tag) == 0) {
28674               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_RedundantFields, "old_locus_tag has same value as gene locus_tag");
28675             }
28676           }
28677         }
28678       }
28679       if (grp->syn != NULL && (vsp->is_refseq_in_sep /* || vsp->seqSubmitParent */)) {
28680         for (vnp = grp->syn; vnp != NULL; vnp = vnp->next) {
28681           str = (CharPtr) vnp->data.ptrvalue;
28682           if (StringHasNoText (str)) continue;
28683           if (NameInList (str, badGeneSyn, sizeof (badGeneSyn) / sizeof (badGeneSyn [0]))) {
28684             ValidErr (vsp, vsp->is_gpipe_in_sep ? SEV_INFO : SEV_WARNING,
28685                 ERR_SEQ_FEAT_UndesiredGeneSynonym, "Uninformative gene synonym '%s'", str);
28686           }
28687         }
28688       }
28689       if (grp->syn != NULL && (vsp->is_refseq_in_sep || vsp->seqSubmitParent)) {
28690         for (vnp = grp->syn; vnp != NULL; vnp = vnp->next) {
28691           str = (CharPtr) vnp->data.ptrvalue;
28692           if (StringHasNoText (str)) continue;
28693           if (StringDoesHaveText (grp->locus) && StringCmp (grp->locus, str) == 0) {
28694             //LCOV_EXCL_START
28695             //BasicCleanup removes redundant gene synonym
28696             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UndesiredGeneSynonym, "gene synonym has same value as gene locus");
28697             //LCOV_EXCL_STOP
28698           }
28699         }
28700       }
28701       if (!vsp->is_gpipe_in_sep) {
28702           if (grp->syn != NULL) {
28703             bsp = BioseqFindFromSeqLoc (sfp->location);
28704             for (vnp = grp->syn; vnp != NULL; vnp = vnp->next) {
28705               str = (CharPtr) vnp->data.ptrvalue;
28706               if (StringHasNoText (str)) continue;
28707               sfpx = SeqMgrGetFeatureByLabel (bsp, str, SEQFEAT_GENE, 0, NULL);
28708               if (sfpx != NULL && sfpx != sfp) {
28709                 ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IdenticalGeneSymbolAndSynonym, "gene synonym has same value (%s) as locus of another gene feature", str);
28710               }
28711             }
28712           }
28713       }
28714       if (!vsp->is_gpipe_in_sep && StringDoesHaveText (grp->locus) && StringDoesHaveText (grp->desc) && StringCmp (grp->locus, grp->desc) == 0) {
28715         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UndesiredGeneSynonym, "gene description has same value as gene locus");
28716       }
28717       if (StringHasNoText (grp->locus) && StringHasNoText (grp->desc) && grp->syn != NULL) {
28718         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UndesiredGeneSynonym, "gene synonym without gene locus or description");
28719       }
28720       if (StringDoesHaveText (grp->desc) && StringStr (grp->desc,"..") != NULL) {
28721         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_WrongQualOnFeature, "Possible location text (%s) on gene description", grp->desc);
28722       }
28723       /* - need to ignore if curated drosophila - add to vsp internal flags for efficiency?
28724       if (StringDoesHaveText (grp->locus)) {
28725         for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
28726           if (StringCmp (gbq->qual, "old_locus_tag") == 0 && StringDoesHaveText (gbq->val)) {
28727             if (StringICmp (gbq->val, grp->locus) == 0) {
28728               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_RedundantFields, "old_locus_tag has same value as gene locus");
28729             }
28730           }
28731         }
28732       }
28733       */
28734       if (StringHasSgml (vsp, grp->locus)) {
28735         ValidErr (vsp, SEV_WARNING, ERR_GENERIC_SgmlPresentInText, "gene locus %s has SGML", grp->locus);
28736       }
28737       if (StringHasSgml (vsp, grp->locus_tag)) {
28738         ValidErr (vsp, SEV_WARNING, ERR_GENERIC_SgmlPresentInText, "gene locus_tag %s has SGML", grp->locus_tag);
28739       }
28740       if (StringHasSgml (vsp, grp->desc)) {
28741         ValidErr (vsp, SEV_WARNING, ERR_GENERIC_SgmlPresentInText, "gene description %s has SGML", grp->desc);
28742       }
28743       for (vnp = grp->syn; vnp != NULL; vnp = vnp->next) {
28744         str = (CharPtr) vnp->data.ptrvalue;
28745         if (StringHasSgml (vsp, str)) {
28746           ValidErr (vsp, SEV_WARNING, ERR_GENERIC_SgmlPresentInText, "gene synonym %s has SGML", str);
28747         }
28748       }
28749       if (StringDoesHaveText (grp->locus)) {
28750         bsp = BioseqFindFromSeqLoc (sfp->location);
28751         sfpx = SeqMgrGetGeneByLocusTag (bsp, grp->locus, &fcontext);
28752         if (sfpx != NULL) {
28753           grpx = (GeneRefPtr) sfpx->data.value.ptrvalue;
28754           if (grpx != NULL) {
28755             if (grp == grpx) {
28756               /*
28757               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_LocusCollidesWithLocusTag, "locus collides with locus_tag in same gene");
28758               */
28759             } else {
28760               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_LocusCollidesWithLocusTag, "locus collides with locus_tag in another gene");
28761             }
28762           }
28763         }
28764       }
28765     }
28766     break;
28767   case 2:                      /* Org-ref */
28768     break;
28769   case 3:                      /* Cdregion */
28770     CheckForShortExons(vsp, sfp->location);
28771     pseudo = sfp->pseudo;       /* now also uses new feature pseudo flag */
28772     excpt = FALSE;
28773     conflict = FALSE;
28774     codonqual = FALSE;
28775     crp = (CdRegionPtr) (sfp->data.value.ptrvalue);
28776     if (crp != NULL) {
28777       conflict = crp->conflict;
28778     }
28779     protidqual = FALSE;
28780     transidqual = FALSE;
28781     ovgenepseudo = FALSE;
28782     gene_synonym_on_cds = FALSE;
28783     gbq = sfp->qual;
28784     while (gbq != NULL) {
28785       if (StringICmp (gbq->qual, "pseudo") == 0) {
28786         pseudo = TRUE;
28787       }
28788       if (StringICmp (gbq->qual, "exception") == 0) {
28789         excpt = TRUE;
28790       }
28791       if (StringICmp (gbq->qual, "codon") == 0) {
28792         codonqual = TRUE;
28793       }
28794       if (StringICmp (gbq->qual, "protein_id") == 0) {
28795         protidqual = TRUE;
28796       }
28797       if (StringICmp (gbq->qual, "transcript_id") == 0) {
28798         transidqual = TRUE;
28799       }
28800       if (StringICmp (gbq->qual, "gene_synonym") == 0) {
28801         gene_synonym_on_cds = TRUE;
28802       }
28803       gbq = gbq->next;
28804     }
28805     if (protidqual) {
28806       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_WrongQualOnFeature, "protein_id should not be a gbqual on a CDS feature");
28807     }
28808     if (transidqual) {
28809       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_WrongQualOnFeature, "transcript_id should not be a gbqual on a CDS feature");
28810     }
28811     if (gene_synonym_on_cds) {
28812       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_WrongQualOnFeature, "gene_synonym should not be a gbqual on a CDS feature");
28813     }
28814     if (OverlappingGeneIsPseudo (sfp)) {
28815       pseudo = TRUE;
28816       ovgenepseudo = TRUE;
28817     }
28818     if ((!pseudo) && (!conflict)) {
28819       CdTransCheck (vsp, sfp);
28820       SpliceCheck (vsp, sfp);
28821     } else if (conflict) {
28822       CdConflictCheck (vsp, sfp);
28823     }
28824     CdsProductIdCheck (vsp, sfp);
28825     crp = (CdRegionPtr) (sfp->data.value.ptrvalue);
28826     if (crp != NULL) {
28827       if (crp->code_break != NULL && StringISearch (sfp->except_text, "RNA editing") != NULL) {
28828         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_TranslExceptAndRnaEditing, "CDS has both RNA editing /exception and /transl_except qualifiers");
28829       }
28830       prevcbp = NULL;
28831       for (cbp = crp->code_break; cbp != NULL; cbp = cbp->next) {
28832         i = SeqLocCompare (cbp->loc, sfp->location);
28833         if ((i != SLC_A_IN_B) && (i != SLC_A_EQ_B)) {
28834           ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_Range, "Code-break location not in coding region");
28835         } else if (sfp->product != NULL) {
28836           slp = dnaLoc_to_aaLoc (sfp, cbp->loc, TRUE, NULL, TRUE);
28837           if (slp == NULL) {
28838             ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_Range, "Code-break location not in coding region - may be frame problem");
28839           }
28840           SeqLocFree (slp);
28841         }
28842         if (prevcbp != NULL) {
28843           i = SeqLocCompare (cbp->loc, prevcbp->loc);
28844           if (i == SLC_A_EQ_B) {
28845             ctmp = SeqLocPrint (cbp->loc);
28846             if (ctmp != NULL) {
28847               ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_DuplicateTranslExcept, "Multiple code-breaks at same location [%s]", ctmp);
28848             } else {
28849               ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_DuplicateTranslExcept, "Multiple code-breaks at same location");
28850             }
28851             MemFree (ctmp);
28852           }
28853         }
28854         prevcbp = cbp;
28855       }
28856       if (excpt && (!sfp->excpt)) {
28857         //LCOV_EXCL_START
28858         //BasicCleanup converts "exception" gbqual to except_text and sets except flag
28859         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ExceptInconsistent, "Exception flag should be set in coding region");
28860         //LCOV_EXCL_STOP
28861       }
28862       if (crp->orf && sfp->product != NULL) {
28863         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_OrfCdsHasProduct, "An ORF coding region should not have a product");
28864       }
28865       if (pseudo && sfp->product != NULL) {
28866         if (ovgenepseudo) {
28867           if (sfp->pseudo) {
28868             ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_PseudoCdsHasProduct, "A pseudo coding region should not have a product");
28869           } else {
28870             ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_PseudoCdsViaGeneHasProduct, "A coding region overlapped by a pseudogene should not have a product");
28871           }
28872         } else {
28873           ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_PseudoCdsHasProduct, "A pseudo coding region should not have a product");
28874         }
28875       }
28876 
28877       if (codonqual) {
28878         ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_CodonQualifierUsed, "Use the proper genetic code, if available, or set transl_excepts on specific codons");
28879       }
28880       biopgencode = 0;
28881       cdsgencode = 0;
28882       bsp = GetBioseqGivenSeqLoc (sfp->location, gcp->entityID);
28883       if (bsp != NULL) {
28884         vnp = NULL;
28885         if (vsp->useSeqMgrIndexes) {
28886           vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
28887         } else {
28888 //LCOV_EXCL_START
28889           bcp = BioseqContextNew (bsp);
28890           vnp = BioseqContextGetSeqDescr (bcp, Seq_descr_source, NULL, NULL);
28891 //LCOV_EXCL_STOP
28892         }
28893         if (vnp != NULL && vnp->data.ptrvalue != NULL) {
28894           plastid = FALSE;
28895           biop = (BioSourcePtr) vnp->data.ptrvalue;
28896           orp = biop->org;
28897           if (orp != NULL && orp->orgname != NULL) {
28898             onp = orp->orgname;
28899             if (biop->genome == GENOME_kinetoplast ||
28900                 biop->genome == GENOME_mitochondrion ||
28901                 biop->genome == GENOME_hydrogenosome) {
28902               biopgencode = onp->mgcode;
28903             } else if (biop->genome == GENOME_chloroplast ||
28904                        biop->genome == GENOME_chromoplast ||
28905                        biop->genome == GENOME_plastid ||
28906                        biop->genome == GENOME_cyanelle ||
28907                        biop->genome == GENOME_apicoplast ||
28908                        biop->genome == GENOME_leucoplast ||
28909                        biop->genome == GENOME_proplastid ||
28910                        biop->genome == GENOME_chromatophore) {
28911               if (onp->pgcode > 0) {
28912                 biopgencode = onp->pgcode;
28913               } else {
28914                 biopgencode = 11;
28915               }
28916               plastid = TRUE;
28917             } else {
28918               biopgencode = onp->gcode;
28919             }
28920             gc = crp->genetic_code;
28921             if (gc != NULL) {
28922               for (vnp = gc->data.ptrvalue; vnp != NULL; vnp = vnp->next) {
28923                 if (vnp->choice == 2) {
28924                   cdsgencode = (Int2) vnp->data.intvalue;
28925                 }
28926               }
28927             }
28928             if (biopgencode != cdsgencode && StringISearch (sfp->except_text, kAllowManualGenCodeException) == NULL) {
28929               if (! vsp->seqSubmitParent) { /* suppress when validator run from tbl2asn */
28930                 if (plastid) {
28931                   ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_GenCodeMismatch,
28932                             "Genetic code conflict between CDS (code %d) and BioSource.genome biological context (%s) (uses code 11)", (int) cdsgencode, plastidtxt [biop->genome]);
28933                 } else {
28934                   ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_GenCodeMismatch,
28935                             "Genetic code conflict between CDS (code %d) and BioSource (code %d)", (int) cdsgencode, (int) biopgencode);
28936                 }
28937               }
28938             }
28939           }
28940         }
28941         if (!vsp->useSeqMgrIndexes) {
28942           BioseqContextFree (bcp);
28943         }
28944       }
28945     }
28946     /* CheckForBothStrands (vsp, sfp); */
28947     CheckForBadGeneOverlap (vsp, sfp);
28948     CheckForBadMRNAOverlap (vsp, sfp);
28949     CheckForCommonCDSProduct (vsp, sfp);
28950     CheckCDSPartial (vsp, sfp);
28951     if (StringDoesHaveText (sfp->comment)) {
28952       if (LookForECnumberPattern (sfp->comment)) {
28953         skip = FALSE;
28954         bsp = BioseqFindFromSeqLoc (sfp->product);
28955         if (bsp != NULL && ISA_aa (bsp->mol)) {
28956           prt = SeqMgrGetBestProteinFeature (bsp, NULL);
28957           if (prt != NULL && prt->data.choice == SEQFEAT_PROT) {
28958             prp = (ProtRefPtr) prt->data.value.ptrvalue;
28959             if (prp != NULL) {
28960               for (vnp = prp->ec; vnp != NULL; vnp = vnp->next) {
28961                 str = (CharPtr) vnp->data.ptrvalue;
28962                 if (StringHasNoText (str)) continue;
28963                 if (StringStr (sfp->comment, str) != NULL) {
28964                   skip = TRUE;
28965                 }
28966                 skip = TRUE; /* now suppress even if EC numbers are different */
28967               }
28968             }
28969           }
28970         }
28971         if (! skip) {
28972           ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_EcNumberProblem, "Apparent EC number in CDS comment");
28973         }
28974       }
28975     }
28976     break;
28977   case 4:                      /* Prot-ref */
28978     prp = (ProtRefPtr) (sfp->data.value.ptrvalue);
28979     if (prp != NULL) {
28980       if (prp->processed != 3 && prp->processed != 4) {
28981         vnp = prp->name;
28982         if ((vnp == NULL || EmptyOrNullString ((CharPtr) vnp->data.ptrvalue)) &&
28983             EmptyOrNullString (prp->desc) && prp->ec == NULL && prp->activity == NULL && prp->db == NULL) {
28984           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ProtRefHasNoData, "There is a protein feature where all fields are empty");
28985         }
28986         if (vnp != NULL) {
28987           str = (CharPtr) vnp->data.ptrvalue;
28988           if (StringDoesHaveText (str)) {
28989             if (! vsp->is_embl_tpe_in_sep) {
28990               TestForBracketsInProductName (str, vsp);
28991             }
28992             if (StringNICmp (str, "hypothetical protein XP_", 24) == 0) {
28993               bsp = GetBioseqGivenSeqLoc (sfp->location, gcp->entityID);
28994               if (bsp != NULL) {
28995                 for (sip = bsp->id; sip != NULL; sip = sip->next) {
28996                   if (sip->choice != SEQID_OTHER) continue;
28997                   tsip = (TextSeqIdPtr) sip->data.ptrvalue;
28998                   if (tsip == NULL) continue;
28999                   if (StringICmp (tsip->accession, str + 21) != 0) {
29000                     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_HpotheticalProteinMismatch, "Hypothetical protein reference does not match accession");
29001                   }
29002                 }
29003               }
29004             }
29005             if (prp->ec != NULL) {
29006               if (StringCmp (str, "Hypothetical protein") == 0 ||
29007                   StringCmp (str, "hypothetical protein") == 0 ||
29008                   StringCmp (str, "Unknown protein") == 0 ||
29009                   StringCmp (str, "unknown protein") == 0) {
29010                 ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadProteinName, "Unknown or hypothetical protein should not have EC number");
29011               }
29012             }
29013             if (LookForECnumberPattern (str)) {
29014               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_EcNumberProblem, "Apparent EC number in protein title");
29015             }
29016             if (vsp->rubiscoTest && StringStr (str, "ribulose") != NULL && StringStr (str, "bisphosphate") != NULL) {
29017                 //LCOV_EXCL_START
29018                 //no option to enable rubisco test, problems handled by basic cleanup
29019               if (StringStr (str, "methyltransferase") == NULL && StringStr (str, "activase") == NULL) {
29020                 if (StringICmp (str, "ribulose-1,5-bisphosphate carboxylase/oxygenase") == 0) {
29021                   /* allow standard name without large or small subunit designation - later need kingdom test */
29022                 } else if (StringICmp (str, "ribulose-1,5-bisphosphate carboxylase/oxygenase large subunit") != 0 &&
29023                     StringICmp (str, "ribulose-1,5-bisphosphate carboxylase/oxygenase small subunit") != 0) {
29024                   ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_RubiscoProblem, "Nonstandard ribulose bisphosphate protein name");
29025                 }
29026               }
29027               //LCOV_EXCL_STOP
29028             }
29029             if (StringHasPMID (str)) {
29030               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ProteinNameHasPMID, "Protein name has internal PMID");
29031             }
29032           }
29033           if (str != NULL && sfp->comment != NULL) {
29034             if (StringCmp (str, sfp->comment) == 0) {
29035               //LCOV_EXCL_START
29036               //BasicCleanup removes redundant comment
29037               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_RedundantFields, "Comment has same value as protein name");
29038               //LCOV_EXCL_STOP
29039             }
29040           }
29041           if (StringDoesHaveText (sfp->comment)) {
29042             if (LookForECnumberPattern (sfp->comment)) {
29043               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_EcNumberProblem, "Apparent EC number in protein comment");
29044             }
29045           }
29046         }
29047       }
29048       CheckForIllegalDbxref (vsp, gcp, prp->db);
29049       /*
29050       for (vnp = prp->db; vnp != NULL; vnp = vnp->next) {
29051         id = -1;
29052         db = vnp->data.ptrvalue;
29053         if (db && db->db) {
29054           for (i = 0; i < DBNUM; i++) {
29055             if (StringCmp (db->db, dbtag[i]) == 0) {
29056               id = i;
29057               break;
29058             }
29059           }
29060           if (id == -1 || (type != SEQFEAT_CDREGION && id < 4)) {
29061             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_IllegalDbXref, "Illegal db_xref type %s", db->db);
29062           }
29063         }
29064       }
29065       */
29066       if (prp->name == NULL && prp->processed != 3 && prp->processed != 4) {
29067         if (StringDoesHaveText (prp->desc)) {
29068           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_NoNameForProtein, "Protein feature has description but no name");
29069         } else if (prp->activity != NULL) {
29070           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_NoNameForProtein, "Protein feature has function but no name");
29071         } else if (prp->ec != NULL) {
29072           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_NoNameForProtein, "Protein feature has EC number but no name");
29073         } else {
29074           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_NoNameForProtein, "Protein feature has no name");
29075         }
29076       }
29077       if (prp->desc != NULL && sfp->comment != NULL) {
29078         if (StringCmp (prp->desc, sfp->comment) == 0) {
29079           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_RedundantFields, "Comment has same value as protein description");
29080         }
29081       }
29082       for (vnp = prp->ec; vnp != NULL; vnp = vnp->next) {
29083         str = (CharPtr) vnp->data.ptrvalue;
29084         if (StringDoesHaveText (str)) {
29085           if (! ValidateECnumber (str)) {
29086             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadEcNumberFormat, "%s is not in proper EC_number format", str);
29087           } else if (ECnumberNotInList (str)) {
29088             if (ECnumberWasDeleted (str)) {
29089               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_DeletedEcNumber, "EC_number %s was deleted", str);
29090             } else if (ECnumberWasReplaced (str)) {
29091               if (EcCnumberWasSplit (vsp, str)) {
29092                  ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_SplitEcNumber, "EC_number %s was transferred and is no longer valid", str);
29093              } else {
29094                 ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ReplacedEcNumber, "EC_number %s was transferred and is no longer valid", str);
29095               }
29096             } else {
29097               StringNCpy_0 (buf, str, sizeof (buf));
29098               ptr = StringChr (buf, 'n');
29099               if (ptr != NULL) {
29100                 ch = ptr [1];
29101                 if (IS_DIGIT (ch)) {
29102                   ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_BadEcNumberValue, "%s is not a legal preliminary value for qualifier EC_number", str);
29103                 } else {
29104                   ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadEcNumberValue, "%s is not a legal value for qualifier EC_number", str);
29105                 }
29106               } else {
29107                 ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadEcNumberValue, "%s is not a legal value for qualifier EC_number", str);
29108               }
29109             }
29110           }
29111         } else {
29112           //LCOV_EXCL_START
29113           //BasicCleanup removes empty EC numbers
29114           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_EcNumberProblem, "EC number should not be empty");
29115           //LCOV_EXCL_STOP
29116         }
29117       }
29118     }
29119     if (prp != NULL && prp->name != NULL && (vsp->is_refseq_in_sep /* || vsp->seqSubmitParent */)) {
29120       for (vnp = prp->name; vnp != NULL; vnp = vnp->next) {
29121         str = (CharPtr) vnp->data.ptrvalue;
29122         if (StringHasNoText (str)) continue;
29123           if (NameInList (str, badProtName, sizeof (badProtName) / sizeof (badProtName [0]))) {
29124             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UndesiredProteinName, "Uninformative protein name '%s'", str);
29125           } else if (StringStr (str, "=") != NULL ||
29126                      StringStr (str, "~") != NULL ||
29127                      StringISearch (str, "uniprot") != NULL ||
29128                      StringISearch (str, "uniprotkb") != NULL ||
29129                      StringISearch (str, "pmid") != NULL ||
29130                      StringISearch (str, "dbxref") != NULL) {
29131             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UndesiredProteinName, "Uninformative protein name '%s'", str);
29132           }
29133         }
29134       }
29135       if (prp != NULL) {
29136         for (vnp = prp->name; vnp != NULL; vnp = vnp->next) {
29137           str = (CharPtr) vnp->data.ptrvalue;
29138           if (StringHasNoText (str)) continue;
29139           if (HasBadCharacter (str)) {
29140             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadInternalCharacter, "Protein name contains undesired character");
29141           }
29142           if (EndsWithBadCharacter (str)) {
29143             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadTrailingCharacter, "Protein name ends with undesired character");
29144           }
29145           if (EndsWithHyphen (str)) {
29146             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadTrailingHyphen, "Protein name ends with hyphen");
29147           }
29148         }
29149         for (vnp = prp->name; vnp != NULL; vnp = vnp->next) {
29150           str = (CharPtr) vnp->data.ptrvalue;
29151           if (StringHasSgml (vsp, str)) {
29152             ValidErr (vsp, SEV_WARNING, ERR_GENERIC_SgmlPresentInText, "protein name %s has SGML", str);
29153           }
29154         }
29155         if (StringHasSgml (vsp, prp->desc)) {
29156           ValidErr (vsp, SEV_WARNING, ERR_GENERIC_SgmlPresentInText, "protein description %s has SGML", prp->desc);
29157         }
29158       }
29159     break;
29160   case 5:                      /* RNA-ref */
29161     ValidateRna(sfp, vsp, gcp);
29162 
29163     break;
29164   case 6:                      /* Pub */
29165     pdp = (PubdescPtr) sfp->data.value.ptrvalue;
29166     /*
29167        ValidatePubdesc (vsp, pdp);
29168      */
29169     break;
29170   case 7:                      /* Seq */
29171     break;
29172   case 8:                      /* Imp-feat */
29173     ifp = (ImpFeatPtr) sfp->data.value.ptrvalue;
29174     if (vsp->validateExons) {
29175 
29176       if (ifp != NULL && StringICmp (ifp->key, "exon") == 0 && (! sfp->pseudo)) {
29177         skip = FALSE;
29178         bsp = BioseqFindFromSeqLoc (sfp->location);
29179         if (bsp != NULL) {
29180           sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
29181           if (sdp != NULL) {
29182             mip = (MolInfoPtr) sdp->data.ptrvalue;
29183             if (mip != NULL) {
29184               if (mip->biomol == MOLECULE_TYPE_MRNA) {
29185                 skip = TRUE;
29186               }
29187             }
29188           }
29189         }
29190         if (! skip) {
29191           SpliceCheckEx (vsp, sfp, TRUE);
29192         }
29193       }
29194     }
29195     if (ifp != NULL) {
29196       ValidateImpFeat (vsp, gcp, sfp, ifp);
29197     }
29198     break;
29199   case 9:                      /* Region */
29200     break;
29201   case 10:                     /* Comment */
29202     break;
29203   case 11:                     /* Bond */
29204     break;
29205   case 12:                     /* Site */
29206     break;
29207   case 13:                     /* Rsite-ref */
29208     break;
29209   case 14:                     /* User-object */
29210     break;
29211   case 15:                     /* TxInit */
29212     break;
29213   case 16:                     /* Numbering */
29214     break;
29215   case 17:                     /* Secondary Structure */
29216     break;
29217   case 18:                     /* NonStdRes */
29218     break;
29219   case 19:                     /* Heterogen */
29220     break;
29221   case 20:                     /* BioSource */
29222     biop = (BioSourcePtr) sfp->data.value.ptrvalue;
29223     if (biop != NULL && biop->is_focus) {
29224       ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_FocusOnBioSourceFeature, "Focus must be on BioSource descriptor, not BioSource feature.");
29225     }
29226     if (biop != NULL) {
29227       orp = biop->org;
29228       if (orp != NULL) {
29229         bsp = GetBioseqGivenSeqLoc (sfp->location, gcp->entityID);
29230         if (bsp != NULL) {
29231           vnp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
29232           if (vnp != NULL) {
29233             dbiop = (BioSourcePtr) vnp->data.ptrvalue;
29234             if (dbiop != NULL) {
29235               dorp = dbiop->org;
29236               if (dorp != NULL) {
29237                 if (!StringHasNoText (orp->taxname)) {
29238                   if (StringICmp (orp->taxname, dorp->taxname) != 0) {
29239                     if (!dbiop->is_focus) {
29240                       transgenic = FALSE;
29241                       for (ssp = dbiop->subtype; ssp != NULL; ssp = ssp->next) {
29242                         if (ssp->subtype == SUBSRC_transgenic) {
29243                           transgenic = TRUE;
29244                         }
29245                       }
29246                       if (! transgenic) {
29247                         oldEntityID = gcp->entityID;
29248                         oldItemID = gcp->itemID;
29249 
29250                         gcp->entityID = context.entityID;
29251                         gcp->itemID = context.itemID;
29252                         gcp->thistype = OBJ_SEQDESC;
29253 
29254                         ValidErr (vsp, SEV_ERROR, ERR_SEQ_DESCR_BioSourceNeedsFocus,
29255                                   "BioSource descriptor must have focus or transgenic when BioSource feature with different taxname is present.");
29256 
29257                         gcp->entityID = oldEntityID;
29258                         gcp->itemID = oldItemID;
29259                         gcp->thistype = OBJ_SEQFEAT;
29260                       }
29261                     }
29262                   }
29263                 }
29264               }
29265             }
29266           }
29267         }
29268       }
29269     }
29270     /*
29271        ValidateBioSource (vsp, gcp, biop, sfp, NULL);
29272      */
29273     break;
29274   case 21:                     /* CloneRef */
29275     clrp = (CloneRefPtr) sfp->data.value.ptrvalue;
29276     if (clrp != NULL) {
29277     }
29278     break;
29279   case 22:                     /* VariationRef */
29280     vrfp = (VariationRefPtr) sfp->data.value.ptrvalue;
29281     if (vrfp != NULL) {
29282     }
29283     break;
29284   default:
29285       //LCOV_EXCL_START
29286       //invalid ASN.1 cannot be tested in regression
29287     ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidType, "Invalid SeqFeat type [%d]", (int) (type));
29288     break;
29289     //LCOV_EXCL_STOP
29290   }
29291   if (type == SEQFEAT_HET) {
29292     /* heterogen can have mix of bonds with just "a" point specified */
29293     is_seqloc_bond = FALSE;
29294     slp = SeqLocFindNext (sfp->location, NULL);
29295     while (slp != NULL) {
29296       if (slp->choice == SEQLOC_BOND) {
29297         sbp = (SeqBondPtr) slp->data.ptrvalue;
29298         if (sbp != NULL) {
29299           if (sbp->a == NULL || sbp->b != NULL) {
29300             is_seqloc_bond = TRUE;
29301           }
29302         }
29303       }
29304       slp = SeqLocFindNext (sfp->location, slp);
29305     }
29306     if (is_seqloc_bond) {
29307       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ImproperBondLocation, "Bond location should only be on bond features");
29308     }
29309   } else if (type != SEQFEAT_BOND) {
29310     is_seqloc_bond = FALSE;
29311     slp = SeqLocFindNext (sfp->location, NULL);
29312     while (slp != NULL) {
29313       if (slp->choice == SEQLOC_BOND) {
29314         is_seqloc_bond = TRUE;
29315       }
29316       slp = SeqLocFindNext (sfp->location, slp);
29317     }
29318     if (is_seqloc_bond) {
29319       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ImproperBondLocation, "Bond location should only be on bond features");
29320     }
29321   }
29322   if (type != 8) {
29323     ValidateNonImpFeat (vsp, gcp, sfp);
29324   }
29325   if ((! sfp->excpt) && (! StringHasNoText (sfp->except_text))) {
29326     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ExceptInconsistent, "Exception text is present, but exception flag is not set");
29327   }
29328   if ((sfp->excpt) && (StringHasNoText (sfp->except_text))) {
29329     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ExceptInconsistent, "Exception flag is set, but exception text is empty");
29330   }
29331   if (! StringHasNoText (sfp->except_text)) {
29332     ValidateExceptText (vsp, gcp, sfp);
29333   }
29334 
29335   ValidateSeqFeatXref(sfp, vsp);
29336 
29337   if (StringHasSgml (vsp, sfp->comment)) {
29338     ValidErr (vsp, SEV_WARNING, ERR_GENERIC_SgmlPresentInText, "feature comment %s has SGML", sfp->comment);
29339   }
29340 
29341   for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
29342     /* first check for anything other than replace */
29343     if (StringICmp (gbq->qual, "replace") != 0) {
29344       if (JustQuotes (gbq->val)) {
29345         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidQualifierValue, "Qualifier other than replace has just quotation marks");
29346       }
29347     }
    /* now check specific qualifier types */
29349     if (StringICmp (gbq->qual, "inference") == 0) {
29350       hasInference = TRUE;
29351       inferenceCode = ValidateInferenceQualifier (gbq->val, vsp->inferenceAccnCheck);
29352       if (inferenceCode != VALID_INFERENCE) {
29353         if (inferenceCode < VALID_INFERENCE || inferenceCode > UNRECOGNIZED_DATABASE) {
29354           inferenceCode = VALID_INFERENCE;
29355         }
29356         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidInferenceValue, "Inference qualifier problem - %s (%s)",
29357                   infMessage [(int) inferenceCode], (gbq->val != NULL)? gbq->val : "?");
29358       }
29359     } else if (StringICmp (gbq->qual, "experiment") == 0) {
29360       hasExperiment = TRUE;
29361     } else if (StringICmp (gbq->qual, "EC_number") == 0) {
29362       str = gbq->val;
29363       if (StringDoesHaveText (str)) {
29364         if (! ValidateECnumber (str)) {
29365           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadEcNumberFormat, "%s is not in proper EC_number format", str);
29366         } else if (ECnumberNotInList (str)) {
29367           if (ECnumberWasDeleted (str)) {
29368             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_DeletedEcNumber, "EC_number %s was deleted", str);
29369           } else if (ECnumberWasReplaced (str)) {
29370             if (EcCnumberWasSplit (vsp, str)) {
29371               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_SplitEcNumber, "EC_number %s was replaced", str);
29372             } else {
29373               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ReplacedEcNumber, "EC_number %s was replaced", str);
29374             }
29375           } else {
29376             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadEcNumberValue, "%s is not a legal value for qualifier EC_number", str);
29377           }
29378         }
29379       } else {
29380         //LCOV_EXCL_START
29381         //BasicCleanup removes empty EC numbers
29382         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_EcNumberProblem, "EC number should not be empty");
29383         //LCOV_EXCL_STOP
29384       }
29385     } else if (StringICmp (gbq->qual, "old_locus_tag") == 0) {
29386       if (StringChr (gbq->val, ',') != NULL) {
29387         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_LocusTagProblem,
29388                   "old_locus_tag has comma, may contain multiple values");
29389       }
29390       pseudo = FALSE;
29391       if (sfp->pseudo) {
29392         pseudo = TRUE;
29393       }
29394       grp = SeqMgrGetGeneXref (sfp);
29395       if (grp == NULL) {
29396         if (sfp->data.choice == SEQFEAT_GENE) {
29397           grp = (GeneRefPtr) sfp->data.value.ptrvalue;
29398         } else {
29399           gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
29400           if (gene != NULL) {
29401             if (gene->pseudo) {
29402               pseudo = TRUE;
29403             } else {
29404               grp = (GeneRefPtr) gene->data.value.ptrvalue;
29405             }
29406           }
29407         }
29408       }
29409       if (grp == NULL || SeqMgrGeneIsSuppressed (grp) || StringHasNoText (grp->locus_tag)) {
29410         if (! pseudo) {
29411           ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_LocusTagProblem,
29412                     "old_locus_tag without inherited locus_tag");
29413         }
29414       }
29415     }
29416     if (StringHasSgml (vsp, gbq->val)) {
29417       ValidErr (vsp, SEV_WARNING, ERR_GENERIC_SgmlPresentInText, "feature qualifier %s has SGML", gbq->val);
29418     }
29419   }
29420   if (sfp->exp_ev > 0 && (! hasInference) && (! hasExperiment) && (! vsp->feat_loc_has_gi)) {
29421     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidInferenceValue,
29422               "Inference or experiment qualifier missing but obsolete experimental evidence qualifier set");
29423   }
29424 
29425   if (sfp->product != NULL) {
29426     sip = SeqLocId (sfp->product);
29427     if (sip != NULL) {
29428       switch (sip->choice) {
29429         case SEQID_LOCAL :
29430       break;
29431         case SEQID_GENBANK :
29432         case SEQID_EMBL :
29433         case SEQID_DDBJ :
29434         case SEQID_OTHER :
29435         case SEQID_TPG :
29436         case SEQID_TPE :
29437         case SEQID_TPD :
29438         case SEQID_GPIPE :
29439           tsip = (TextSeqIdPtr) sip->data.ptrvalue;
29440           if (tsip != NULL) {
29441             if (tsip->accession == NULL && (! StringHasNoText (tsip->name))) {
29442               if (ValidateAccn (tsip->name) == 0) {
29443                 ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadProductSeqId,
29444                          "Feature product should not put an accession in the Textseq-id 'name' slot");
29445               } else {
29446                 ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadProductSeqId,
29447                          "Feature product should not use Textseq-id 'name' slot");
29448               }
29449             }
29450           }
29451           break;
29452         default :
29453           break;
29454       }
29455     }
29456     bsp = BioseqFindFromSeqLoc (sfp->location);
29457     protBsp = BioseqFindFromSeqLoc (sfp->product);
29458     if (bsp != NULL && protBsp != NULL) {
29459       if (bsp == protBsp) {
29460         ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_SelfReferentialProduct, "Self-referential feature product");
29461       }
29462     }
29463     if (protBsp != NULL && protBsp->id != NULL) {
29464       for (sip = protBsp->id; sip != NULL; sip = sip->next) {
29465         switch (sip->choice) {
29466           case SEQID_GENBANK :
29467           case SEQID_EMBL :
29468           case SEQID_DDBJ :
29469           case SEQID_OTHER :
29470           case SEQID_TPG :
29471           case SEQID_TPE :
29472           case SEQID_TPD :
29473           case SEQID_GPIPE:
29474             tsip = (TextSeqIdPtr) sip->data.ptrvalue;
29475             if (tsip != NULL) {
29476               if (tsip->accession == NULL && (! StringHasNoText (tsip->name))) {
29477                 if (ValidateAccn (tsip->name) == 0) {
29478                   ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadProductSeqId,
29479                             "Protein bioseq has Textseq-id 'name' that looks"
29480                             " like it is derived from a nucleotide accession");
29481                 } else {
29482                   ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_BadProductSeqId,
29483                             "Protein bioseq has Textseq-id 'name' and no accession");
29484                 }
29485               }
29486             }
29487             break;
29488           default :
29489             break;
29490         }
29491       }
29492     }
29493 
29494   }
29495 
29496   if (sfp->ext != NULL) {
29497     ValidateGoTermUserObject (vsp, sfp->ext);
29498   }
29499 
29500   if (type != SEQFEAT_GENE) {
29501     grp = SeqMgrGetGeneXref (sfp);
29502     if (grp == NULL) {
29503       sfpx = SeqMgrGetOverlappingGene (sfp->location, &fcontext);
29504       if (sfpx != NULL) {
29505         grp = (GeneRefPtr) sfpx->data.value.ptrvalue;
29506       }
29507     }
29508     if (grp != NULL && (! SeqMgrGeneIsSuppressed (grp))) {
29509       if (! StringHasNoText (grp->allele)) {
29510         for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
29511           if (StringCmp (gbq->qual, "allele") == 0 && StringDoesHaveText (gbq->val)) {
29512             if (StringICmp (gbq->val, grp->allele) == 0) {
29513               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidQualifierValue, "Redundant allele qualifier (%s) on gene and feature", gbq->val);
29514             } else if (sfp->idx.subtype != FEATDEF_variation) {
29515               ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidQualifierValue, "Mismatched allele qualifier on gene (%s) and feature (%s)", grp->allele, gbq->val);
29516             }
29517           }
29518         }
29519       }
29520     }
29521     grp = SeqMgrGetGeneXref (sfp);
29522     if (grp != NULL && SeqMgrGeneIsSuppressed (grp)) return;
29523 
29524     if (grp == NULL) {
29525       sfpx = SeqMgrGetOverlappingGene (sfp->location, &fcontext);
29526       if (sfpx == NULL || sfpx->data.choice != SEQFEAT_GENE) return;
29527       sfp_old_locus_tag = NULL;
29528       gene_old_locus_tag = NULL;
29529       for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
29530         if (StringCmp (gbq->qual, "old_locus_tag") == 0 && StringDoesHaveText (gbq->val)) {
29531           sfp_old_locus_tag = gbq->val;
29532         }
29533       }
29534       for (gbq = sfpx->qual; gbq != NULL; gbq = gbq->next) {
29535         if (StringCmp (gbq->qual, "old_locus_tag") == 0 && StringDoesHaveText (gbq->val)) {
29536           gene_old_locus_tag = gbq->val;
29537         }
29538       }
29539       if (StringDoesHaveText (sfp_old_locus_tag) && StringDoesHaveText (gene_old_locus_tag)) {
29540         if (StringICmp (sfp_old_locus_tag, gene_old_locus_tag) != 0) {
29541           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_OldLocusTagMismtach,
29542                     "Old locus tag on feature (%s) does not match that on gene (%s)",
29543                     sfp_old_locus_tag, gene_old_locus_tag);
29544         }
29545       }
29546       MemSet ((Pointer) &dsd, 0, sizeof (DummySmfeData));
29547       dsd.max = INT4_MAX;
29548       dsd.num_at_max = 0;
29549       dsd.num_trans_spliced = 0;
29550       dsd.equivalent_genes = FALSE;
29551       dsd.grp_at_max = NULL;
29552       count = SeqMgrGetAllOverlappingFeatures (sfp->location, FEATDEF_GENE, NULL, 0,
29553                                                LOCATION_SUBSET, (Pointer) &dsd, DummySMFEProc);
29554       if (dsd.num_at_max > 1 && sfp->idx.subtype != FEATDEF_repeat_region && sfp->idx.subtype != FEATDEF_mobile_element) {
29555         if (vsp->is_small_genome_set && dsd.num_at_max == dsd.num_trans_spliced) {
29556           /* suppress for trans-spliced genes on small genome set */
29557         } else if (dsd.equivalent_genes) {
29558           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_GeneXrefNeeded,
29559                     "Feature overlapped by %d identical-length equivalent genes but has no cross-reference", (int) dsd.num_at_max);
29560         } else {
29561           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_MissingGeneXref,
29562                     "Feature overlapped by %d identical-length genes but has no cross-reference", (int) dsd.num_at_max);
29563         }
29564       }
29565       return;
29566     }
29567 
29568     if (StringDoesHaveText (grp->locus) /* && sfp->idx.subtype != FEATDEF_tRNA */) {
29569       bsp = BioseqFindFromSeqLoc (sfp->location);
29570       if (bsp != NULL) {
29571         sfpx = SeqMgrGetFeatureByLabel (bsp, grp->locus, SEQFEAT_GENE, 0, &fcontext);
29572         if (sfpx == NULL && ISA_aa (bsp->mol)) {
29573           cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
29574           if (cds != NULL) {
29575             nbsp = BioseqFindFromSeqLoc (cds->location);
29576             if (nbsp != NULL) {
29577               sfpx = SeqMgrGetFeatureByLabel (nbsp, grp->locus, SEQFEAT_GENE, 0, &fcontext);
29578             }
29579           }
29580         }
29581         if (sfpx != NULL) {
29582           sfpy = sfpx;
29583         }
29584         if (sfpx == NULL) {
29585           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_GeneXrefWithoutGene,
29586                     "Feature has gene locus cross-reference but no equivalent gene feature exists");
29587         } else if (LocationStrandsIncompatible (sfp->location, sfpx->location)) {
29588           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_GeneXrefStrandProblem,
29589                     "Gene cross-reference is not on expected strand");
29590         } else if (StringStr (sfpx->except_text, "dicistronic gene") != NULL) {
29591           dicistronic = TRUE;
29592         }
29593       }
29594     }
29595     if (StringDoesHaveText (grp->locus_tag)) {
29596       bsp = BioseqFindFromSeqLoc (sfp->location);
29597       if (bsp != NULL) {
29598         sfpx = SeqMgrGetGeneByLocusTag (bsp, grp->locus_tag, &fcontext);
29599         if (sfpx == NULL && ISA_aa (bsp->mol)) {
29600           cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
29601           if (cds != NULL) {
29602             nbsp = BioseqFindFromSeqLoc (cds->location);
29603             if (nbsp != NULL) {
29604               sfpx = SeqMgrGetFeatureByLabel (nbsp, grp->locus, SEQFEAT_GENE, 0, &fcontext);
29605             }
29606           }
29607         }
29608         if (sfpx != NULL) {
29609           sfpy = sfpx;
29610         }
29611         if (sfpx == NULL) {
29612           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_GeneXrefWithoutGene,
29613                     "Feature has gene locus_tag cross-reference but no equivalent gene feature exists");
29614         } else if (LocationStrandsIncompatible (sfp->location, sfpx->location)) {
29615           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_GeneXrefStrandProblem,
29616                     "Gene cross-reference is not on expected strand");
29617         } else if (StringStr (sfpx->except_text, "dicistronic gene") != NULL) {
29618           dicistronic = TRUE;
29619         }
29620         /* look for gene xrefs with locus_tag but no locus */
29621         if (StringHasNoText (grp->locus)
29622             && sfpx != NULL && sfpx->data.choice == SEQFEAT_GENE
29623             && sfpx->data.value.ptrvalue != NULL
29624             && StringDoesHaveText (((GeneRefPtr)sfpx->data.value.ptrvalue)->locus)) {
29625             ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_GeneXrefWithoutLocus,
29626                       "Feature has Gene Xref with locus_tag but no locus, gene with locus_tag and locus exists");
29627         }
29628       }
29629     }
29630 
29631     sfpx = NULL;
29632     if (SeqMgrGetDesiredFeature (sfp->idx.entityID, NULL, 0, 0, sfp, &fcontext) == sfp) {
29633       if (fcontext.bad_order || fcontext.mixed_strand) {
29634         sfpx = SeqMgrGetOverlappingFeatureEx (sfp->location, FEATDEF_GENE, NULL, 0, NULL, LOCATION_SUBSET, &gcontext, TRUE);
29635       } else if (vsp->has_multi_int_genes) {
29636         sfpx = SeqMgrGetOverlappingFeatureEx (sfp->location, FEATDEF_GENE, NULL, 0, NULL, LOCATION_SUBSET, &gcontext, TRUE);
29637         if (sfpx == NULL && (vsp->has_seg_bioseqs || vsp->is_embl_ddbj_in_sep || vsp->is_old_gb_in_sep)) {
29638           sfpx = SeqMgrGetOverlappingGene (sfp->location, &gcontext);
29639         }
29640       } else {
29641         sfpx = SeqMgrGetOverlappingGene (sfp->location, &gcontext);
29642       }
29643     } else {
29644       sfpx = SeqMgrGetOverlappingGene (sfp->location, &gcontext);
29645     }
29646     if (sfpx == NULL || sfpx->data.choice != SEQFEAT_GENE)
29647       return;
29648     grpx = (GeneRefPtr) sfpx->data.value.ptrvalue;
29649     if (grpx == NULL)
29650       return;
29651     redundantgenexref = FALSE;
29652     label = gcontext.label;
29653     redundantgenexref = s_GeneRefsAreEquivalent(grp, grpx, &label);
29654     if (redundantgenexref) {
29655       MemSet ((Pointer) &dsd, 0, sizeof (DummySmfeData));
29656       dsd.max = INT4_MAX;
29657       dsd.num_at_max = 0;
29658       dsd.num_trans_spliced = 0;
29659       dsd.equivalent_genes = FALSE;
29660       dsd.grp_at_max = NULL;
29661       count = SeqMgrGetAllOverlappingFeatures (sfp->location, FEATDEF_GENE, NULL, 0,
29662                                                LOCATION_SUBSET, (Pointer) &dsd, DummySMFEProc);
29663       if (dsd.num_at_max > 1) {
29664         redundantgenexref = FALSE;
29665       }
29666     }
29667     if (redundantgenexref) {
29668       if (StringHasNoText (label)) {
29669         label = "?";
29670       }
29671       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnnecessaryGeneXref, "Unnecessary gene cross-reference %s", label);
29672     } else {
29673       if ((! dicistronic) && GPSorNTorNCorNGorNW (vsp->sep, sfp->location)) {
29674         /*
29675         SeqEntryToBioSource (vsp->sep, NULL, NULL, 0, &biop);
29676         */
29677         bsp = BioseqFindFromSeqLoc (sfp->location);
29678         BioseqToGeneticCode (bsp, NULL, NULL, NULL, NULL, 0, &biop);
29679         if (biop != NULL) {
29680           orp = biop->org;
29681           if (orp != NULL) {
29682             /* curated fly source still has duplicate features */
29683             if (StringNICmp (orp->taxname, "Drosophila ", 11) == 0) {
29684               if (StringHasNoText (label)) {
29685                 label = "?";
29686               }
29687               if (sfpy != NULL && SeqLocAinB (sfp->location, sfpy->location) >= 0 &&
29688                   ValStrandsMatch (SeqLocStrand (sfp->location), SeqLocStrand (sfpy->location))) {
29689                 /* cross-reference needed to disambiguate between multiple overlapping genes, ignore */
29690               } else {
29691                 genexref_label = GetGeneXrefLabel (grp);
29692                 ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_SuspiciousGeneXref, "Curated Drosophila record should not have gene cross-reference %s", genexref_label);
29693                 genexref_label = MemFree (genexref_label);
29694               }
29695             }
29696           }
29697         }
29698       }
29699     }
29700   } else {
29701     grp = SeqMgrGetGeneXref (sfp);
29702     if (grp != NULL) {
29703       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnnecessaryGeneXref, "Gene feature has gene cross-reference");
29704     }
29705     operon = SeqMgrGetOverlappingOperon (sfp->location, &fcontext);
29706     if (operon != NULL) {
29707       if (SeqMgrGetDesiredFeature (sfp->idx.entityID, 0, 0, 0, sfp, &fcontext) == sfp) {
29708         if (! StringHasNoText (fcontext.label)) {
29709           for (gbq = operon->qual; gbq != NULL; gbq = gbq->next) {
29710             if (StringCmp (gbq->qual, "operon") == 0) {
29711               if (StringICmp (gbq->val, fcontext.label) == 0) {
29712                 ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_InvalidQualifierValue, "Operon is same as gene - %s", gbq->val);
29713               }
29714             }
29715           }
29716         }
29717       }
29718     }
29719   }
29720 }
29721 
29722 /*****************************************************************************
29723 *
29724 *   MrnaTransCheck (vsp, sfp)
29725 *
29726 *****************************************************************************/
29727 
/*
 * Exception-text phrases consulted by MrnaTransCheck.  When a feature's
 * except_text contains any of these (case-insensitive search), error
 * reporting is switched off for that feature; MrnaTransCheck then turns it
 * back on for "mismatches in transcription".  The list is NULL-terminated.
 */
29728 static CharPtr bypass_mrna_trans_check [] = {
29729   "RNA editing",
29730   "reasons given in citation",
29731   "artificial frameshift",
29732   "transcribed product replaced",
29733   "unclassified transcription discrepancy",
29734   "mismatches in transcription",
29735   "adjusted for low-quality genome",
29736   "annotated by transcript or proteomic data",
29737   NULL
29738 };
29739 
/*
 * MrnaTransCheck
 *
 * Fetches the sequence under the feature's location (GetSequenceByFeature)
 * and the sequence of the feature's product, then reports length
 * differences and per-base mismatches between the two through ValidErr.
 *
 *   vsp - validator state; supplies ignoreExceptions, farFetchMRNAproducts
 *         and the gather context (gcp).  It is dereferenced without a NULL
 *         check.
 *   sfp - feature to check; the function returns immediately when it is
 *         NULL, pseudo, or has no product.
 *
 * Exception text on the feature can switch reporting off.  In that case the
 * function instead reports exceptions that turn out to be unnecessary,
 * erroneous, or unqualified (see the end of the function).
 */
MrnaTransCheck(ValidStructPtr vsp,SeqFeatPtr sfp)29740 NLM_EXTERN void MrnaTransCheck (ValidStructPtr vsp, SeqFeatPtr sfp)
29741 {
29742   BioseqPtr       bsp;
29743   Char            ch;
29744   Int4            counta, countnona;
29745   DbtagPtr        dbt;
29746   CharPtr         farstr = "";
29747   ErrSev          fetchsev;
29748   GatherContextPtr  gcp;
29749   Boolean         has_errors = FALSE, unclassified_except = FALSE,
29750                   mismatch_except = FALSE, other_than_mismatch = FALSE,
29751                   product_replaced = FALSE;
29752   Int2            i;
29753   Char            id [64];
29754   Boolean         is_refseq = FALSE;
29755   ErrSev          logsev;
29756   ErrSev          msgsev;
29757   Int4            mismatch, total;
29758   CharPtr         mrseq, pdseq;
29759   Int4            mlen, plen;
29760   CharPtr         ptr1, ptr2;
29761   Boolean         report_errors = TRUE;
29762   ErrSev          sev;
29763   SeqFeat         sf;
29764   SeqIdPtr        sip, sip2, sip3;
29765   Boolean         unlockProd = FALSE;
29766   ValNode         vn;
29767   SeqDescrPtr     sdp;
29768   MolInfoPtr      mip;
29769   TextSeqIdPtr    tsip;
29770   Boolean         rna_editing = FALSE;
29771 
        /* nothing to compare for missing, pseudo, or product-less features */
29772   if (sfp == NULL)
29773     return;
29774   if (sfp->pseudo)
29775     return;
29776   if (sfp->product == NULL)
29777     return;
29778 
        /* exception text may switch reporting off; remember which phrases were seen */
29779   if (sfp->excpt && (! vsp->ignoreExceptions) && (! StringHasNoText (sfp->except_text))) {
29780     for (i = 0; bypass_mrna_trans_check [i] != NULL; i++) {
29781       if (StringISearch (sfp->except_text,  bypass_mrna_trans_check [i]) != NULL) {
29782         report_errors = FALSE;  /* biological exception */
29783       }
29784     }
29785     if (StringISearch (sfp->except_text, "RNA editing") != NULL) {
29786       rna_editing = TRUE;
29787     }
29788     if (StringStr (sfp->except_text, "unclassified transcription discrepancy") != NULL) {
29789       unclassified_except = TRUE;
29790     }
29791     if (StringStr (sfp->except_text, "mismatches in transcription") != NULL) {
            /* this phrase re-enables reporting; the per-base mismatch message
               itself is still withheld further down via mismatch_except */
29792       mismatch_except = TRUE;
29793       report_errors = TRUE;
29794     }
29795     if (StringICmp (sfp->except_text, "transcribed product replaced") == 0) {
29796       product_replaced = TRUE;
29797     }
29798   }
29799 
29800   sip = SeqLocId (sfp->product);
29801   if (sip == NULL)
29802     return;
29803 
        /* raise the message and log thresholds to SEV_MAX around the fetch,
           then restore the saved levels */
29804   msgsev = ErrSetMessageLevel (SEV_MAX);
29805   logsev = ErrSetLogLevel (SEV_MAX);
29806 
29807   mrseq = GetSequenceByFeature (sfp);
29808 
29809   ErrSetLogLevel (logsev);
29810   ErrSetMessageLevel (msgsev);
29811 
29812   if (mrseq == NULL) {
29813     //LCOV_EXCL_START
29814     //this measures a failure in the C Toolkit, not a problem with the ASN.1
29815     ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_MrnaTransFail, "Unable to transcribe mRNA");
29816     return;
29817     //LCOV_EXCL_STOP
29818   }
29819 
        /* is_refseq: some id on the location's bioseq has choice SEQID_OTHER */
29820   bsp = BioseqFindFromSeqLoc (sfp->location);
29821   if (bsp != NULL) {
29822     for (sip2 = bsp->id; sip2 != NULL; sip2 = sip2->next) {
29823       if (sip2->choice == SEQID_OTHER) {
29824         is_refseq = TRUE;
29825       }
29826     }
29827   }
29828 
29829   mismatch = 0;
29830   total = 0;
29831 
        /* default severity for length and mismatch reports; adjusted below */
29832   sev = SEV_ERROR;
29833   gcp = vsp->gcp;
29834   if (gcp != NULL) {
          /* from here on bsp refers to the product bioseq, not the location's */
29835     bsp = GetBioseqGivenSeqLoc (sfp->product, gcp->entityID);
29836     if (bsp == NULL) {
29837       /* if not local bioseq product, lower severity */
29838       sev = SEV_WARNING;
29839       if (is_refseq) {
29840         /* if refseq, restore higher severity */
29841         sev = SEV_ERROR;
29842       }
29843     }
          /* product not found locally: optionally lock it by id, skipping GI
             ids whose value is not positive */
29844     if (bsp == NULL && vsp->farFetchMRNAproducts) {
29845       if (sip != NULL && (sip->choice != SEQID_GI || sip->data.intvalue > 0)) {
29846         bsp = BioseqLockById (sip);
29847       }
29848       if (bsp != NULL) {
29849         unlockProd = TRUE;
29850         farstr = "(far) ";
29851         if (sfp->partial) {
29852           sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_molinfo, NULL);
29853           if (sdp != NULL && sdp->choice == Seq_descr_molinfo) {
29854             mip = (MolInfoPtr) sdp->data.ptrvalue;
29855             if (mip != NULL) {
29856               if (mip->completeness < 2 || mip->completeness > 5) {
29857                 for (sip3 = bsp->id; sip3 != NULL; sip3 = sip3->next) {
29858                   if (sip3->choice != SEQID_OTHER) continue;
29859                   tsip = (TextSeqIdPtr) sip3->data.ptrvalue;
29860                   if (tsip == NULL) continue;
29861                   if (StringNCmp (tsip->accession, "NM_", 3) == 0) {
29862                     /* if far NM_ record, return to lower severity */
29863                     sev = SEV_WARNING;
29864                   }
29865                 }
29866               }
29867             }
29868           }
29869         }
29870       }
29871     }
          /* no product bioseq and far fetching disabled: skip the comparison */
29872     if (bsp == NULL && (! vsp->farFetchMRNAproducts)) {
29873       goto erret;
29874     }
          /* far fetch was requested but produced nothing: report the failure.
             Severity is SEV_ERROR except for general ids whose db is neither
             "ti" nor "SRA", which get SEV_WARNING */
29875     if (bsp == NULL && sfp->product != NULL && vsp->farFetchMRNAproducts) {
29876       SeqIdWrite (sip, id, PRINTID_FASTA_LONG, sizeof (id));
29877       sev = SEV_ERROR;
29878       if (sip != NULL && sip->choice == SEQID_GENERAL) {
29879         dbt = (DbtagPtr) sip->data.ptrvalue;
29880         if (dbt != NULL && StringICmp (dbt->db, "ti") != 0 && StringICmp (dbt->db, "SRA") != 0) {
29881           sev = SEV_WARNING;
29882         }
29883       }
29884       ValidErr (vsp, sev, ERR_SEQ_FEAT_ProductFetchFailure, "Unable to fetch mRNA transcript '%s'", id);
29885       goto erret;
29886     }
29887   }
29888   if (is_refseq && unclassified_except) {
29889     /* if unclassified exception, drop back down to warning */
29890     sev = SEV_WARNING;
29891   }
29892 
29893   /* coerced feature on whole product for GetSequenceByFeature */
29894 
29895   MemSet ((Pointer) &sf, 0, sizeof (SeqFeat));
29896   MemSet ((Pointer) &vn, 0, sizeof (ValNode));
29897   sf.location = &vn;
29898   vn.choice = SEQLOC_WHOLE;
29899   vn.data.ptrvalue = sip;
29900 
29901   pdseq = GetSequenceByFeature (&sf);
29902   if (pdseq == NULL) {
29903     //LCOV_EXCL_START
29904     //this measures a failure in the C Toolkit, not a problem with the ASN.1
29905     has_errors = TRUE;
29906     other_than_mismatch = TRUE;
29907     if (report_errors || unclassified_except) {
29908       fetchsev = SEV_ERROR;
29909       if (sip->choice != SEQID_GI) {
29910         fetchsev = SEV_WARNING;
29911       }
29912       ValidErr (vsp, fetchsev, ERR_SEQ_FEAT_MrnaTransFail, "Unable to fetch mRNA transcript");
29913     }
29914     //LCOV_EXCL_STOP
29915   }
29916   if (pdseq != NULL) {
29917     mlen = StringLen (mrseq);
29918     plen = StringLen (pdseq);
29919     if (mlen != plen) {
29920       if (mlen < plen) {
              /* product is longer: classify the extra tail by its A content */
29921         ptr1 = pdseq + mlen;
29922         counta = 0;
29923         countnona = 0;
29924         ch = *ptr1;
29925         while (ch != '\0') {
29926           if (ch == 'A' || ch == 'a') {
29927             counta++;
29928           } else {
29929             countnona++;
29930           }
29931           ptr1++;
29932           ch = *ptr1;
29933         }
              /* fewer than 19 A's per non-A means the tail is under 95% A */
29934         if (counta < 19 * countnona) {
29935           has_errors = TRUE;
29936           other_than_mismatch = TRUE;
29937           if (report_errors || rna_editing) {
29938             ValidErr (vsp, sev, ERR_SEQ_FEAT_TranscriptLen, "Transcript length [%ld] less than %sproduct length [%ld], and tail < 95%s polyA", (long) mlen, farstr, (long) plen, "%");
29939           }
29940           plen = mlen; /* even if it fails polyA test, allow base-by-base comparison on common length */
29941         } else if (counta > 0 && countnona == 0) {
29942           has_errors = TRUE;
29943           other_than_mismatch = TRUE;
29944           if (report_errors || rna_editing) {
29945             ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_PolyATail, "Transcript length [%ld] less than %sproduct length [%ld], but tail is 100%s polyA", (long) mlen, farstr, (long) plen, "%");
29946           }
29947           plen = mlen; /* if it passes polyA test, allow base-by-base comparison on common length */
29948         } else {
29949           has_errors = TRUE;
29950           other_than_mismatch = TRUE;
29951           if (report_errors || rna_editing) {
29952             ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_PolyATail, "Transcript length [%ld] less than %sproduct length [%ld], but tail >= 95%s polyA", (long) mlen, farstr, (long) plen, "%");
29953           }
29954           plen = mlen; /* if it passes polyA test, allow base-by-base comparison on common length */
29955         }
29956       } else {
29957         has_errors = TRUE;
29958         other_than_mismatch = TRUE;
29959         if (report_errors || rna_editing) {
29960           ValidErr (vsp, sev, ERR_SEQ_FEAT_TranscriptLen, "Transcript length [%ld] greater than %sproduct length [%ld]", (long) mlen, farstr, (long) plen);
29961         }
29962       }
29963     }
          /* equal lengths (plen may have been cut back to mlen above): count
             differing characters over mlen positions, compared case-sensitively */
29964     if (mlen == plen && mlen > 0 && StringICmp (mrseq, pdseq) != 0) {
29965       mismatch = 0;
29966       total = 0;
29967       ptr1 = mrseq;
29968       ptr2 = pdseq;
29969       while (total < mlen) {
29970         if (*ptr1 != *ptr2) {
29971           mismatch++;
29972         }
29973         ptr1++;
29974         ptr2++;
29975         total++;
29976       }
29977       if (mismatch > 0) {
29978         has_errors = TRUE;
29979         if (report_errors && (! mismatch_except)) {
29980           ValidErr (vsp, sev, ERR_SEQ_FEAT_TranscriptMismatches,
29981                     "There are %ld mismatches out of %ld bases between the transcript and %sproduct sequence", (long) mismatch, (long) total, farstr);
29982         }
29983       }
29984     }
29985     MemFree (pdseq);
29986   }
29987 
        /* reporting was switched off by an exception: check whether that
           exception was actually warranted */
29988   if (! report_errors) {
29989     if (! has_errors) {
29990       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnnecessaryException, "mRNA has exception but passes transcription test");
29991     } else if (unclassified_except && (! other_than_mismatch)) {
            /* mismatches make up at most 2% of the compared bases */
29992       if (mismatch * 50 <= total) {
29993         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ErroneousException,
29994                   "mRNA has unclassified exception but only difference is %ld mismatches out of %ld bases",
29995                   (long) mismatch, (long) total);
29996       }
29997     } else if (product_replaced) {
29998       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnqualifiedException, "mRNA has unqualified transcribed product replaced exception");
29999     }
30000   }
30001 
      /* common exit: release the transcribed sequence and, if it was locked
         above, the far product bioseq */
30002 erret:
30003 
30004   MemFree (mrseq);
30005 
30006   if (unlockProd) {
30007     BioseqUnlock (bsp);
30008   }
30009 
30010 }
30011 
30012 /*****************************************************************************
30013 *
30014 *   CdTransCheck(vsp, sfp)
30015 *       Treatment of terminal 'X'
30016 *          If either the protein or the translation end in 'X' (usually
30017 *          due to partial last codon) it is ignored to minimize conflicts
30018 *          between approaches to add the X or not in this case.
30019 *
30020 *****************************************************************************/
MapToNTCoords(SeqFeatPtr sfp,SeqIdPtr protID,Int4 pos)30021 static CharPtr MapToNTCoords (SeqFeatPtr sfp, SeqIdPtr protID, Int4 pos)
30022 {
30023   SeqLocPtr       nslp;
30024   SeqLocPtr       pslp;
30025   CharPtr         rsult;
30026   SeqPntPtr       spntp;
30027 
30028   rsult = NULL;
30029   if (sfp != NULL && protID != NULL && pos >= 0) {
30030     spntp = SeqPntNew ();
30031     pslp = ValNodeNew (NULL);
30032     pslp->choice = SEQLOC_PNT;
30033     pslp->data.ptrvalue = (Pointer) spntp;
30034     spntp->point = pos;
30035     spntp->id = SeqIdDup (protID);
30036     nslp = aaLoc_to_dnaLoc (sfp, pslp);
30037     if (nslp != NULL) {
30038       rsult = SeqLocPrint (nslp);
30039     }
30040     SeqLocFree (pslp);
30041     SeqLocFree (nslp);
30042   }
30043   return rsult;
30044 }
30045 
Loc_is_RefSeq(SeqLocPtr location)30046 static Boolean Loc_is_RefSeq (SeqLocPtr location)
30047 {
30048   BioseqPtr       bsp;
30049   SeqIdPtr        sip;
30050   TextSeqIdPtr    tsip;
30051 
30052   if (location == NULL)
30053     return FALSE;
30054   sip = SeqLocId (location);
30055   if (sip == NULL)
30056     return FALSE;
30057   bsp = BioseqFind (sip);
30058   if (bsp == NULL)
30059     return FALSE;
30060   for (sip = bsp->id; sip != NULL; sip = sip->next) {
30061     if (sip->choice == SEQID_OTHER) {
30062       tsip = (TextSeqIdPtr) sip->data.ptrvalue;
30063       if (tsip != NULL) {
30064         if (StringNICmp (tsip->accession, "NM_", 3) == 0) {
30065           return TRUE;
30066         }
30067       }
30068     }
30069   }
30070   return FALSE;
30071 }
30072 
Loc_is_GEDL(SeqLocPtr location)30073 static Boolean Loc_is_GEDL (SeqLocPtr location)
30074 {
30075   BioseqPtr  bsp;
30076   SeqIdPtr   sip;
30077 
30078   if (location == NULL)
30079     return FALSE;
30080   sip = SeqLocId (location);
30081   if (sip == NULL)
30082     return FALSE;
30083   bsp = BioseqFind (sip);
30084   if (bsp == NULL)
30085     return FALSE;
30086   for (sip = bsp->id; sip != NULL; sip = sip->next) {
30087     if (sip->choice == SEQID_GENBANK) return TRUE;
30088     if (sip->choice == SEQID_EMBL) return TRUE;
30089     if (sip->choice == SEQID_DDBJ) return TRUE;
30090     if (sip->choice == SEQID_LOCAL) return TRUE;
30091   }
30092   return FALSE;
30093 }
30094 
/*
 * CdConflictCheck
 *
 * Compares the sequence of the feature's product bioseq with the result of
 * TransTableTranslateCdRegion on the feature.  When both strings exist and
 * are identical (StringCmp), reports ERR_SEQ_FEAT_BadConflictFlag at
 * SEV_ERROR; in every other case reports ERR_SEQ_FEAT_ConflictFlagSet at
 * SEV_WARNING.  Does nothing when vsp or sfp is NULL.
 */
static void CdConflictCheck (ValidStructPtr vsp, SeqFeatPtr sfp)

{
  ByteStorePtr  translation;
  CharPtr       product_seq, translated_seq;
  Boolean       identical;

  if (vsp == NULL || sfp == NULL) {
    return;
  }

  product_seq = GetSequenceByBsp (BioseqFindFromSeqLoc (sfp->product));

  translation = TransTableTranslateCdRegion (NULL, sfp, FALSE, FALSE, TRUE);
  translated_seq = (CharPtr) BSMerge (translation, NULL);
  BSFree (translation);

  identical = FALSE;
  if (product_seq != NULL && translated_seq != NULL) {
    if (StringCmp (product_seq, translated_seq) == 0) {
      identical = TRUE;
    }
  }

  if (! identical) {
    ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ConflictFlagSet, "Coding region conflict flag is set");
  } else {
    ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_BadConflictFlag, "Coding region conflict flag should not be set");
  }

  MemFree (product_seq);
  MemFree (translated_seq);
}
30119 
/*
 * NULL-terminated list of exception-text phrases for the coding-region
 * translation check.  NOTE(review): presumably consulted by CdTransCheck in
 * the same way bypass_mrna_trans_check is by MrnaTransCheck - confirm at
 * the point of use.  The three entries inside the comment below are not
 * part of the list.
 */
30120 static CharPtr bypass_cds_trans_check [] = {
30121   "RNA editing",
30122   "reasons given in citation",
30123   "artificial frameshift",
30124   "rearrangement required for product",
30125   "translated product replaced",
30126   "unclassified translation discrepancy",
30127   "mismatches in translation",
30128   "adjusted for low-quality genome",
30129   "annotated by transcript or proteomic data",
30130   /*
30131   "heterogeneous population sequenced",
30132   "low-quality sequence region",
30133   "artificial location",
30134   */
30135   NULL
30136 };
30137 
/*
 * ValidateTranslExcept
 *
 * Translates the location of sfp through a stack-built coding-region
 * feature that carries only the supplied frame and genetic_code, then
 * checks every node of codebreakhead against that translation.  For each
 * node, data.intvalue is used as the residue index and choice as the
 * residue character.
 *
 * Warnings issued:
 *   - index 0, feature not partial, residue not 'M': suspicious exception
 *     at the first codon;
 *   - index inside the translation and the translation already has that
 *     residue there: unnecessary exception;
 *   - index equal to the translation length and residue not '*':
 *     unexpected exception just past the end.
 *
 * Returns silently when no translation can be produced.  sfp is
 * dereferenced without a NULL check.
 */
static void ValidateTranslExcept (
  ValidStructPtr vsp,
  SeqFeatPtr sfp,
  ValNodePtr codebreakhead,
  Boolean farFetchProd,
  Uint1 frame,
  ValNodePtr genetic_code
)

{
  Boolean       alt_start = FALSE;
  CdRegion      scratch_cdr;
  SeqFeat       scratch_feat;
  ByteStorePtr  translated = NULL;
  CharPtr       residues = NULL;
  Int4          num_residues, idx;
  ValNodePtr    brk;

  /* minimal coding-region feature over the same location */
  MemSet ((Pointer) &scratch_feat, 0, sizeof (SeqFeat));
  MemSet ((Pointer) &scratch_cdr, 0, sizeof (CdRegion));
  scratch_cdr.frame = frame;
  scratch_cdr.genetic_code = genetic_code;
  scratch_feat.location = sfp->location;
  scratch_feat.data.choice = SEQFEAT_CDREGION;
  scratch_feat.data.value.ptrvalue = (Pointer) &scratch_cdr;

  translated = ProteinFromCdRegionExEx (&scratch_feat, TRUE, FALSE, &alt_start, farFetchProd);
  if (translated == NULL) {
    return;
  }
  residues = BSMerge (translated, NULL);
  BSFree (translated);
  if (residues == NULL) {
    return;
  }
  num_residues = StringLen (residues);

  brk = codebreakhead;
  while (brk != NULL) {
    idx = brk->data.intvalue;

    if (idx == 0 && ! sfp->partial && (char) brk->choice != 'M') {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_TranslExcept,
                "Suspicious transl_except %c at first codon of complete CDS",
                (char) brk->choice);
    }

    if (idx == num_residues) {
      /* one past the last translated residue */
      if ((Char) brk->choice != '*') {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnnecessaryTranslExcept,
                  "Unexpected transl_except %c at position %ld just past end of protein",
                  (char) brk->choice, (long) (idx + 1));
      }
    } else if (idx >= 0 && idx < num_residues) {
      if (residues [idx] == (Char) brk->choice) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnnecessaryTranslExcept,
                  "Unnecessary transl_except %c at position %ld",
                  (char) brk->choice, (long) (idx + 1));
      }
    }

    brk = brk->next;
  }

  MemFree (residues);
}
30193 
30194 /* unusual translation start without initiator tRNA */
LeuCUGstart(SeqFeatPtr sfp)30195 static Boolean LeuCUGstart (SeqFeatPtr sfp)
30196 {
30197   GBQualPtr  gbq;
30198 
30199   if (sfp == NULL) return FALSE;
30200   if (! sfp->excpt) return FALSE;
30201   if (StringISearch (sfp->except_text, "translation initiation by tRNA-Leu at CUG codon") == NULL) return FALSE;
30202 
30203   for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
30204     if (StringICmp (gbq->qual, "experiment") == 0) return TRUE;
30205   }
30206 
30207   return FALSE;
30208 }
30209 
30210 
/*
 * ValidateCodeBreaks
 *
 * Checks every code break (transl_except) of coding region cds against a
 * translation made WITHOUT code breaks:
 *   - a break at residue 0 of a 5'-complete CDS whose residue is not 'M'
 *     is reported as suspicious;
 *   - a break inside the translated length is reported as unnecessary when
 *     translating its own location already yields the same residue, or as
 *     untranslatable when that translation cannot be produced;
 *   - a break at or beyond the translated length is reported as unexpected
 *     unless its residue is '*'.
 * A documented Leu/CUG start (LeuCUGstart) on a record flagged
 * is_refseq_in_sep is exempt from the first-codon and unnecessary warnings.
 *
 *   vsp          - validator state passed to ValidErr
 *   cds          - coding region feature; must carry at least one code break
 *   farFetchProd - forwarded to ProteinFromCdRegionExEx
 *
 * The feature is copied with AsnIoMemCopy while its code-break list is
 * temporarily detached; the list is re-attached before anything else
 * happens.  If the copy cannot be made the function returns without
 * reporting anything.
 */
static void ValidateCodeBreaks (ValidStructPtr vsp, SeqFeatPtr cds, Boolean farFetchProd)
{
  CdRegionPtr     crp, tmp_crp;
  CodeBreakPtr    cbp;
  SeqFeatPtr      tmp;
  ByteStorePtr    newprot = NULL;
  CharPtr         protseq = NULL;
  Int4            pos, prot_len = 0;
  char            aa;
  Boolean         alt_start = FALSE;
  Boolean         partial5 = FALSE, partial3 = FALSE;

  if (cds == NULL || cds->data.choice != SEQFEAT_CDREGION
      || (crp = (CdRegionPtr) (cds->data.value.ptrvalue)) == NULL
      || (cbp = crp->code_break) == NULL) {
    //LCOV_EXCL_START
    //condition never met given how function is called
    return;
    //LCOV_EXCL_STOP
  }
  CheckSeqLocForPartial (cds->location, &partial5, &partial3);
  /* don't copy code break when copying for tmp */
  crp->code_break = NULL;
  tmp = (SeqFeatPtr)AsnIoMemCopy (cds, (AsnReadFunc) SeqFeatAsnRead, (AsnWriteFunc) SeqFeatAsnWrite);
  crp->code_break = cbp;
  if (tmp == NULL) {
    /* copy failed; the original feature is already restored, and nothing
       below can work without the scratch copy */
    return;
  }
  /* calculate expected protein translation without breaks */
  newprot = ProteinFromCdRegionExEx (tmp, TRUE, FALSE, &alt_start, farFetchProd);
  if (newprot != NULL) {
    protseq = BSMerge (newprot, NULL);
    BSFree (newprot);
    prot_len = StringLen (protseq);
    /* shorten prot len if ends with stop codon */
    if (prot_len > 0 && protseq[prot_len - 1] == '*' && !partial3) {
      prot_len--;
    }
    protseq = MemFree (protseq);
  }

  /* free tmp location; will use code break locations instead */
  tmp->location = SeqLocFree (tmp->location);
  /* clear frame - locations should already be offset */
  tmp_crp = (CdRegionPtr) tmp->data.value.ptrvalue;
  if (tmp_crp != NULL) {
    tmp_crp->frame = 0;
  }

  for (cbp = crp->code_break; cbp != NULL; cbp = cbp->next) {
    /* residue index of the break: nucleotide offset within the CDS over 3 */
    pos = (GetOffsetInLoc (cbp->loc, cds->location, SEQLOC_START)) / 3;
    aa = (char) cbp->aa.value.intvalue;
    if (pos == 0 && ! partial5 && aa != 'M') {
      if (pos == 0 && aa == 'L' && LeuCUGstart (cds) && vsp->is_refseq_in_sep) {
        /* do not warn on explicitly documented unusual translation initiation at CUG without initiator tRNA-Met */
      } else {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_TranslExcept,
                  "Suspicious transl_except %c at first codon of complete CDS",
                  aa);
      }
    }
    if (pos < prot_len) {
      /* translate just the break's own location, marked partial at both ends */
      tmp->location = SeqLocCopy (cbp->loc);
      SetSeqLocPartial (tmp->location, TRUE, TRUE);
      newprot = ProteinFromCdRegionExEx (tmp, TRUE, FALSE, &alt_start, farFetchProd);
      if (newprot == NULL) {
        /* do something about inability to translate? */
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnnecessaryTranslExcept,
                  "Unable to translate location for transl_except %c at position %ld",
                  aa, (long) (pos + 1));
      } else {
        protseq = BSMerge (newprot, NULL);
        BSFree (newprot);
        if (protseq == NULL) {
          /* do something about inability to translate? */
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnnecessaryTranslExcept,
                    "Unable to translate location for transl_except %c at position %ld",
                    aa, (long) (pos + 1));
        } else {
          if (protseq[0] == cbp->aa.value.intvalue) {
            if (pos == 0 && aa == 'L' && LeuCUGstart (cds) && vsp->is_refseq_in_sep) {
              /* do not warn on explicitly documented unusual translation initiation at CUG without initiator tRNA-Met */
            } else {
              ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnnecessaryTranslExcept,
                        "Unnecessary transl_except %c at position %ld",
                        protseq[0], (long) (pos + 1));
            }
          }
          protseq = MemFree (protseq);
        }
      }
      tmp->location = SeqLocFree (tmp->location);
    } else if (aa != '*') {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnnecessaryTranslExcept,
                "Unexpected transl_except %c at position %ld just past end of protein",
                aa, (long) (pos + 1));
    }

  }
  tmp = SeqFeatFree (tmp);
}
30307 
/*
 * CheckForThreeBaseNonsense
 *
 * Translates the interval [stop, start] on the bioseq of sfp's location
 * (note the order: the interval's "from" is stop and its "to" is start)
 * and, when the translation is exactly "*", reports
 * ERR_SEQ_FEAT_NonsenseIntron and pushes sfp onto the vsp->sisfp list.
 *
 *   vsp    - validator state; is_embl_ddbj_in_sep lowers the severity from
 *            SEV_REJECT to SEV_ERROR
 *   sfp    - coding region feature being checked
 *   crp    - its CdRegion; the genetic code id is taken from the last
 *            node with choice 2 in crp->genetic_code
 *   start  - upper coordinate of the interval to translate
 *   stop   - lower coordinate of the interval to translate
 *   strand - strand of the interval
 *
 * Genetic codes 7 and 8 are replaced by 4 and 1 respectively, and a
 * missing code (0) defaults to 1.  The translation table is looked up in,
 * and on first use stored under, an application property named after the
 * genetic code.  Does nothing when an argument is NULL or the bioseq
 * cannot be found.
 */
static void CheckForThreeBaseNonsense (ValidStructPtr vsp, SeqFeatPtr sfp, CdRegionPtr crp, Int4 start, Int4 stop, Uint1 strand)

{
  ByteStorePtr   bs;
  BioseqPtr      bsp;
  Int2           genCode = 0;
  SeqInt         sint;
  CharPtr        res;
  ErrSev         sev;
  Char           str [32];
  Boolean        tableExists = FALSE;
  TransTablePtr  tbl = NULL;
  ValNode        vn;
  ValNodePtr     vnp;
  ValNodePtr     tvnp;

  if (vsp == NULL || sfp == NULL || crp == NULL) return;

  bsp = BioseqFindFromSeqLoc (sfp->location);
  if (bsp == NULL) return;

  /* stack-built interval location covering the bases to translate */
  MemSet ((Pointer) &sint, 0, sizeof (SeqInt));
  MemSet ((Pointer) &vn, 0, sizeof (ValNode));

  sint.from = stop;
  sint.to = start;
  sint.strand = strand;
  sint.id = bsp->id;

  vn.choice = SEQLOC_INT;
  vn.data.ptrvalue = (Pointer) &sint;
  vn.next = NULL;

  SetSeqLocPartial (&vn, TRUE, TRUE);

  if (crp->genetic_code != NULL) {
    vnp = (ValNodePtr) crp->genetic_code->data.ptrvalue;
    while (vnp != NULL) {
      if (vnp->choice == 2) {
        genCode = (Int2) vnp->data.intvalue;
      }
      vnp = vnp->next;
    }
  }

  if (genCode == 7) {
    genCode = 4;
  } else if (genCode == 8) {
    genCode = 1;
  } else if (genCode == 0) {
    genCode = 1;
  }

  /* set app property name for storing desired FSA */

  sprintf (str, "TransTableFSAforGenCode%d", (int) genCode);

  /* get FSA for desired genetic code if it already exists */

  tbl = (TransTablePtr) GetAppProperty (str);
  tableExists = (Boolean) (tbl != NULL);

  bs = TransTableTranslateSeqLoc (&tbl, &vn, genCode, 1, TRUE, TRUE);
  res = BSMerge (bs, NULL);
  BSFree (bs);

  /* save FSA in genetic code-specific app property name */

  if (! tableExists) {
    SetAppProperty (str, (Pointer) tbl);
  }

  if (StringCmp (res, "*") == 0) {
    sev = SEV_REJECT;
    if (vsp->is_embl_ddbj_in_sep) {
      sev = SEV_ERROR;
    }
    ValidErr (vsp, sev, ERR_SEQ_FEAT_NonsenseIntron, "Triplet intron encodes stop codon");
    /* remember the feature at the head of vsp->sisfp; skip the bookkeeping
       if the list node cannot be allocated */
    tvnp = ValNodeNew(NULL);
    if (tvnp != NULL) {
      tvnp->data.ptrvalue = sfp;
      tvnp->next = vsp->sisfp;
      vsp->sisfp = tvnp;
    }
  }

  MemFree (res);
}
30394 
/*
 * TranslateTripletIntrons
 *
 * Walks consecutive pieces of sfp's location (SeqLocFindNext) and, for
 * every pair of neighbours whose facing coordinates differ by exactly 4
 * (three bases lie between them), hands the in-between bases to
 * CheckForThreeBaseNonsense.  On the minus strand the gap is measured from
 * the previous piece's start to the current piece's stop; otherwise from
 * the current piece's start to the previous piece's stop.
 *
 * Does nothing when an argument is NULL, the feature has its exception
 * flag set, IsPseudo reports it pseudo, or the CdRegion has code breaks.
 */
static void TranslateTripletIntrons (ValidStructPtr vsp, SeqFeatPtr sfp, CdRegionPtr crp)

{
  SeqLocPtr piece;
  Int4      prev_from, prev_to, cur_from, cur_to;
  Uint1     cur_strand;

  if (vsp == NULL || sfp == NULL || crp == NULL) {
    return;
  }
  if (sfp->excpt || IsPseudo (sfp) || crp->code_break != NULL) {
    return;
  }

  /* seed the "previous piece" with the first piece of the location */
  piece = SeqLocFindNext (sfp->location, NULL);
  prev_from = SeqLocStart (piece);
  prev_to = SeqLocStop (piece);

  for (piece = SeqLocFindNext (sfp->location, piece);
       piece != NULL;
       piece = SeqLocFindNext (sfp->location, piece)) {
    cur_from = SeqLocStart (piece);
    cur_to = SeqLocStop (piece);
    cur_strand = SeqLocStrand (piece);

    if (cur_strand == Seq_strand_minus) {
      if (prev_from - cur_to == 4) {
        CheckForThreeBaseNonsense (vsp, sfp, crp, prev_from - 1, cur_to + 1, cur_strand);
      }
    } else if (cur_from - prev_to == 4) {
      CheckForThreeBaseNonsense (vsp, sfp, crp, cur_from - 1, prev_to + 1, cur_strand);
    }

    prev_from = cur_from;
    prev_to = cur_to;
  }
}
30427 
30428 typedef struct cdsmismatch {
30429   Int4 pos;
30430   Int2 cds_residue;
30431   Int2 prot_residue;
30432 } CDSMismatchData, PNTR CDSMismatchPtr;
30433 
CdTransCheck(ValidStructPtr vsp,SeqFeatPtr sfp)30434 NLM_EXTERN void CdTransCheck (ValidStructPtr vsp, SeqFeatPtr sfp)
30435 
30436 {
30437   ByteStorePtr    newprot = NULL;
30438   CharPtr         protseq = NULL;
30439   BioseqPtr       prot1seq = NULL, prot2seq = NULL;
30440   Int4            i, len = 0, x_count = 0,
30441                   nonx_count = 0, xcount1 = 0, xcount2 = 0;
30442   Int4            prot1len = 0; /* length of protein product sequence */
30443   Int4            prot2len = 0; /* length of translation of coding region */
30444   CdRegionPtr     crp;
30445   SeqIdPtr        protid = NULL;
30446   Int2            residue1, residue2, stop_count = 0, mismatch = 0, ragged = 0;
30447   CDSMismatchData mismatches[11];
30448   Boolean         got_stop = FALSE;
30449   /*
30450   SeqPortPtr      spp = NULL;
30451   */
30452   Uint2           part_loc = 0, part_prod = 0;
30453   Boolean         no_end = FALSE, no_beg = FALSE, show_stop = FALSE,
30454                   got_dash = FALSE, alt_start = FALSE, got_x = FALSE, done;
30455   GBQualPtr       gb;
30456   ValNodePtr      vnp, vnp2, code, codebreakhead = NULL;
30457   int             gccode = 0;
30458   Boolean         transl_except = FALSE, prot_ok = TRUE, is_nc = FALSE,
30459                   has_errors = FALSE, report_errors = TRUE,
30460                   unclassified_except = FALSE, mismatch_except = FALSE,
30461                   frameshift_except = FALSE, rearrange_except = FALSE,
30462                   other_than_mismatch = FALSE, product_replaced = FALSE,
30463                   mixed_population = FALSE, low_quality = FALSE,
30464                   artificial_location = FALSE;
30465   Boolean         partial5 = FALSE;
30466   Boolean         partial3 = FALSE;
30467   Boolean         rna_editing = FALSE;
30468   CharPtr         nuclocstr, farstr = "", loc2str;
30469   CodeBreakPtr    cbp;
30470   Int4            pos1, pos2, pos;
30471   SeqLocPtr       tmp;
30472   ErrSev          sev, trans_len_sev = SEV_ERROR;
30473   SeqEntryPtr     sep;
30474   Boolean         unlockProd = FALSE;
30475   StreamCache     sc;
30476   Boolean         isgap;
30477   Boolean         badseq = FALSE;
30478   BioseqPtr       bsp;
30479   DbtagPtr        dbt;
30480   SeqIdPtr        sip, sip3;
30481   Char            id [64];
30482   Boolean         is_ged = FALSE;
30483   Boolean         is_refseq = FALSE;
30484   Boolean         has_gi = FALSE;
30485   Boolean         farFetchProd;
30486   SeqDescrPtr     sdp;
30487   MolInfoPtr      mip;
30488   TextSeqIdPtr    tsip;
30489   Boolean         annotated_by_transcript_or_proteomic = FALSE;
30490   GeneRefPtr      grp;
30491   Boolean         pseudo = FALSE;
30492   Boolean         suppressed = FALSE;
30493 
30494   if (sfp == NULL) return;
30495 
30496   crp = (CdRegionPtr) (sfp->data.value.ptrvalue);
30497   if (crp == NULL) return;
30498 
30499   for (gb = sfp->qual; gb != NULL; gb = gb->next) {     /* pseuogene */
30500     if (!StringICmp ("pseudo", gb->qual))
30501       return;
30502   }
30503 
30504   grp = GetGeneByFeat (sfp, &pseudo, &suppressed);
30505   if (pseudo) return;
30506 
30507   if (LocationIsFar (sfp->location) && NoFetchFunctions ()) {
30508     vsp->far_fetch_failure = TRUE;
30509     return;
30510   }
30511 
30512   CheckSeqLocForPartial (sfp->location, &partial5, &partial3);
30513 
30514   if (sfp->excpt && (! vsp->ignoreExceptions) && (! StringHasNoText (sfp->except_text))) {
30515     for (i = 0; bypass_cds_trans_check [i] != NULL; i++) {
30516       if (StringISearch (sfp->except_text,  bypass_cds_trans_check [i]) != NULL) {
30517         report_errors = FALSE;  /* biological exception */
30518       }
30519     }
30520     if (StringStr (sfp->except_text, "unclassified translation discrepancy") != NULL) {
30521       unclassified_except = TRUE;
30522     }
30523     if (StringStr (sfp->except_text, "mismatches in translation") != NULL) {
30524       mismatch_except = TRUE;
30525       report_errors = TRUE;
30526     }
30527     if (StringStr (sfp->except_text, "artificial frameshift") != NULL) {
30528       frameshift_except = TRUE;
30529       report_errors = TRUE;
30530     }
30531     if (StringStr (sfp->except_text, "rearrangement required for product") != NULL) {
30532       rearrange_except = TRUE;
30533     }
30534     if (StringICmp (sfp->except_text, "translated product replaced") == 0) {
30535       product_replaced = TRUE;
30536     }
30537     if (StringICmp (sfp->except_text, "heterogeneous population sequenced") == 0) {
30538       mixed_population = TRUE;
30539     }
30540     if (StringICmp (sfp->except_text, "low-quality sequence region") == 0) {
30541       low_quality = TRUE;
30542     }
30543     if (StringICmp (sfp->except_text, "artificial location") == 0) {
30544       artificial_location = TRUE;
30545     }
30546     if (StringISearch (sfp->except_text, "RNA editing") != NULL) {
30547       rna_editing = TRUE;
30548     }
30549   }
30550   if (StringISearch (sfp->except_text, "annotated by transcript or proteomic data") != NULL) {
30551     annotated_by_transcript_or_proteomic = TRUE;
30552   }
30553 
30554   /* check for unparsed transl_except */
30555   for (gb = sfp->qual; gb != NULL; gb = gb->next) {
30556     if (StringCmp (gb->qual, "transl_except") == 0) {
30557       transl_except = TRUE;
30558       break;
30559     }
30560   }
30561 
30562   if (crp->code_break != NULL) {
30563     codebreakhead = MakeCodeBreakList (sfp->location, SeqLocLen (sfp->location), crp->code_break, crp->frame);
30564   }
30565 
30566   if (crp->genetic_code != NULL) {
30567     for (vnp = crp->genetic_code->data.ptrvalue; ((vnp != NULL) && (!gccode)); vnp = vnp->next) {
30568       switch (vnp->choice) {
30569       case 0:
30570         break;
30571       case 1:                  /* name */
30572         code = GeneticCodeFind (0, (CharPtr) (vnp->data.ptrvalue));
30573         if (code != NULL) {
30574           for (vnp2 = code->data.ptrvalue; ((vnp2 != NULL) && (!gccode)); vnp2 = vnp2->next) {
30575             if (vnp2->choice == 2)       /* id */
30576               gccode = (int) (vnp2->data.intvalue);
30577           }
30578         }
30579         break;
30580       case 2:                  /* id */
30581         gccode = (int) (vnp->data.intvalue);
30582         break;
30583       default:
30584         gccode = 255;
30585         break;
30586       }
30587     }
30588   }
30589 
30590   farFetchProd = (Boolean) (vsp->farFetchCDSproducts || vsp->farFetchMRNAproducts);
30591   newprot = ProteinFromCdRegionExEx (sfp, TRUE, FALSE, &alt_start, farFetchProd);   /* include stop codons, do not remove trailing X/B/Z */
30592   if (newprot == NULL) {
30593     has_errors = TRUE;
30594     other_than_mismatch = TRUE;
30595     if (report_errors || unclassified_except) {
30596       ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_CdTransFail, "Unable to translate");
30597     }
30598     prot_ok = FALSE;
30599     goto erret;
30600   }
30601 
30602   if (codebreakhead != NULL) {
30603     /*ValidateTranslExcept (vsp, sfp, codebreakhead, farFetchProd, crp->frame, crp->genetic_code); */
30604     ValidateCodeBreaks (vsp, sfp, farFetchProd);
30605   }
30606 
30607   protid = SeqLocId (sfp->product);
30608   if (protid != NULL) {
30609     prot1seq = BioseqFind (protid);
30610     if (prot1seq == NULL && vsp->farFetchCDSproducts) {
30611       if (protid != NULL && (protid->choice != SEQID_GI || protid->data.intvalue > 0)) {
30612         prot1seq = BioseqLockById (protid);
30613       }
30614       if (prot1seq != NULL) {
30615         unlockProd = TRUE;
30616         farstr = "(far) ";
30617         if (sfp->partial) {
30618           sdp = GetNextDescriptorUnindexed (prot1seq, Seq_descr_molinfo, NULL);
30619           if (sdp != NULL && sdp->choice == Seq_descr_molinfo) {
30620             mip = (MolInfoPtr) sdp->data.ptrvalue;
30621             if (mip != NULL) {
30622               if (mip->completeness < 2 || mip->completeness > 5) {
30623                 for (sip3 = prot1seq->id; sip3 != NULL; sip3 = sip3->next) {
30624                   if (sip3->choice != SEQID_OTHER) continue;
30625                   tsip = (TextSeqIdPtr) sip3->data.ptrvalue;
30626                   if (tsip == NULL) continue;
30627                   if (StringNCmp (tsip->accession, "NP_", 3) == 0) {
30628                     /* if far NP_ record, return to lower severity */
30629                     trans_len_sev = SEV_WARNING;
30630                   }
30631                 }
30632               }
30633             }
30634           }
30635         }
30636       }
30637     }
30638     if (prot1seq != NULL)
30639       prot1len = prot1seq->length;
30640   }
30641 
30642   if (alt_start && gccode == 1) {
30643     //LCOV_EXCL_START
30644     //sev is always set to none, so error won't be reported
30645     /* sev = SEV_WARNING; */
30646     sev = SEV_NONE; /* only enable for RefSeq, leave old code in for now */
30647     if (Loc_is_RefSeq (sfp->location)) {
30648       sev = /* SEV_ERROR */ SEV_NONE; /* now also disable for RefSeq */
30649     } else if (Loc_is_GEDL (sfp->location)) {
30650       sev = SEV_NONE;
30651     }
30652     if (sfp->excpt && StringDoesHaveText (sfp->except_text)) {
30653       if (StringStr (sfp->except_text, "alternative start codon") != NULL) {
30654         sev = SEV_NONE;
30655       }
30656     }
30657     if (sev > SEV_NONE) {
30658       has_errors = TRUE;
30659       other_than_mismatch = TRUE;
30660       if (report_errors) {
30661         ValidErr (vsp, sev, ERR_SEQ_FEAT_AltStartCodon, "Alternative start codon used");
30662       }
30663     }
30664     //LCOV_EXCL_STOP
30665   } else if (! alt_start) {
30666     if (sfp->excpt && StringDoesHaveText (sfp->except_text)) {
30667       if (StringStr (sfp->except_text, "alternative start codon") != NULL) {
30668         if (Loc_is_RefSeq (sfp->location)) {
30669           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_AltStartCodon, "Unnecessary alternative start codon exception");
30670         }
30671       }
30672     }
30673   }
30674 
30675   part_loc = SeqLocPartialCheck (sfp->location);
30676   part_prod = SeqLocPartialCheckEx (sfp->product, farFetchProd);
30677   if ((part_loc & SLP_STOP) || (part_prod & SLP_STOP))
30678     no_end = TRUE;
30679   else {                        /* complete stop, so check for ragged end */
30680 
30681     len = SeqLocLen (sfp->location);
30682     if (crp->frame > 1)
30683       len -= (Int4) (crp->frame - 1);
30684     ragged = (Int2) (len % (Int4) (3));
30685     if (ragged) {
30686       len = SeqLocLen (sfp->location);
30687       cbp = crp->code_break;
30688       while (cbp != NULL) {
30689         pos1 = INT4_MAX;
30690         pos2 = -10;
30691         tmp = NULL;
30692         while ((tmp = SeqLocFindNext (cbp->loc, tmp)) != NULL) {
30693           pos = GetOffsetInLoc (tmp, sfp->location, SEQLOC_START);
30694           if (pos < pos1)
30695             pos1 = pos;
30696           pos = GetOffsetInLoc (tmp, sfp->location, SEQLOC_STOP);
30697           if (pos > pos2)
30698             pos2 = pos;
30699         }
30700         pos = pos2 - pos1;      /* codon length */
30701         if (pos >= 0 && pos <= 1 && pos2 == len - 1)
30702         {                       /*  a codon */
30703           /* allowing a partial codon at the end */
30704           ragged = 0;
30705         }
30706 
30707         cbp = cbp->next;
30708       }
30709     }
30710   }
30711 
30712   /* check for code break not on a codon */
30713   len = SeqLocLen (sfp->location);
30714   cbp = crp->code_break;
30715   while (cbp != NULL) {
30716     pos1 = INT4_MAX;
30717     pos2 = -10;
30718     tmp = NULL;
30719     while ((tmp = SeqLocFindNext (cbp->loc, tmp)) != NULL) {
30720       pos = GetOffsetInLoc (tmp, sfp->location, SEQLOC_START);
30721       if (pos < pos1)
30722         pos1 = pos;
30723       pos = GetOffsetInLoc (tmp, sfp->location, SEQLOC_STOP);
30724       if (pos > pos2)
30725         pos2 = pos;
30726     }
30727     pos = pos2 - pos1;          /* codon length */
30728     /* check for code break not on a codon */
30729     if (pos == 2 || (pos >= 0 && pos <= 1 && pos2 == len - 1)) {
30730       if (crp->frame == 2)
30731         pos = 1;
30732       else if (crp->frame == 3)
30733         pos = 2;
30734       else
30735         pos = 0;
30736       if ((pos1 % 3) != pos) {
30737         has_errors = TRUE;
30738         other_than_mismatch = TRUE;
30739         if (report_errors) {
30740           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_TranslExceptPhase, "transl_except qual out of frame.");
30741         }
30742       }
30743     }
30744 
30745 
30746     cbp = cbp->next;
30747   }
30748 
30749   if (crp->frame > 1) {
30750     if (!(part_loc & SLP_START)) {
30751       sev = SEV_ERROR;
30752       /*
30753       sev = SEV_WARNING;
30754       if (Loc_is_RefSeq (sfp->location)) {
30755         sev = SEV_ERROR;
30756       }
30757       */
30758       has_errors = TRUE;
30759       other_than_mismatch = TRUE;
30760       if (report_errors) {
30761         ValidErr (vsp, sev, ERR_SEQ_FEAT_SuspiciousFrame, "Suspicious CDS location - frame > 1 but not 5' partial");
30762       }
30763     } else if ((part_loc & SLP_NOSTART) && (!PartialAtSpliceSiteOrGap (vsp, sfp->location, SLP_NOSTART, &isgap, &badseq))) {
30764       if (PartialAtGapOrNs (vsp, sfp->location, SLP_NOSTART) || StringStr (sfp->comment, "coding region disrupted by sequencing gap") != NULL) {
30765         /* suppress */
30766       } else {
30767         sev = SEV_WARNING;
30768         if (Loc_is_RefSeq (sfp->location)) {
30769           sev = SEV_ERROR;
30770         }
30771         has_errors = TRUE;
30772         other_than_mismatch = TRUE;
30773         if (report_errors) {
30774           ValidErr (vsp, sev, ERR_SEQ_FEAT_SuspiciousFrame, "Suspicious CDS location - frame > 1 and not at consensus splice site");
30775         }
30776       }
30777     }
30778   }
30779 
30780   if ((part_loc & SLP_START) || (part_prod & SLP_START))
30781     no_beg = TRUE;
30782 
30783   protseq = BSMerge (newprot, NULL);
30784   prot2len = StringLen (protseq);
30785   if (protseq != NULL) {
30786     len = prot2len;
30787     for (i = 0; i < len; i++) {
30788       residue1 = protseq [i];
30789       if (i == 0 && residue1 == '-') {
30790         got_dash = TRUE;
30791       }
30792       if (i == 0 && residue1 == 'X') {
30793         got_x = TRUE;
30794       }
30795       if (residue1 == '*') {
30796         if (i == (len - 1))
30797           got_stop = TRUE;
30798         else
30799           stop_count++;
30800       }
30801       if (residue1 == 'X') {
30802         x_count++;
30803       } else {
30804         nonx_count++;
30805       }
30806     }
30807     if (x_count > nonx_count) {
30808       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_CDShasTooManyXs, "CDS translation consists of more than 50%s X residues", "%");
30809     }
30810   }
30811 
30812   if (! annotated_by_transcript_or_proteomic) {
30813     if (1.2 * prot2len < prot1len) {
30814       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ProductLength, "Protein product length [%ld] is more than 120%% of the %stranslation length [%ld]", prot1len, farstr, prot2len);
30815     }
30816   }
30817 
30818   /*
30819   prot2len = BSLen (newprot);
30820   len = prot2len;
30821   BSSeek (newprot, 0, SEEK_SET);
30822   for (i = 0; i < len; i++) {
30823     residue1 = BSGetByte (newprot);
30824     if ((i == 0) && (residue1 == '-'))
30825       got_dash = TRUE;
30826     if (residue1 == '*') {
30827       if (i == (len - 1))
30828         got_stop = TRUE;
30829       else
30830         stop_count++;
30831     }
30832   }
30833   */
30834 
30835   if (stop_count > 0) {
30836     if (got_dash) {
30837       has_errors = TRUE;
30838       other_than_mismatch = TRUE;
30839       sev = SEV_ERROR;
30840       if (unclassified_except) {
30841         sev = SEV_WARNING;
30842       }
30843       if (report_errors || unclassified_except) {
30844         ValidErr (vsp, sev, ERR_SEQ_FEAT_StartCodon,
30845                   "Illegal start codon (and %ld internal stops). Probably wrong genetic code [%d]", (long) stop_count, gccode);
30846         ValidErr (vsp, sev, ERR_SEQ_FEAT_InternalStop, "%ld internal stops (and illegal start codon). Genetic code [%d]", (long) stop_count, gccode);
30847       }
30848     } else if (got_x) {
30849       has_errors = TRUE;
30850       other_than_mismatch = TRUE;
30851       sev = SEV_ERROR;
30852       if (unclassified_except) {
30853         sev = SEV_WARNING;
30854       }
30855       if (report_errors || unclassified_except) {
30856         ValidErr (vsp, sev, ERR_SEQ_FEAT_StartCodon,
30857                     "Ambiguous start codon (and %ld internal stops). Possibly wrong genetic code [%d]", (long) stop_count, gccode);
30858         ValidErr (vsp, sev, ERR_SEQ_FEAT_InternalStop, "%ld internal stops (and ambiguous start codon). Genetic code [%d]", (long) stop_count, gccode);
30859       }
30860     } else {
30861       has_errors = TRUE;
30862       other_than_mismatch = TRUE;
30863       sev = SEV_ERROR;
30864       if (unclassified_except) {
30865         sev = SEV_WARNING;
30866       }
30867       if (report_errors /* || unclassified_except */ ) {
30868         bsp = BioseqFindFromSeqLoc (sfp->location);
30869         if (bsp != NULL) {
30870           for (sip = bsp->id; sip != NULL; sip = sip->next) {
30871             switch (sip->choice) {
30872               case SEQID_GI :
30873                 has_gi = TRUE;
30874                 break;
30875               case SEQID_GENBANK :
30876               case SEQID_EMBL :
30877               case SEQID_DDBJ :
30878               case SEQID_TPG :
30879               case SEQID_TPE :
30880               case SEQID_TPD :
30881                 is_ged = TRUE;
30882                 break;
30883               case SEQID_OTHER :
30884                 is_refseq = TRUE;
30885                 break;
30886               default :
30887                 break;
30888             }
30889           }
30890           if (has_gi && is_ged && (! is_refseq)) {
30891             sev = SEV_REJECT;
30892           }
30893         }
30894         ValidErr (vsp, sev, ERR_SEQ_FEAT_InternalStop, "%ld internal stops. Genetic code [%d]", (long) stop_count, gccode);
30895       }
30896     }
30897     prot_ok = FALSE;
30898     if (stop_count > 5)
30899       goto erret;
30900   } else if (got_dash) {
30901     has_errors = TRUE;
30902     other_than_mismatch = TRUE;
30903     if (report_errors) {
30904       ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_StartCodon, "Illegal start codon used. Wrong genetic code [%d] or protein should be partial", gccode);
30905     }
30906   } else if (got_x && (! partial5)) {
30907     has_errors = TRUE;
30908     other_than_mismatch = TRUE;
30909     if (report_errors) {
30910       ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_StartCodon, "Ambiguous start codon used. Wrong genetic code [%d] or protein should be partial", gccode);
30911     }
30912   }
30913 
30914   show_stop = TRUE;
30915 
30916   if (protid != NULL) {
30917     if (prot1seq == NULL && (! vsp->farFetchCDSproducts)) {
30918       goto erret;
30919     }
30920     if (prot1seq == NULL && sfp->product != NULL && vsp->farFetchCDSproducts) {
30921       SeqIdWrite (protid, id, PRINTID_FASTA_LONG, sizeof (id));
30922       sev = SEV_ERROR;
30923       if (protid != NULL && protid->choice == SEQID_GENERAL) {
30924         dbt = (DbtagPtr) protid->data.ptrvalue;
30925         if (dbt != NULL && StringICmp (dbt->db, "ti") != 0 && StringICmp (dbt->db, "SRA") != 0) {
30926           sev = SEV_WARNING;
30927         }
30928       }
30929       ValidErr (vsp, sev, ERR_SEQ_FEAT_ProductFetchFailure, "Unable to fetch CDS product '%s'", id);
30930       goto erret;
30931     }
30932     if (prot1seq != NULL)
30933       prot1len = prot1seq->length;
30934   }
30935 
30936   if (prot1seq == NULL) {
30937     if (prot2len > 6) {
30938       if (! NGorNT (vsp->sep, sfp->location, &is_nc)) {
30939         sev = SEV_ERROR;
30940         if (DeltaOrFarSeg (vsp->sep, sfp->location)) {
30941           sev = SEV_WARNING;
30942         }
30943         if (is_nc) {
30944           sev = SEV_WARNING;
30945           sep = vsp->sep;
30946           if (sep != NULL && IS_Bioseq (sep)) {
30947             sev = SEV_NONE;
30948           }
30949         }
30950         if (sev != SEV_NONE) {
30951           has_errors = TRUE;
30952           other_than_mismatch = TRUE;
30953           if (report_errors) {
30954             ValidErr (vsp, sev, ERR_SEQ_FEAT_NoProtein, "No protein Bioseq given");
30955           }
30956         }
30957       }
30958     }
30959     goto erret;
30960   }
30961 
30962   len = prot2len;
30963 
30964   if ((got_stop) && (len == (prot1len + 1))) {  /* ok, got stop */
30965     len--;
30966   }
30967 
30968   if (! StreamCacheSetup (prot1seq, NULL, STREAM_EXPAND_GAPS, &sc)) {
30969     goto erret;
30970   }
30971   /*
30972   spp = SeqPortNew (prot1seq, 0, -1, 0, Seq_code_ncbieaa);
30973   if (spp == NULL)
30974     goto erret;
30975   */
30976 
30977   /* ignore terminal 'X' from partial last codon if present */
30978 
30979   done = FALSE;
30980   if ((!done) && (prot1len)) {
30981     /* prime the cache at a reasonable position near the end */
30982     if (prot1len > 4000) {
30983       StreamCacheSetPosition (&sc, prot1len - 2000);
30984     }
30985     residue1 = StreamCacheGetResidue (&sc);
30986   }
30987   while ((!done) && (prot1len)) {
30988     StreamCacheSetPosition (&sc, prot1len - 1);
30989     residue1 = StreamCacheGetResidue (&sc);
30990     /*
30991     SeqPortSeek (spp, (prot1len - 1), SEEK_SET);
30992     residue1 = SeqPortGetResidue (spp);
30993     */
30994     if (residue1 == 'X') {        /* remove terminal X */
30995       prot1len--;
30996       xcount1++;
30997     }
30998     else
30999       done = TRUE;
31000   }
31001   done = FALSE;
31002   while ((!done) && (len)) {
31003     /*
31004     BSSeek (newprot, (len - 1), SEEK_SET);
31005     residue2 = BSGetByte (newprot);
31006     */
31007     residue2 = protseq [len - 1];
31008     if (residue2 == 'X') {
31009       len--;
31010       xcount2++;
31011     }
31012     else
31013       done = TRUE;
31014   }
31015 
31016   if (xcount1 != xcount2) {
31017     ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_TerminalXDiscrepancy,
31018               "Terminal X count for CDS translation (%ld) and protein product sequence (%ld) are not equal",
31019               (long) xcount2, (long) xcount1);
31020   }
31021 
31022   if (len == prot1len) {        /* could be identical */
31023     StreamCacheSetPosition (&sc, 0);
31024     /*
31025     SeqPortSeek (spp, 0, SEEK_SET);
31026     BSSeek (newprot, 0, SEEK_SET);
31027     */
31028     for (i = 0; i < len; i++) {
31029       residue1 = protseq [i];
31030       residue2 = StreamCacheGetResidue (&sc);
31031       /*
31032       residue1 = BSGetByte (newprot);
31033       residue2 = SeqPortGetResidue (spp);
31034       */
31035       if (residue1 != residue2) {
31036         prot_ok = FALSE;
31037         if (residue2 == INVALID_RESIDUE)
31038           residue2 = '?';
31039         sev = SEV_ERROR;
31040         if (residue2 == 'X') {
31041           if (residue1 == 'B' || residue1 == 'Z' || residue1 == 'J') {
31042             sev = SEV_WARNING;
31043           }
31044         }
31045         if (i == 0) {
31046           if ((sfp->partial) && (!no_beg) && (!no_end)) { /* ok, it's partial */
31047             has_errors = TRUE;
31048             other_than_mismatch = TRUE;
31049             if (report_errors) {
31050               ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_PartialProblem, "Start of location should probably be partial");
31051             }
31052           } else if (residue1 == '-') {
31053             has_errors = TRUE;
31054             other_than_mismatch = TRUE;
31055             if (report_errors) {
31056               if (! got_dash) {
31057                 ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_StartCodon, "Illegal start codon used. Wrong genetic code [%d] or protein should be partial", gccode);
31058               }
31059             }
31060           } else if (residue1 == 'X') {
31061             has_errors = TRUE;
31062             other_than_mismatch = TRUE;
31063             if (report_errors) {
31064               if (! got_x) {
31065                 ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_StartCodon, "Ambiguous start codon used. Wrong genetic code [%d] or protein should be partial", gccode);
31066               }
31067             }
31068             mismatches[mismatch].pos = i;
31069             mismatches[mismatch].cds_residue = residue1;
31070             mismatches[mismatch].prot_residue = residue2;
31071             mismatch++;
31072           } else {
31073             has_errors = TRUE;
31074             mismatches[mismatch].pos = i;
31075             mismatches[mismatch].cds_residue = residue1;
31076             mismatches[mismatch].prot_residue = residue2;
31077             mismatch++;
31078           }
31079         } else {
31080           has_errors = TRUE;
31081           if (mismatch >= 10) {
31082             mismatches[10].pos = i;
31083             mismatches[10].cds_residue = residue1;
31084             mismatches[10].prot_residue = residue2;
31085           } else {
31086             mismatches[mismatch].pos = i;
31087             mismatches[mismatch].cds_residue = residue1;
31088             mismatches[mismatch].prot_residue = residue2;
31089           }
31090           mismatch++;
31091         }
31092       }
31093     }
31094 
31095     if (report_errors && !mismatch_except) {
31096       if (mismatch > 10) {
31097         if (report_errors && !mismatch_except) {
31098           nuclocstr = MapToNTCoords (sfp, protid, mismatches[0].pos);
31099           loc2str = MapToNTCoords (sfp, protid, mismatches[10].pos);
31100           ValidErr (vsp, vsp->is_geneious ? SEV_WARNING : sev, ERR_SEQ_FEAT_MisMatchAA,
31101             "%d mismatches found.  First mismatch at %ld, residue in protein [%c] != translation [%c]%s%s.  Last mismatch at %ld, residue in protein [%c] != translation [%c]%s%s.  Genetic code [%d]",
31102             mismatch,
31103             (long) (mismatches[0].pos + 1), mismatches[0].prot_residue, mismatches[0].cds_residue,
31104             nuclocstr == NULL ? "" : " at ", nuclocstr == NULL ? "" : nuclocstr,
31105             (long) (mismatches[10].pos + 1), mismatches[10].prot_residue, mismatches[10].cds_residue,
31106             loc2str == NULL ? "" : " at ", loc2str == NULL ? "" : loc2str,
31107             gccode);
31108           nuclocstr = MemFree (nuclocstr);
31109           loc2str = MemFree (loc2str);
31110         }
31111       } else {
31112         for (i = 0; i < mismatch; i++) {
31113           nuclocstr = MapToNTCoords (sfp, protid, mismatches[i].pos);
31114           ValidErr (vsp, vsp->is_geneious ? SEV_WARNING : sev, ERR_SEQ_FEAT_MisMatchAA,
31115                     "%sResidue %ld in protein [%c] != translation [%c]%s%s", farstr,
31116                       (long) (mismatches[i].pos + 1),
31117                       (char) mismatches[i].prot_residue,
31118                       (char) mismatches[i].cds_residue,
31119                       nuclocstr == NULL ? "" : " at ",
31120                       nuclocstr == NULL ? "" : nuclocstr);
31121           nuclocstr = MemFree (nuclocstr);
31122         }
31123       }
31124     }
31125 
31126   } else {
31127     has_errors = TRUE;
31128     other_than_mismatch = TRUE;
31129     if (report_errors || (rna_editing && (prot1len < len - 1 || prot1len > len))) {
31130       ValidErr (vsp, rna_editing ? SEV_WARNING : trans_len_sev, ERR_SEQ_FEAT_TransLen,
31131                "Given protein length [%ld] does not match %stranslation length [%ld]%s",
31132                prot1len, farstr, len,
31133                rna_editing ? " (RNA editing present)" : "");
31134     }
31135   }
31136 
31137   if ((sfp->partial) && (!mismatch)) {
31138     if ((!no_beg) && (!no_end)) {       /* just didn't label */
31139       if (!got_stop) {
31140         has_errors = TRUE;
31141         other_than_mismatch = TRUE;
31142         if (report_errors) {
31143           ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_PartialProblem, "End of location should probably be partial");
31144         }
31145       } else {
31146         has_errors = TRUE;
31147         other_than_mismatch = TRUE;
31148         if (report_errors) {
31149           ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_PartialProblem, "This SeqFeat should not be partial");
31150         }
31151       }
31152       show_stop = FALSE;
31153     }
31154   }
31155 
31156 
31157 
31158 erret:
31159   if (unlockProd) {
31160     BioseqUnlock (prot1seq);
31161   }
31162 
31163   if (show_stop) {
31164     if ((!got_stop) && (!no_end)) {
31165       has_errors = TRUE;
31166       other_than_mismatch = TRUE;
31167       if (report_errors) {
31168         ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_NoStop, "Missing stop codon");
31169       }
31170     } else if ((got_stop) && (no_end)) {
31171       has_errors = TRUE;
31172       other_than_mismatch = TRUE;
31173       if (report_errors) {
31174         ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_PartialProblem, "Got stop codon, but 3'end is labeled partial");
31175       }
31176     } else if ((got_stop) && (!no_end) && (ragged)) {
31177       has_errors = TRUE;
31178       other_than_mismatch = TRUE;
31179       sev = SEV_ERROR;
31180       if (unclassified_except) {
31181         sev = SEV_WARNING;
31182       }
31183       if (report_errors || unclassified_except) {
31184         ValidErr (vsp, sev, ERR_SEQ_FEAT_TransLen, "Coding region extends %d base(s) past stop codon", (int) ragged);
31185       }
31186     }
31187   }
31188 
31189   if (!prot_ok) {
31190     if (transl_except) {
31191       has_errors = TRUE;
31192       other_than_mismatch = TRUE;
31193       if (report_errors) {
31194         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_TranslExcept, "Unparsed transl_except qual. Skipped");
31195       }
31196     }
31197   } else {
31198     if (transl_except) {
31199       has_errors = TRUE;
31200       other_than_mismatch = TRUE;
31201       if (report_errors) {
31202         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_TranslExcept, "Unparsed transl_except qual (but protein is okay). Skipped");
31203       }
31204     }
31205   }
31206 
31207   if (prot2seq != NULL)
31208     BioseqFree (prot2seq);
31209   else
31210     BSFree (newprot);
31211   /*
31212   SeqPortFree (spp);
31213   */
31214   MemFree (protseq);
31215   ValNodeFree (codebreakhead);
31216 
31217   if (! report_errors) {
31218     if (! has_errors) {
31219       if ((! frameshift_except) && (! rearrange_except) && (! mixed_population) &&
31220           (! low_quality) && (! artificial_location) && (! annotated_by_transcript_or_proteomic)) {
31221         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnnecessaryException, "CDS has exception but passes translation test");
31222       }
31223     } else if (unclassified_except && (! other_than_mismatch)) {
31224       if (mismatch * 50 <= len) {
31225         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_ErroneousException,
31226                   "CDS has unclassified exception but only difference is %ld mismatches out of %ld residues",
31227                   (long) mismatch, (long) len);
31228       }
31229     } else if (product_replaced) {
31230       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnqualifiedException, "CDS has unqualified translated product replaced exception");
31231     }
31232   }
31233 
31234   TranslateTripletIntrons (vsp, sfp, crp);
31235 }
31236 
31237 
/*****************************************************************************
*
*   mRNAMatchesCompleteCDSEnd (mrna, p5, p3)
*      Looks up the coding region for mrna with GetCDSformRNA and reports
*      whether a non-partial end of that coding region coincides with the
*      corresponding end of the mRNA location.
*
*      *p5 and *p3 (each optional, may be NULL) are first cleared to FALSE.
*      *p5 becomes TRUE when the CDS is not 5' partial and its 5' end equals
*      the mRNA's; *p3 likewise for the 3' end.  Which coordinate counts as
*      the 5' end (start or stop) is chosen from the strand of the mRNA
*      location.  Both stay FALSE when mrna is NULL or no CDS is found.
*
*****************************************************************************/
static void mRNAMatchesCompleteCDSEnd (SeqFeatPtr mrna, BoolPtr p5, BoolPtr p3)
{
  SeqFeatPtr coding;
  Boolean    cds_partial5, cds_partial3;
  Boolean    on_minus;
  Boolean    same_end;
  Uint2      mrna_strand;

  if (p5 != NULL) {
    *p5 = FALSE;
  }
  if (p3 != NULL) {
    *p3 = FALSE;
  }

  coding = GetCDSformRNA (mrna);
  if (mrna == NULL || coding == NULL) {
    return;
  }

  mrna_strand = SeqLocStrand (mrna->location);
  on_minus = (Boolean) (mrna_strand == Seq_strand_minus);

  CheckSeqLocForPartial (coding->location, &cds_partial5, &cds_partial3);

  /* 5' end: stop coordinate on the minus strand, start coordinate otherwise */
  if (p5 != NULL && ! cds_partial5) {
    if (on_minus) {
      same_end = (Boolean) (SeqLocStop (coding->location) == SeqLocStop (mrna->location));
    } else {
      same_end = (Boolean) (SeqLocStart (coding->location) == SeqLocStart (mrna->location));
    }
    if (same_end) {
      *p5 = TRUE;
    }
  }

  /* 3' end: start coordinate on the minus strand, stop coordinate otherwise */
  if (p3 != NULL && ! cds_partial3) {
    if (on_minus) {
      same_end = (Boolean) (SeqLocStart (coding->location) == SeqLocStart (mrna->location));
    } else {
      same_end = (Boolean) (SeqLocStop (coding->location) == SeqLocStop (mrna->location));
    }
    if (same_end) {
      *p3 = TRUE;
    }
  }
}
31284 
31285 
31286 /*****************************************************************************
31287 *
31288 *   SpliceCheck(sfp)
31289 *      checks for GT/AG rule at splice junctions
31290 *
31291 *****************************************************************************/
31292 #define NOVALUE 0
31293 #define HADGT 1
31294 #define NOGT 2
31295 
SpliceCheckEx(ValidStructPtr vsp,SeqFeatPtr sfp,Boolean checkAll)31296 static void SpliceCheckEx (ValidStructPtr vsp, SeqFeatPtr sfp, Boolean checkAll)
31297 {
31298   SeqLocPtr       slp, nxt, head;
31299   Uint1           strand = Seq_strand_unknown;
31300   /*
31301   SeqPortPtr      spp = NULL;
31302   */
31303   SeqIdPtr        last_sip = NULL, sip;
31304   Int2            total, ctr;
31305   BioseqPtr       bsp = NULL;
31306   Int4            strt, stp, len = 0, donor, acceptor;
31307   Int2            residue1, residue2;
31308   Char            tbuf[40];
31309   Boolean         reportAsError, first, last, firstPartial, lastPartial, has_errors = FALSE,
31310                   report_errors = TRUE, checkExonDonor, checkExonAcceptor, pseudo, ribo_slip = FALSE;
31311   int             severity;
31312   Uint2           partialflag;
31313   SeqEntryPtr     sep;
31314   StreamCache     sc;
31315   SeqInt          sint;
31316   ValNode         vn;
31317   SeqMgrFeatContext  context;
31318   SeqFeatPtr      mrna, gene;
31319   GeneRefPtr      grp;
31320   Boolean         ignore_partial_mrna_5 = FALSE, ignore_partial_mrna_3 = FALSE;
31321 
31322   if (sfp == NULL)
31323     return;
31324 
31325   if (GetAppProperty ("NcbiSubutilValidation") != NULL)
31326     return;                     /* suppress if NCBISubValidate */
31327 
31328   /* suppress if organelle */
31329   bsp = BioseqFindFromSeqLoc (sfp->location);
31330   if (bsp != NULL && IsBioseqOrganelle(bsp)) {
31331     return;
31332   }
31333 
31334   /* specific biological exceptions suppress check */
31335 
31336   if (sfp->excpt) {
31337     if (StringISearch (sfp->except_text, "low-quality sequence region") != NULL) {
31338       return;
31339     }
31340     if (StringISearch (sfp->except_text, "ribosomal slippage") != NULL) {
31341       report_errors = FALSE;
31342       ribo_slip = TRUE;
31343     }
31344     if (StringISearch (sfp->except_text, "ribosomal slippage") != NULL||
31345         StringISearch (sfp->except_text, "artificial frameshift") != NULL ||
31346         StringISearch (sfp->except_text, "nonconsensus splice site") != NULL ||
31347         StringISearch (sfp->except_text, "adjusted for low-quality genome") != NULL ||
31348         StringISearch (sfp->except_text, "heterogeneous population sequenced") != NULL ||
31349         StringISearch (sfp->except_text, "low-quality sequence region") != NULL ||
31350         StringISearch (sfp->except_text, "artificial location") != NULL) {
31351       report_errors = FALSE;
31352     }
31353   }
31354 
31355   MemSet ((Pointer) &sint, 0, sizeof (SeqInt));
31356   MemSet ((Pointer) &vn, 0, sizeof (ValNode));
31357 
31358   head = sfp->location;
31359   if (head == NULL)
31360     return;
31361 
31362   if (LocationIsFar (sfp->location) && NoFetchFunctions ()) {
31363     vsp->far_fetch_failure = TRUE;
31364     return;
31365   }
31366 
31367   reportAsError = FALSE;
31368   if (GetAppProperty ("SpliceValidateAsError") != NULL) {
31369     reportAsError = TRUE;
31370   }
31371 
31372   slp = NULL;
31373   total = 0;
31374   while ((slp = SeqLocFindPart (head, slp, EQUIV_IS_ONE)) != NULL) {
31375     total++;
31376     if (slp->choice == SEQLOC_EQUIV)
31377       return;                   /* bail on this one */
31378     if (total == 1)
31379       strand = SeqLocStrand (slp);
31380     else {
31381       if (strand != SeqLocStrand (slp)) /* bail on mixed strand */
31382         return;
31383     }
31384   }
31385 
31386   if ((!checkAll) && total < 2)
31387     return;
31388   if (total < 1)
31389     return;
31390 
31391   slp = NULL;
31392   ctr = 0;
31393 
31394   first = TRUE;
31395   last = FALSE;
31396   firstPartial = FALSE;
31397   lastPartial = FALSE;
31398 
31399   if (sfp->idx.subtype == FEATDEF_mRNA) {
31400     mRNAMatchesCompleteCDSEnd (sfp, &ignore_partial_mrna_5, &ignore_partial_mrna_3);
31401   }
31402 
31403 
31404   /* genomic product set or NT_ contig always relaxes to SEV_WARNING */
31405 
31406   sep = vsp->sep;
31407 
31408   slp = SeqLocFindPart (head, slp, EQUIV_IS_ONE);
31409   while (slp != NULL) {
31410     nxt = SeqLocFindPart (head, slp, EQUIV_IS_ONE);
31411     last = (Boolean) (nxt == NULL);
31412     partialflag = SeqLocPartialCheck (slp);
31413     firstPartial = (Boolean) (first && (partialflag & SLP_START));
31414     lastPartial = (Boolean) (last && (partialflag & SLP_STOP));
31415     ctr++;
31416     sip = SeqLocId (slp);
31417     if (sip == NULL)
31418       break;
31419 
31420     bsp = BioseqFind (sip);
31421 
31422     if ((ctr == 1) || (!SeqIdMatch (sip, last_sip))) {
31423       /* spp = SeqPortFree (spp); */
31424       bsp = NULL;
31425       if (sip != NULL && (sip->choice != SEQID_GI || sip->data.intvalue > 0)) {
31426         bsp = BioseqLockById (sip);
31427       }
31428       if (bsp == NULL)
31429         break;
31430       len = bsp->length;
31431       if (strand != Seq_strand_minus) {
31432         if (! StreamCacheSetup (bsp, NULL, EXPAND_GAPS_TO_DASHES, &sc)) {
31433           //LCOV_EXCL_START
31434           //C Toolkit specific
31435           BioseqUnlock (bsp);
31436           break;
31437           //LCOV_EXCL_STOP
31438         }
31439       } else {
31440         sint.from = 0;
31441         sint.to = len - 1;
31442         sint.strand = strand;
31443         sint.id = sip;
31444         vn.choice = SEQLOC_INT;
31445         vn.data.ptrvalue = (Pointer) &sint;
31446         vn.next = NULL;
31447         if (! StreamCacheSetup (NULL, &vn, EXPAND_GAPS_TO_DASHES, &sc)) {
31448           //LCOV_EXCL_START
31449           //C Toolkit specific
31450           BioseqUnlock(bsp);
31451           break;
31452           //LCOV_EXCL_STOP
31453         }
31454       }
31455       /* spp = SeqPortNew (bsp, 0, -1, strand, Seq_code_ncbi4na); */
31456       BioseqUnlock (bsp);
31457       /*
31458       if (spp == NULL)
31459         break;
31460       */
31461       last_sip = sip;
31462     }
31463 
31464     acceptor = SeqLocStart (slp);
31465     donor = SeqLocStop (slp);
31466 
31467     if (acceptor < 0 || acceptor >= len || donor < 0 || donor >= len) {
31468       /*
31469       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_Range,
31470                 "Unable to check splice consensus because feature outside range of sequence");
31471       */
31472       return;
31473     }
31474 
31475     if (strand != Seq_strand_minus) {
31476       strt = acceptor;
31477       stp = donor;
31478     } else {
31479       strt = donor;
31480       donor = acceptor;
31481       acceptor = strt;
31482       stp = len - donor - 1;    /* orient to reverse complement seqport */
31483       strt = len - acceptor - 1;
31484     }
31485 
31486     checkExonDonor = FALSE;
31487     checkExonAcceptor = FALSE;
31488     if (checkAll) {
31489       pseudo = FALSE;
31490       grp = SeqMgrGetGeneXref (sfp);
31491       if (grp == NULL) {
31492         gene = SeqMgrGetOverlappingGene (sfp->location, &context);
31493         if (gene != NULL) {
31494           pseudo = gene->pseudo;
31495         }
31496       }
31497       if (! pseudo) {
31498         checkExonDonor = TRUE;
31499         checkExonAcceptor = TRUE;
31500         mrna = SeqMgrGetOverlappingmRNA (sfp->location, &context);
31501         if (mrna != NULL /* && (! mrna->partial) */ ) {
31502           if (strand != Seq_strand_minus) {
31503             if (donor == SeqLocStop (mrna->location) && (! context.partialR)) {
31504               checkExonDonor = FALSE;
31505             }
31506             if (acceptor == SeqLocStart (mrna->location) && (! context.partialL)) {
31507               checkExonAcceptor = FALSE;
31508             }
31509           } else {
31510             if (donor == SeqLocStart (mrna->location) && (! context.partialR)) {
31511               checkExonDonor = FALSE;
31512             }
31513             if (acceptor == SeqLocStop (mrna->location) && (! context.partialL)) {
31514               checkExonAcceptor = FALSE;
31515             }
31516           }
31517         }
31518       }
31519     }
31520 
31521     if (((checkExonDonor && (!lastPartial))
31522          || ctr < total
31523          || (ctr == total && lastPartial && (sfp->idx.subtype != FEATDEF_mRNA || !ignore_partial_mrna_3) && sfp->idx.subtype != FEATDEF_exon))
31524         && (stp < (len - 2)))
31525     {   /* check donor on all but last exon and on sequence */
31526       tbuf[0] = '\0';
31527       StreamCacheSetPosition (&sc, stp + 1);
31528       residue1 = StreamCacheGetResidue (&sc);
31529       residue2 = StreamCacheGetResidue (&sc);
31530       /*
31531       SeqPortSeek (spp, (stp + 1), SEEK_SET);
31532       residue1 = SeqPortGetResidue (spp);
31533       residue2 = SeqPortGetResidue (spp);
31534       */
31535       if (residue1 == '-' && residue2 == '-') {
31536         /* ignore gap, and suppress UnnecessaryException message */
31537         has_errors = TRUE;
31538       } else if (IS_residue (residue1) && IS_residue (residue2)) {
31539         if (ConsistentWithG ((Char)residue1) && ConsistentWithT ((Char)residue2)) {
31540         } else {        /* not T */
31541           if (residue1 == 'G' && residue2 == 'C') {       /* GC minor splice site */
31542             tbuf[0] = '\0';
31543             if (bsp == NULL) {
31544               StringCpy (tbuf, "?");
31545             } else if (vsp->suppressContext || vsp->convertGiToAccn) {
31546                 //LCOV_EXCL_START
31547                 // option not used
31548                 WorstBioseqLabel(bsp, tbuf, 39, OM_LABEL_CONTENT);
31549                 //LCOV_EXCL_STOP
31550             } else {
31551               BioseqLabel (bsp, tbuf, 39, OM_LABEL_CONTENT);
31552             }
31553             tbuf[39] = '\0';
31554             if (RareConsensusNotExpected (sfp)) {
31555               has_errors = TRUE;
31556               if (report_errors) {
31557                 ValidErr (vsp, SEV_INFO, ERR_SEQ_FEAT_RareSpliceConsensusDonor,
31558                           "Rare splice donor consensus (GC) found instead of (GT) after exon ending at position %ld of %s", (long) (donor + 1), tbuf);
31559               }
31560             }
31561           } else {
31562             if (checkExonDonor) {
31563               severity = SEV_WARNING;
31564             } else if (reportAsError) {
31565               severity = SEV_ERROR;
31566             } else {
31567               severity = SEV_WARNING;
31568             }
31569             tbuf[0] = '\0';
31570             if (bsp == NULL) {
31571               StringCpy (tbuf, "?");
31572             } else if (vsp->suppressContext || vsp->convertGiToAccn) {
31573                 //LCOV_EXCL_START
31574                 // option not used
31575                 WorstBioseqLabel(bsp, tbuf, 39, OM_LABEL_CONTENT);
31576                 //LCOV_EXCL_STOP
31577             } else {
31578               BioseqLabel (bsp, tbuf, 39, OM_LABEL_CONTENT);
31579             }
31580             tbuf[39] = '\0';
31581             has_errors = TRUE;
31582             if (report_errors) {
31583               ValidErr (vsp, severity, ERR_SEQ_FEAT_NotSpliceConsensusDonor,
31584                         "Splice donor consensus (GT) not found after exon ending at position %ld of %s", (long) (donor + 1), tbuf);
31585             }
31586           }
31587         }
31588       } else {
31589         //LCOV_EXCL_START
31590         //StreamCacheGetResidue converts bad residues to "good"
31591         has_errors = TRUE;
31592         if (report_errors) {
31593           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_NotSpliceConsensusDonor,
31594                     "Bad sequence at splice donor after exon ending at position %ld of %s", (long) (donor + 1), tbuf);
31595         }
31596         //LCOV_EXCL_STOP
31597       }
31598     }
31599 
31600     if (((checkExonAcceptor && (!firstPartial))
31601          || ctr != 1
31602          || (ctr == 1 && firstPartial && (sfp->idx.subtype != FEATDEF_mRNA || !ignore_partial_mrna_5) && sfp->idx.subtype != FEATDEF_exon))
31603         && (strt > 1))
31604     {
31605       StreamCacheSetPosition (&sc, strt - 2);
31606       residue1 = StreamCacheGetResidue (&sc);
31607       residue2 = StreamCacheGetResidue (&sc);
31608       /*
31609       SeqPortSeek (spp, (strt - 2), SEEK_SET);
31610       residue1 = SeqPortGetResidue (spp);
31611       residue2 = SeqPortGetResidue (spp);
31612       */
31613       if (residue1 == '-' && residue2 == '-') {
31614         /* ignore gap, and suppress UnnecessaryException message */
31615         has_errors = TRUE;
31616       } else if (IS_residue (residue1) && IS_residue (residue2)) {
31617         if (ConsistentWithA ((Char)residue1) && ConsistentWithG ((Char)residue2)) {
31618         } else {
31619           if (checkExonAcceptor) {
31620             severity = SEV_WARNING;
31621           } else if (reportAsError) {
31622             severity = SEV_ERROR;
31623           } else {
31624             severity = SEV_WARNING;
31625           }
31626           tbuf[0] = '\0';
31627           if (bsp == NULL) {
31628             StringCpy (tbuf, "?");
31629             SeqIdWrite (sip, tbuf, PRINTID_FASTA_SHORT, 39);
31630           } else if (vsp->suppressContext || vsp->convertGiToAccn) {
31631               //LCOV_EXCL_START
31632               // option not used
31633               WorstBioseqLabel(bsp, tbuf, 39, OM_LABEL_CONTENT);
31634               //LCOV_EXCL_STOP
31635           } else {
31636             BioseqLabel (bsp, tbuf, 39, OM_LABEL_CONTENT);
31637           }
31638           tbuf[39] = '\0';
31639           has_errors = TRUE;
31640           if (report_errors) {
31641             ValidErr (vsp, severity, ERR_SEQ_FEAT_NotSpliceConsensusAcceptor,
31642                       "Splice acceptor consensus (AG) not found before exon starting at position %ld of %s", (long) (acceptor + 1), tbuf);
31643           }
31644         }
31645       } else {
31646         //LCOV_EXCL_START
31647         //StreamCacheGetResidue converts bad residues to "good"
31648         has_errors = TRUE;
31649         if (report_errors) {
31650           ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_NotSpliceConsensusAcceptor,
31651                     "Bad sequence at splice acceptor before exon starting at position %ld of %s", (long) (acceptor + 1), tbuf);
31652         }
31653         //LCOV_EXCL_STOP
31654       }
31655     }
31656 
31657     first = FALSE;
31658     slp = nxt;
31659   }
31660 
31661   /* SeqPortFree (spp); */
31662 
31663   if (! report_errors) {
31664     if (! has_errors) {
31665       if (! ribo_slip) {
31666         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_UnnecessaryException, "feature has exception but passes splice site test");
31667       }
31668     }
31669   }
31670 }
31671 
31672 //LCOV_EXCL_START
31673 //lcov is just being weird
SpliceCheck(ValidStructPtr vsp,SeqFeatPtr sfp)31674 NLM_EXTERN void SpliceCheck (ValidStructPtr vsp, SeqFeatPtr sfp)
31675 {
31676   SpliceCheckEx (vsp, sfp, FALSE);
31677 }
31678 //LCOV_EXCL_STOP
31679 
31680 /*****************************************************************************
31681 *
31682 *   CdsProductIdCheck (vsp, sfp)
31683 *      code taken from asn2gnbk.c - release mode expects CDS product Bioseqs
31684 *
31685 *****************************************************************************/
/*
 * Reports ERR_SEQ_FEAT_MissingCDSproduct for a CDS without a product,
 * unless one of the following holds:
 *   - GetGeneByFeat flags the feature as pseudo;
 *   - the location is 5' partial, not 3' partial, and at most 5 long
 *     (treated as "just a stop");
 *   - the exception text contains "rearrangement required for product".
 * Severity is SEV_WARNING when vsp->is_geneious is set, otherwise SEV_ERROR.
 */
static void CdsProductIdCheck (ValidStructPtr vsp, SeqFeatPtr sfp)

{
  Boolean  isPseudo = FALSE;
  Boolean  isSuppressed = FALSE;
  Boolean  fivePrimePartial;
  Boolean  threePrimePartial;
  Boolean  onlyStop = FALSE;

  /* non-pseudo CDS must have /product; only the pseudo flag is needed here */
  GetGeneByFeat (sfp, &isPseudo, &isSuppressed);

  if (sfp->location != NULL
      && CheckSeqLocForPartial (sfp->location, &fivePrimePartial, &threePrimePartial)
      && fivePrimePartial
      && (! threePrimePartial)
      && SeqLocLen (sfp->location) <= 5) {
    onlyStop = TRUE;
  }

  if (isPseudo || onlyStop) return;
  if (sfp->product != NULL) return;

  if (sfp->excpt
      && (! StringHasNoText (sfp->except_text))
      && StringStr (sfp->except_text, "rearrangement required for product") != NULL) {
    return;
  }

  ValidErr (vsp, vsp->is_geneious ? SEV_WARNING : SEV_ERROR,
            ERR_SEQ_FEAT_MissingCDSproduct, "Expected CDS product absent");
}
31724 
31725 /*****************************************************************************
31726 *
31727 *   ValidateSeqLoc(vsp, slp, prefix)
31728 *
31729 *****************************************************************************/
31730 
SeqLocMixCount(SeqLocPtr slp)31731 static Int2 SeqLocMixCount (SeqLocPtr slp)
31732 
31733 {
31734   Int2       count = 0;
31735   SeqLocPtr  loc;
31736 
31737   if (slp == NULL) return 0;
31738 
31739   while (slp != NULL) {
31740     if (slp->choice == SEQLOC_MIX) {
31741       count++;
31742       loc = (SeqLocPtr) slp->data.ptrvalue;
31743       count += SeqLocMixCount (loc);
31744     }
31745     slp = slp->next;
31746   }
31747 
31748   return count;
31749 }
31750 
ValidateSeqLoc(ValidStructPtr vsp,SeqLocPtr slp,Boolean report_abutting,CharPtr prefix)31751 NLM_EXTERN void ValidateSeqLoc (ValidStructPtr vsp, SeqLocPtr slp, Boolean report_abutting, CharPtr prefix)
31752 {
31753   SeqLocPtr       tmp, prev;
31754   Boolean         retval = TRUE, tmpval, mixed_strand = FALSE, unmarked_strand = FALSE,
31755                   ordered = TRUE, adjacent = FALSE, circular = FALSE, exception = FALSE,
31756                   bad = FALSE, has_other = FALSE, has_not_other = FALSE,
31757                   inconsistent_others = FALSE;
31758   CharPtr         ctmp;
31759   Uint1           strand2 = 0, strand1;
31760   ErrSev          sev, oldsev;
31761   SeqIntPtr       sip1, sip2, prevsip;
31762   SeqBondPtr      sbp;
31763   SeqPntPtr       spp;
31764   PackSeqPntPtr   pspp;
31765   SeqIdPtr        id1 = NULL, id2 = NULL;
31766   BioseqPtr       bsp;
31767   SeqFeatPtr      sfp = NULL;
31768   Int2            zeroGi = 0;
31769   Char            buf [32];
31770   SeqIdPtr        sip;
31771 
31772   if (slp == NULL)
31773     return;
31774 
31775   sfp = vsp->sfp;
31776 
31777   tmp = NULL;
31778   while ((tmp = SeqLocFindNext (slp, tmp)) != NULL) {
31779     sip = SeqLocId (tmp);
31780     if (sip != NULL && sip->choice == SEQID_GI && sip->data.intvalue <= 0) {
31781       zeroGi++;
31782     }
31783   }
31784   if (zeroGi > 0) {
31785     StringCpy (buf, "?");
31786     bsp = vsp->bsp;
31787     if (bsp != NULL) {
31788       SeqIdWrite (bsp->id, buf, PRINTID_FASTA_LONG, sizeof (buf) - 1);
31789     }
31790     if (zeroGi > 1) {
31791       ValidErr (vsp, SEV_REJECT, ERR_SEQ_FEAT_FeatureLocationIsGi0, "Feature has %d gi|0 locations on Bioseq %s",
31792                 (int) zeroGi, buf);
31793     } else if (zeroGi > 0) {
31794       ValidErr (vsp, SEV_REJECT, ERR_SEQ_FEAT_FeatureLocationIsGi0, "Feature has %d gi|0 location on Bioseq %s",
31795                 (int) zeroGi, buf);
31796     }
31797   }
31798 
31799   bsp = BioseqFindFromSeqLoc (slp);
31800   if (bsp != NULL && bsp->topology == 2) {
31801     circular = TRUE;
31802   }
31803 
31804   if (SeqLocMixCount (slp) > 1) {
31805       //LCOV_EXCL_START
31806       //C code fails before location with multiple mixes can be validated
31807       retval = FALSE;
31808       ctmp = SeqLocPrint (slp);
31809       if (ctmp != NULL && StringLen (ctmp) > 800) {
31810         StringCpy (ctmp + 797, "...");
31811       }
31812       ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_NestedSeqLocMix, "%s: SeqLoc [%s] has nested SEQLOC_MIX elements", prefix, ctmp);
31813       MemFree (ctmp);
31814       //LCOV_EXCL_STOP
31815   }
31816 
31817   tmp = NULL;
31818   prev = NULL;
31819   sip1 = NULL;
31820   prevsip = NULL;
31821   strand1 = Seq_strand_other;
31822   while ((tmp = SeqLocFindNext (slp, tmp)) != NULL) {
31823     tmpval = TRUE;
31824     switch (tmp->choice) {
31825     case SEQLOC_INT:
31826       sip1 = prevsip;
31827       sip2 = (SeqIntPtr) (tmp->data.ptrvalue);
31828       strand2 = sip2->strand;
31829       id2 = sip2->id;
31830 
31831       if (sip2->from == sip2->to) {
31832         /*
31833         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_SeqLocTypeProblem, "Seq-loc.int has identical from and to values, should be Seq-loc.pt");
31834         */
31835       }
31836 
31837       /* for SQD-663 */
31838       if (sip2->if_from != NULL && sip2->if_to != NULL) {
31839         if (sip2->if_from->choice == sip2->if_to->choice && sip2->if_from->choice == 4) {
31840           if(sip2->if_from->a == sip2->if_to->a) {
31841             if (sip2->if_from->a == 4) {
31842               ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidFuzz,
31843                     "Should not specify 'space to left' for both ends of interval");
31844             } else if (sip2->if_from->a == 3) {
31845               ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_InvalidFuzz,
31846                     "Should not specify 'space to right' for both ends of interval");
31847             }
31848           }
31849         }
31850       }
31851 
31852       tmpval = SeqIntCheck (sip2);
31853       if ((tmpval) && (sip1 != NULL)) {
31854         if (SeqIdForSameBioseq (sip1->id, sip2->id)) {
31855           if (strand2 == Seq_strand_minus) {
31856             if (sip1->to < sip2->to && (! circular)) {
31857               ordered = FALSE;
31858             }
31859             if (sip2->to + 1 == sip1->from) {
31860               adjacent = TRUE;
31861             }
31862           } else {
31863             if (sip1->to > sip2->to && (! circular)) {
31864               ordered = FALSE;
31865             }
31866             if (sip1->to + 1 == sip2->from) {
31867               adjacent = TRUE;
31868             }
31869           }
31870         }
31871       }
31872       if (prevsip != NULL) {
31873         if (SeqIdForSameBioseq (prevsip->id, sip2->id)) {
31874           if (prevsip->strand == sip2->strand && prevsip->from == sip2->from && prevsip->to == sip2->to) {
31875             ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_DuplicateInterval, "Duplicate exons in location");
31876           }
31877         }
31878       }
31879       prevsip = sip2;
31880       break;
31881     case SEQLOC_PNT:
31882       spp = (SeqPntPtr) (tmp->data.ptrvalue);
31883       strand2 = spp->strand;
31884       id2 = spp->id;
31885       tmpval = SeqPntCheck (spp);
31886       prevsip = NULL;
31887       break;
31888     case SEQLOC_PACKED_PNT:
31889       pspp = (PackSeqPntPtr) (tmp->data.ptrvalue);
31890       strand2 = pspp->strand;
31891       id2 = pspp->id;
31892       tmpval = PackSeqPntCheck (pspp);
31893       prevsip = NULL;
31894       break;
31895     case SEQLOC_BOND:
31896       sbp = (SeqBondPtr) tmp->data.ptrvalue;
31897       if (sbp != NULL) {
31898         spp = (SeqPntPtr) sbp->a;
31899         if (spp != NULL) {
31900           tmpval = SeqPntCheck (spp);
31901         }
31902         /* if already failed, no need to check second point */
31903         if (tmpval) {
31904           spp = (SeqPntPtr) sbp->b;
31905           if (spp != NULL) {
31906             tmpval = SeqPntCheck (spp);
31907           }
31908         }
31909       }
31910     case SEQLOC_NULL:
31911       break;
31912     default:
31913       strand2 = Seq_strand_other;
31914       id2 = NULL;
31915       prevsip = NULL;
31916       break;
31917     }
31918     if (strand2 == Seq_strand_other) {
31919       has_other = TRUE;
31920     } else if (strand2 == Seq_strand_plus || strand2 == Seq_strand_minus) {
31921       has_not_other = TRUE;
31922     }
31923     if (!tmpval) {
31924       retval = FALSE;
31925       ctmp = SeqLocPrint (tmp);
31926       if (ctmp != NULL && StringLen (ctmp) > 800) {
31927         StringCpy (ctmp + 797, "...");
31928       }
31929       ValidErr (vsp, SEV_REJECT, ERR_SEQ_FEAT_Range, "%s: SeqLoc [%s] out of range", prefix, ctmp);
31930       MemFree (ctmp);
31931 
31932     }
31933 
31934     if (tmp->choice != SEQLOC_NULL) {
31935       if ((strand1 != Seq_strand_other) && (strand2 != Seq_strand_other)) {
31936         if (SeqIdForSameBioseq (id1, id2)) {
31937           if (strand1 != strand2) {
31938             if (strand1 == Seq_strand_plus && strand2 == Seq_strand_unknown) {
31939               unmarked_strand = TRUE;
31940             } else if (strand1 == Seq_strand_unknown && strand2 == Seq_strand_plus) {
31941               unmarked_strand = TRUE;
31942             } else {
31943               mixed_strand = TRUE;
31944             }
31945           }
31946         }
31947       }
31948       if (has_other && has_not_other) {
31949         if (strand1 != strand2) {
31950           inconsistent_others = TRUE;
31951         }
31952       }
31953 
31954       strand1 = strand2;
31955       id1 = id2;
31956     }
31957   }
31958 
31959   if (sfp != NULL) {
31960 
31961     /* Publication intervals ordering does not matter */
31962 
31963     if (sfp->idx.subtype == FEATDEF_PUB) {
31964       ordered = TRUE;
31965       adjacent = FALSE;
31966     }
31967 
31968     /* ignore ordering of heterogen bonds */
31969 
31970     if (sfp->data.choice == SEQFEAT_HET) {
31971       ordered = TRUE;
31972       adjacent = FALSE;
31973     }
31974 
31975     /* misc_recomb intervals SHOULD be in reverse order */
31976 
31977     if (sfp->idx.subtype == FEATDEF_misc_recomb) {
31978       ordered = TRUE;
31979     }
31980 
31981     /* primer_bind intervals MAY be in on opposite strands */
31982 
31983     if (sfp->idx.subtype == FEATDEF_primer_bind) {
31984       mixed_strand = FALSE;
31985       unmarked_strand = FALSE;
31986       ordered = TRUE;
31987     }
31988 
31989     if (sfp->excpt) {
31990       exception = TRUE;
31991     }
31992   }
31993 
31994   if (adjacent && report_abutting) {
31995     ctmp = SeqLocPrint (slp);
31996     if (exception) {
31997       sev = SEV_WARNING;
31998     } else {
31999       sev = SEV_ERROR;
32000     }
32001     if (ctmp != NULL && StringLen (ctmp) > 800) {
32002       StringCpy (ctmp + 797, "...");
32003     }
32004     ValidErr (vsp, sev, ERR_SEQ_FEAT_AbuttingIntervals, "%s: Adjacent intervals in SeqLoc [%s]", prefix, ctmp);
32005     MemFree (ctmp);
32006   }
32007 
32008   if (exception) {
32009     /* trans splicing exception turns off both mixed_strand and out_of_order messages */
32010     if (StringISearch (sfp->except_text, "trans-splicing") != NULL) {
32011       return;
32012     }
32013   }
32014 
32015   if (mixed_strand || unmarked_strand || (!ordered) || inconsistent_others) {
32016     ctmp = SeqLocPrint (slp);
32017     if (ctmp != NULL && StringLen (ctmp) > 800) {
32018       StringCpy (ctmp + 797, "...");
32019     }
32020     sev = SEV_ERROR;
32021     if (vsp->is_small_genome_set) {
32022       sev = SEV_WARNING;
32023     }
32024     if (mixed_strand) {
32025       if (vsp->is_small_genome_set) {
32026         ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_MixedStrand, "%s: Mixed strands in SeqLoc [%s] in small genome set - set trans-splicing exception if appropriate", prefix, ctmp);
32027       } else {
32028         ValidErr (vsp, vsp->is_geneious ? SEV_WARNING : SEV_ERROR,
32029                   ERR_SEQ_FEAT_MixedStrand, "%s: Mixed strands in SeqLoc [%s]", prefix, ctmp);
32030       }
32031     } else if (unmarked_strand) {
32032       ValidErr (vsp, SEV_WARNING, ERR_SEQ_FEAT_MixedStrand, "%s: Mixed plus and unknown strands in SeqLoc [%s]", prefix, ctmp);
32033     }
32034     if (!ordered) {
32035       ValidErr (vsp, sev, ERR_SEQ_FEAT_SeqLocOrder, "%s: Intervals out of order in SeqLoc [%s]", prefix, ctmp);
32036     }
32037     if (inconsistent_others) {
32038       ValidErr (vsp, sev, ERR_SEQ_FEAT_MixedStrand, "%s: Inconsistent use of other strand SeqLoc [%s]", prefix, ctmp);
32039     }
32040     MemFree (ctmp);
32041     return;
32042   }
32043 
32044   if (sfp != NULL) {
32045 
32046     /* ignore special case features here as well */
32047 
32048     if (sfp->idx.subtype == FEATDEF_PUB ||
32049         sfp->data.choice == SEQFEAT_HET ||
32050         sfp->idx.subtype == FEATDEF_misc_recomb ||
32051         sfp->idx.subtype == FEATDEF_primer_bind)
32052       return;
32053   }
32054 
32055   /* newer check for intervals out of order on segmented bioseq */
32056 
32057   if (bsp == NULL || bsp->repr != Seq_repr_seg) return;
32058 //LCOV_EXCL_START
32059 // Only for SegSets
32060 
32061   oldsev = ErrSetMessageLevel (SEV_ERROR);
32062   bad = SeqLocBadSortOrder (bsp, slp);
32063   ErrSetMessageLevel (oldsev);
32064   if (bad) {
32065     ctmp = SeqLocPrint (slp);
32066     if (ctmp != NULL && StringLen (ctmp) > 800) {
32067       StringCpy (ctmp + 797, "...");
32068     }
32069     ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_SeqLocOrder, "%s: Intervals out of order in SeqLoc [%s]", prefix, ctmp);
32070     MemFree (ctmp);
32071   }
32072 
32073   /* newer check for mixed strand on segmented bioseq */
32074 
32075   oldsev = ErrSetMessageLevel (SEV_ERROR);
32076   bad = SeqLocMixedStrands (bsp, slp);
32077   ErrSetMessageLevel (oldsev);
32078   if (bad) {
32079     ctmp = SeqLocPrint (slp);
32080     if (ctmp != NULL && StringLen (ctmp) > 800) {
32081       StringCpy (ctmp + 797, "...");
32082     }
32083     ValidErr (vsp, SEV_ERROR, ERR_SEQ_FEAT_MixedStrand, "%s: Mixed strands in SeqLoc [%s]", prefix, ctmp);
32084     MemFree (ctmp);
32085   }
32086 //LCOV_EXCL_STOP
32087 }
32088 
32089 /*****************************************************************************
32090 *
32091 *   SeqGraph validation section
32092 *
32093 *****************************************************************************/
32094 
32095 typedef struct gphgetdata
32096 {
32097   ValNodePtr      vnp;
32098   BioseqPtr       bsp;
32099 }
32100 GphGetData     , PNTR GphGetPtr;
32101 
32102 typedef struct grphitem
32103 {
32104   SeqGraphPtr     sgp;
32105   Int4            left;
32106   Int4            right;
32107   Int2            index;
32108 }
32109 GrphItem       , PNTR GrphItemPtr;
32110 
/*
 * Graph visitor callback. userdata is a GphGetPtr.
 * A graph is recorded when its title is "Phrap Quality", "Phred Quality"
 * or "Gap4" (case-insensitive), sgp->flags[2] is 3 (byte data) and the id
 * of its location is among the ids of ggp->bsp. For each such graph a new
 * GrphItem holding the graph and its left/right offsets on the Bioseq is
 * appended to ggp->vnp.
 */
static void GetGraphsProc (SeqGraphPtr sgp, Pointer userdata)
{
  GphGetPtr       collector = (GphGetPtr) userdata;
  GrphItemPtr     item;

  if (collector == NULL || sgp == NULL) return;

  /* only phrap or gap4 currently allowed */
  if (StringICmp (sgp->title, "Phrap Quality") != 0 &&
      StringICmp (sgp->title, "Phred Quality") != 0 &&
      StringICmp (sgp->title, "Gap4") != 0) {
    return;
  }

  /* data type must be bytes */
  if (sgp->flags[2] != 3) return;

  if (! SeqIdIn (SeqLocId (sgp->loc), collector->bsp->id)) return;

  item = (GrphItemPtr) MemNew (sizeof (GrphItem));
  if (item == NULL) return;

  item->sgp = sgp;
  item->left = GetOffsetInBioseq (sgp->loc, collector->bsp, SEQLOC_LEFT_END);
  item->right = GetOffsetInBioseq (sgp->loc, collector->bsp, SEQLOC_RIGHT_END);
  ValNodeAddPointer (&(collector->vnp), 0, (Pointer) item);
}
32134 
SortSeqGraphProc(VoidPtr ptr1,VoidPtr ptr2)32135 static int LIBCALLBACK SortSeqGraphProc (VoidPtr ptr1, VoidPtr ptr2)
32136 {
32137   GrphItemPtr     gip1, gip2;
32138   ValNodePtr      vnp1, vnp2;
32139 
32140   if (ptr1 == NULL || ptr2 == NULL)
32141     return 0;
32142   vnp1 = *((ValNodePtr PNTR) ptr1);
32143   vnp2 = *((ValNodePtr PNTR) ptr2);
32144   if (vnp1 == NULL || vnp2 == NULL)
32145     return 0;
32146   gip1 = (GrphItemPtr) vnp1->data.ptrvalue;
32147   gip2 = (GrphItemPtr) vnp2->data.ptrvalue;
32148   if (gip1 == NULL || gip2 == NULL)
32149     return 0;
32150   if (gip1->left > gip2->left) {
32151     return 1;
32152   } else if (gip1->left < gip2->left) {
32153     return -1;
32154   } else if (gip1->right > gip2->right) {
32155     return -1;
32156   } else if (gip2->right < gip2->right) {
32157     return 1;
32158   }
32159   return 0;
32160 }
32161 
32162 /* gets valnode list of sorted graphs in GrphItem structures */
32163 
GetSeqGraphsOnBioseq(Uint2 entityID,BioseqPtr bsp)32164 static ValNodePtr GetSeqGraphsOnBioseq (Uint2 entityID, BioseqPtr bsp)
32165 {
32166   GphGetData   ggd;
32167   GrphItemPtr  gip;
32168   Int2         index;
32169   ValNodePtr   vnp;
32170 
32171   ggd.vnp = NULL;
32172   ggd.bsp = bsp;
32173   VisitGraphsOnBsp (bsp, (Pointer) &ggd, GetGraphsProc);
32174   for (vnp = ggd.vnp, index = 1; vnp != NULL; vnp = vnp->next, index++) {
32175     gip = (GrphItemPtr) vnp->data.ptrvalue;
32176     if (gip != NULL) {
32177       gip->index = index;
32178     }
32179   }
32180   ggd.vnp = ValNodeSort (ggd.vnp, SortSeqGraphProc);
32181   return ggd.vnp;
32182 }
32183 
NextLitLength(DeltaSeqPtr next,Int4Ptr lenp)32184 static Boolean NextLitLength (DeltaSeqPtr next, Int4Ptr lenp)
32185 
32186 {
32187   SeqLitPtr  slp;
32188 
32189   if (lenp == NULL) return FALSE;
32190   *lenp = 0;
32191   if (next == NULL || next->choice != 2) return FALSE;
32192   slp = (SeqLitPtr) next->data.ptrvalue;
32193   if (slp == NULL || slp->seq_data == NULL) return FALSE;
32194   *lenp = slp->length;
32195   return TRUE;
32196 }
32197 
/*
 * ValidateGraphsOnBioseq
 *
 * Validates the quality-score SeqGraphs collected by GetSeqGraphsOnBioseq
 * for the Bioseq in gcp->thisitem, reporting problems through ValidErr on
 * the ValidStruct in gcp->userdata.
 *
 * Nothing is done for non-nucleotide Bioseqs, for Bioseqs for which
 * SeqMgrGetParentOfPart returns non-NULL, or when no graphs are collected.
 *
 * The work is done in four stages:
 *   1. per-graph checks (min/max range, ByteStore length) plus detection of
 *      reordering by the sort and of overlapping intervals;
 *   2. comparison of the summed graph length with the sequence length;
 *   3. for delta sequences with more than one graph, a component-by-component
 *      comparison of graphs with delta components;
 *   4. a residue-by-residue scan pairing each base with its score byte.
 *
 * While graphs are being examined gcp->thistype is set to OBJ_SEQGRAPH and
 * gcp->itemID to the graph in question; both are restored before the
 * summary messages of stage 4 are issued.
 */
static void ValidateGraphsOnBioseq (GatherContextPtr gcp)
{
  /* buffer for score bytes, refilled from the graph's ByteStore as needed */
  Byte            scores [400];
  ByteStorePtr    bs;
  BioseqPtr       bsp;
  Int2            k, val, index, scount;
  /* min and max are accumulated in stage 1 but not read again in this function */
  Int4            curroffset = 0, gphlen = 0, seqlen = 0, slplen,
                  bslen, min = INT4_MAX, max = INT4_MIN, j, lastloc = -1,
                  numBases, NsWithScore, GapsWithScore, ACGTsWithoutScore,
                  ambigWithoutScore, valsBelowMin, valsAboveMax,
                  firstN, firstACGT, firstAmbig, pos, litlen, nxtlen;
  FloatHi         pct;
  DeltaSeqPtr     dsp, next;
  Uint2           entityID, olditemtype = 0, numdsp = 0, numsgp = 0;
  Uint4           firstsgitemid = 0;
  Uint4           olditemid = 0;
  GrphItemPtr     gip;
  ValNodePtr      head, vnp;
  Boolean         outOfOrder = FALSE, fa2htgsBug = FALSE, overlaps = FALSE;
  Uint1           residue;
  SeqGraphPtr     sgp;
  SeqIntPtr       sintp;
  SeqLocPtr       slocp;
  SeqLitPtr       slp;
  StreamCache     sc;
  ValidStructPtr  vsp;
  /* never changed below, so the per-graph reporting branches do not run */
  Boolean         single_report_mode = TRUE;
  CharPtr         ctmp;

  vsp = (ValidStructPtr) gcp->userdata;
  bsp = (BioseqPtr) gcp->thisitem;
  if (vsp == NULL || bsp == NULL)
    return;
  if (!ISA_na (bsp->mol))
    return;

  /* set the reporting context on the ValidStruct */
  vsp->bsp = bsp;
  vsp->descr = NULL;
  vsp->sfp = NULL;
  vsp->bssp = (BioseqSetPtr) gcp->parentitem;

  if (SeqMgrGetParentOfPart (bsp, NULL) != NULL)
    return;

  entityID = ObjMgrGetEntityIDForPointer (bsp);
  head = GetSeqGraphsOnBioseq (entityID, bsp);
  if (head == NULL)
    return;

  /* save the gather context so it can be restored before returning */
  olditemid = gcp->itemID;
  olditemtype = gcp->thistype;
  gcp->thistype = OBJ_SEQGRAPH;

  /* stage 1: per-graph checks over the sorted list */
  for (vnp = head, index = 1; vnp != NULL; vnp = vnp->next, index++) {
    gip = (GrphItemPtr) vnp->data.ptrvalue;
    if (gip == NULL)
      continue;

    sgp = gip->sgp;
    if (sgp == NULL)
      continue;
    gcp->itemID = sgp->idx.itemID;
    if (firstsgitemid == 0) {
      firstsgitemid = sgp->idx.itemID;
    }

    /* gip->index is the pre-sort position; a difference means sorting moved it */
    if (gip->index != index) {
      outOfOrder = TRUE;
      if (gip->index == 129 && index == 2) {
        fa2htgsBug = TRUE;
      }
    }
    /* starts at or before the end of the previous graph */
    if (gip->left <= lastloc) {
      overlaps = TRUE;
    }
    lastloc = gip->right;
    min = MIN ((Int4) min, (Int4) sgp->min.intvalue);
    max = MAX ((Int4) max, (Int4) sgp->max.intvalue);

    if (sgp->min.intvalue < 0 || sgp->min.intvalue > 100) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphMin, "Graph min (%ld) out of range", (long) sgp->min.intvalue);
    }

    if (sgp->max.intvalue > 100) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphMax, "Graph max (%ld) out of range", (long) sgp->max.intvalue);
    }
    if (sgp->max.intvalue <= 0) {
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphMax, "Graph max (%ld) out of range", (long) sgp->max.intvalue);
    }

    /* total number of values across all graphs, compared with seqlen in stage 2 */
    gphlen += sgp->numval;
    bs = (ByteStorePtr) sgp->values;
    if (bs != NULL) {
      bslen = BSLen (bs);
      if (sgp->numval != bslen) {
        ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphByteLen, "SeqGraph (%ld) and ByteStore (%ld) length mismatch", (long) sgp->numval, (long) bslen);
      }
    }
  }
  /* list-wide problems are reported against the first graph */
  if (outOfOrder) {
    gcp->itemID = firstsgitemid;
    if (fa2htgsBug) {
        //LCOV_EXCL_START
        //fa2htgs bug no longer seen
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphOutOfOrder, "Graph components are out of order - probably caused by old fa2htgs bug");
      //LCOV_EXCL_STOP
    } else {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphOutOfOrder, "Graph components are out of order - may be a software bug");
    }
  }
  if (overlaps) {
    gcp->itemID = firstsgitemid;
    ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphOverlap, "Graph components overlap, with multiple scores for a single base");
  }

  /* stage 2: sequence length to compare against; for delta sequences only
     non-NULL SeqLocs and SeqLits that carry seq_data are counted */
  if (bsp->repr == Seq_repr_raw) {
    seqlen = bsp->length;
  } else if (bsp->repr == Seq_repr_delta) {
    for (dsp = (DeltaSeqPtr) (bsp->seq_ext); dsp != NULL; dsp = dsp->next) {
      switch (dsp->choice) {
      case 1:
        slocp = (SeqLocPtr) dsp->data.ptrvalue;
        if (slocp == NULL)
          break;
        if (slocp->choice != SEQLOC_NULL) {
          seqlen += SeqLocLen (slocp);
        }
        break;
      case 2:
        slp = (SeqLitPtr) dsp->data.ptrvalue;
        if (slp == NULL || slp->seq_data == NULL)
          break;
        seqlen += slp->length;
        break;
      default:
        break;
      }
    }
  }

  /* a mismatch is reported only when gphlen equals neither length */
  if (seqlen != gphlen && bsp->length != gphlen) {
    gcp->itemID = firstsgitemid;
    ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphBioseqLen, "SeqGraph (%ld) and Bioseq (%ld) length mismatch", (long) gphlen, (long) seqlen);
  }

  /* stage 3: only for delta sequences that have more than one graph */
  if (bsp->repr == Seq_repr_delta) {
    if (head != NULL && head->next != NULL) {
      /* walk delta components and graphs side by side; vnp advances only when
         a component that is expected to have a graph has been handled */
      for (dsp = (DeltaSeqPtr) (bsp->seq_ext), vnp = head; dsp != NULL && vnp != NULL; dsp = next) {
        next = dsp->next;
        gip = (GrphItemPtr) vnp->data.ptrvalue;
        if (gip == NULL)
          continue;
        sgp = gip->sgp;
        if (sgp == NULL)
          continue;
        switch (dsp->choice) {
        case 1:
          slocp = (SeqLocPtr) dsp->data.ptrvalue;
          if (slocp != NULL && slocp->choice != SEQLOC_NULL) {
            slplen = SeqLocLen (slocp);
            curroffset += slplen;
            if (sgp->numval != slplen) {
              gcp->itemID = sgp->idx.itemID;
              ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphSeqLocLen, "SeqGraph (%ld) and SeqLoc (%ld) length mismatch", (long) sgp->numval, (long) slplen);
            }
            numdsp++;
            if (vnp != NULL) {
              vnp = vnp->next;
              numsgp++;
            }
          }
          break;
        case 2:
          slp = (SeqLitPtr) dsp->data.ptrvalue;
          litlen = 0;
          if (slp != NULL) {
            litlen = slp->length;
          }
          if (slp != NULL && slp->seq_data != NULL) {
            /* fold directly following literals that carry data into this one,
               and skip over them in the outer loop */
            while (NextLitLength (next, &nxtlen)) {
              litlen += nxtlen;
              next = next->next;
            }
            if (sgp->numval != litlen) {
              gcp->itemID = sgp->idx.itemID;
              ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphSeqLitLen, "SeqGraph (%ld) and SeqLit (%ld) length mismatch",
                        (long) sgp->numval, (long) litlen);
            }
            /* the graph interval should cover exactly the merged literal */
            slocp = sgp->loc;
            if (slocp != NULL && slocp->choice == SEQLOC_INT) {
              sintp = (SeqIntPtr) slocp->data.ptrvalue;
              if (sintp != NULL) {
                if (sintp->from != curroffset) {
                  gcp->itemID = sgp->idx.itemID;
                  ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphStartPhase, "SeqGraph (%ld) and SeqLit (%ld) start do not coincide",
                            (long) sintp->from, (long) curroffset);
                }
                if (sintp->to != litlen + curroffset - 1) {
                  gcp->itemID = sgp->idx.itemID;
                  ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphStopPhase, "SeqGraph (%ld) and SeqLit (%ld) stop do not coincide",
                            (long) sintp->to, (long) (litlen + curroffset - 1));
                }
              }
            }
            numdsp++;
            if (vnp != NULL) {
              vnp = vnp->next;
              numsgp++;
            }
          }
          /* literals without data still advance the running offset */
          if (slp != NULL) {
            curroffset += litlen;
          }
          break;
        default:
          break;
        }
      }
      /* recount from scratch (numdsp is reset here): delta components that
         should have a graph, with runs of data literals counted once */
      for (dsp = (DeltaSeqPtr) (bsp->seq_ext), numdsp = 0; dsp != NULL; dsp = next) {
        next = dsp->next;
        switch (dsp->choice) {
        case 1:
          slocp = (SeqLocPtr) dsp->data.ptrvalue;
          if (slocp != NULL && slocp->choice != SEQLOC_NULL) {
            numdsp++;
          }
          break;
        case 2:
          slp = (SeqLitPtr) dsp->data.ptrvalue;
          if (slp != NULL && slp->seq_data != NULL) {
            while (NextLitLength (next, &nxtlen)) {
              next = next->next;
            }
            numdsp++;
          }
          break;
        default:
          break;
        }
      }
      /* recount the graphs as well (numsgp is reset here) */
      for (vnp = head, numsgp = 0; vnp != NULL; vnp = vnp->next, numsgp++)
        continue;
      if (numdsp != numsgp) {
        gcp->itemID = firstsgitemid;
        ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphDiffNumber, "Different number of SeqGraph (%d) and SeqLit (%d) components", (int) numsgp, (int) numdsp);
      }
    }
  }

  /* stage 4: tallies accumulated across all graphs */
  numBases = 0;
  NsWithScore = 0;
  GapsWithScore = 0;
  ACGTsWithoutScore = 0;
  ambigWithoutScore = 0;
  valsBelowMin = 0;
  valsAboveMax = 0;
  firstN = -1;
  firstACGT = -1;
  firstAmbig = -1;

  for (vnp = head; vnp != NULL; vnp = vnp->next) {
    gip = (GrphItemPtr) vnp->data.ptrvalue;
    if (gip == NULL)
      continue;
    sgp = gip->sgp;
    if (sgp == NULL)
      continue;

    /* reject locations that are missing, outside the Bioseq, not an interval
       or whole location, or on the minus strand; such graphs are not scanned */
    if (sgp->loc == NULL ||
        SeqLocStart (sgp->loc) < 0 ||
        SeqLocStop (sgp->loc) >= bsp->length ||
        (sgp->loc->choice != SEQLOC_INT && sgp->loc->choice != SEQLOC_WHOLE) ||
        SeqLocStrand (sgp->loc) == Seq_strand_minus) {
      gcp->itemID = sgp->idx.itemID;
      ctmp = SeqLocPrint (sgp->loc);
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphLocInvalid, "SeqGraph location (%s) is invalid", ctmp == NULL ? "Unknown" : ctmp);
      ctmp = MemFree (ctmp);
      continue;
    }

    if (! StreamCacheSetup (NULL, sgp->loc, EXPAND_GAPS_TO_DASHES, &sc)) continue;
    slplen = SeqLocLen (sgp->loc);

    bs = (ByteStorePtr) sgp->values;
    BSSeek (bs, 0, SEEK_SET);
    j = 0;
    val = 0;

    /* prime the score buffer; k indexes the next unread byte in scores */
    scount = (Int2) BSRead (bs, scores, sizeof (scores));
    k = 0;

    if (! single_report_mode) {
      numBases = 0;
      NsWithScore = 0;
      GapsWithScore = 0;
      ACGTsWithoutScore = 0;
      ambigWithoutScore = 0;
      valsBelowMin = 0;
      valsAboveMax = 0;
      firstN = -1;
      firstACGT = -1;
      firstAmbig = -1;
    }

    pos = gip->left;

    /* pair each residue with one score byte, stopping at numval scores */
    while ((residue = StreamCacheGetResidue (&sc)) != '\0' && j < sgp->numval) {
      if (IS_residue (residue)) {
        numBases++;
        /* val = (Int2) BSGetByte (bs); */
        /* buffer exhausted: refill, unless the previous read returned nothing */
        if (k >= scount) {
          if (scount > 0) {
            scount = (Int2) BSRead (bs, scores, sizeof (scores));
          }
          k = 0;
        }
        /* once no more bytes are available the score is taken as 0 */
        if (scount > 0) {
          val = (Int2) scores [k];
          k++;
        } else {
          val = 0;
        }
        if (val < sgp->min.intvalue || val < 0) {
          valsBelowMin++;
        }
        if (val > sgp->max.intvalue || val > 100) {
          valsAboveMax++;
        }
        j++;
        switch (residue) {
        case '-': /* 0 */
          if (val > 0) {
            GapsWithScore++;
          }
          break;
        case 'A': /* 1, 2, 4, 8 */
        case 'C':
        case 'G':
        case 'T':
          if (val == 0) {
            ACGTsWithoutScore++;
            if (firstACGT == -1) {
              firstACGT = pos;
            }
          }
          break;
        case 'N': /* 15 */
          if (val > 0) {
            NsWithScore++;
            if (firstN == -1) {
              firstN = pos;
            }
          }
          break;
        default:
          /* any other residue; tallied here but not reported below */
          if (val == 0) {
            ambigWithoutScore++;
            if (firstAmbig == -1) {
              firstAmbig = pos;
            }
          }
          break;
        }
      }
      pos++;
    }

    /* per-graph reporting; skipped because single_report_mode is TRUE */
    if (! single_report_mode) {
      gcp->itemID = sgp->idx.itemID;
      if (ACGTsWithoutScore > 0) {
        if (ACGTsWithoutScore * 10 >= numBases) {
          pct = (FloatHi) (ACGTsWithoutScore) * 100.0 / (FloatHi) numBases;
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphACGTScoreMany, "%ld ACGT bases (%3.2f%s) have zero score value - first one at position %ld",
                    (long) ACGTsWithoutScore, (double) pct, "%", (long) (firstACGT + 1));
        } else {
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphACGTScore, "%ld ACGT bases have zero score value - first one at position %ld",
                    (long) ACGTsWithoutScore, (long) (firstACGT + 1));
        }
      }
      if (NsWithScore > 0) {
        if (NsWithScore * 10 >= numBases) {
          pct = (FloatHi) (NsWithScore) * 100.0 / (FloatHi) numBases;
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphNScoreMany, "%ld N bases (%3.2f%s) have positive score value - first one at position %ld",
                    (long) NsWithScore, (double) pct, "%", (long) (firstN + 1));
        } else {
          ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphNScore, "%ld N bases have positive score value - first one at position %ld",
                    (long) NsWithScore, (long) (firstN + 1));
        }
      }
      if (GapsWithScore > 0) {
        ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphGapScore, "%ld gap bases have positive score value", (long) GapsWithScore);
      }
      if (valsBelowMin > 0) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphBelow, "%ld quality scores have values below the reported minimum or 0", (long) valsBelowMin);
      }
      if (valsAboveMax > 0) {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphAbove, "%ld quality scores have values above the reported maximum or 100", (long) valsAboveMax);
      }
    }
  }

  /* restore the gather context before issuing the summary messages */
  gcp->itemID = olditemid;
  gcp->thistype = olditemtype;

  /* one summary per Bioseq; the "Many" variants are used when the count is
     at least a tenth of all bases scanned */
  if (single_report_mode) {
    if (ACGTsWithoutScore > 0) {
      if (ACGTsWithoutScore * 10 >= numBases) {
        pct = (FloatHi) (ACGTsWithoutScore) * 100.0 / (FloatHi) numBases;
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphACGTScoreMany, "%ld ACGT bases (%3.2f%s) have zero score value - first one at position %ld",
                  (long) ACGTsWithoutScore, (double) pct, "%", (long) (firstACGT + 1));
      } else {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphACGTScore, "%ld ACGT bases have zero score value - first one at position %ld",
                  (long) ACGTsWithoutScore, (long) (firstACGT + 1));
      }
    }
    if (NsWithScore > 0) {
      if (NsWithScore * 10 >= numBases) {
        pct = (FloatHi) (NsWithScore) * 100.0 / (FloatHi) numBases;
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphNScoreMany, "%ld N bases (%3.2f%s) have positive score value - first one at position %ld",
                  (long) NsWithScore, (double) pct, "%", (long) (firstN + 1));
      } else {
        ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphNScore, "%ld N bases have positive score value - first one at position %ld",
                  (long) NsWithScore, (long) (firstN + 1));
      }
    }
    if (GapsWithScore > 0) {
      ValidErr (vsp, SEV_ERROR, ERR_SEQ_GRAPH_GraphGapScore, "%ld gap bases have positive score value", (long) GapsWithScore);
    }
    if (valsBelowMin > 0) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphBelow, "%ld quality scores have values below the reported minimum or 0", (long) valsBelowMin);
    }
    if (valsAboveMax > 0) {
      ValidErr (vsp, SEV_WARNING, ERR_SEQ_GRAPH_GraphAbove, "%ld quality scores have values above the reported maximum or 100", (long) valsAboveMax);
    }
  }

  /* frees the list nodes and the GrphItem each one points to */
  ValNodeFreeData (head);
}
32636 
32637 //LCOV_EXCL_START
32638 // patch_seq is never set, function is never called
32639 /*****************************************************************************
32640 *
32641 *   PatchBadSequence(bsp)
32642 *
32643 *****************************************************************************/
PatchBadSequence(BioseqPtr bsp)32644 NLM_EXTERN Boolean PatchBadSequence (BioseqPtr bsp)
32645 {
32646   ByteStorePtr    newseq;
32647   SeqPortPtr      spp;
32648   Boolean         is_na;
32649   Uint1           seqcode;
32650   Int2            repchar, residue;
32651   Int4            i, len;
32652 
32653   if (bsp == NULL)
32654     return FALSE;
32655   if (!((bsp->repr == Seq_repr_raw) || (bsp->repr == Seq_repr_const)))
32656     return FALSE;
32657 
32658   is_na = ISA_na (bsp->mol);
32659   if (is_na) {
32660     seqcode = Seq_code_iupacna;
32661     repchar = (Int2) 'N';       /* N */
32662   } else {
32663     seqcode = Seq_code_iupacaa;
32664     repchar = (Int2) 'X';
32665   }
32666 
32667   spp = SeqPortNew (bsp, 0, -1, 0, seqcode);
32668   if (spp == NULL)
32669     return FALSE;
32670 
32671   len = bsp->length;
32672   newseq = BSNew (len);
32673   if (newseq == NULL) {
32674     SeqPortFree (spp);
32675     return FALSE;
32676   }
32677 
32678   for (i = 0; i < len; i++) {
32679     residue = SeqPortGetResidue (spp);
32680     if (residue == INVALID_RESIDUE) {
32681       residue = repchar;
32682     }
32683     BSPutByte (newseq, residue);
32684   }
32685 
32686   SeqPortFree (spp);
32687   SeqDataFree (bsp->seq_data, bsp->seq_data_type);
32688   bsp->seq_data = (SeqDataPtr) newseq;
32689   bsp->seq_data_type = seqcode;
32690 
32691   BioseqRawPack (bsp);
32692 
32693   return TRUE;
32694 }
32695 
/*
 * SeqEntryExplore callback.  data points to a BioseqPtr slot; the slot is
 * filled with the first entry that is a Bioseq and left untouched after
 * that.  index and indent are not used.
 */
static void FindABioseq (SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
{
  BioseqPtr PNTR  slot;

  slot = (BioseqPtr PNTR) data;

  /* once a Bioseq has been recorded, later entries are ignored */
  if (*slot == NULL && IS_Bioseq (sep)) {
    *slot = (BioseqPtr) (sep->data.ptrvalue);
  }
}
32711 
32712 
FindIDForEntry(SeqEntryPtr sep,CharPtr buf)32713 NLM_EXTERN CharPtr FindIDForEntry (SeqEntryPtr sep, CharPtr buf)
32714 {
32715   BioseqPtr       bsp = NULL;
32716 
32717   if ((sep == NULL) || (buf == NULL))
32718     return NULL;
32719 
32720   *buf = '\0';
32721   SeqEntryExplore (sep, (Pointer) (&bsp), FindABioseq);
32722 
32723   if (bsp == NULL)
32724     return NULL;
32725 
32726   SeqIdPrint (bsp->id, buf, PRINTID_FASTA_LONG);
32727   return buf;
32728 }
32729 //LCOV_EXCL_STOP
32730 
/*
 * Trims str in place and returns it.
 * Leading: every character whose unsigned value is <= ' ' is removed, so
 * leading control characters go as well as spaces.
 * Trailing: only space characters (' ') are removed.
 * NULL and empty strings are returned unchanged.
 */
static CharPtr TrimSpacesOnEitherSide (CharPtr str)
{
  CharPtr  rd;
  CharPtr  wr;

  if (str == NULL || str[0] == '\0') return str;

  /* skip the leading run of blanks / control characters */
  rd = str;
  while (*rd != '\0' && (Uchar) *rd <= ' ') {
    rd++;
  }

  /* slide the remainder down to the start of the buffer */
  wr = str;
  while (*rd != '\0') {
    *wr = *rd;
    wr++;
    rd++;
  }
  *wr = '\0';

  /* back up over the trailing run of spaces and cut the string there */
  while (wr > str && *(wr - 1) == ' ') {
    wr--;
  }
  *wr = '\0';

  return str;
}
32770 
/*
 * Copies source into dest with every '.' removed, then trims the result
 * with TrimSpacesOnEitherSide.  dest is always terminated when it is
 * non-NULL and maxsize >= 1; a NULL source yields an empty string.
 *
 * At most maxsize - 2 characters are copied: one slot is reserved up front
 * and the loop stops while one more slot is still free.
 */
static void CopyLetters (CharPtr dest, CharPtr source, size_t maxsize)
{
  CharPtr  out;
  size_t   room;

  if (dest == NULL || maxsize < 1) return;
  *dest = '\0';
  if (source == NULL) return;

  out = dest;
  room = maxsize - 1;
  for (; room > 1 && *source != '\0'; source++) {
    if (*source == '.') continue;   /* periods are dropped */
    *out = *source;
    out++;
    room--;
  }
  *out = '\0';

  TrimSpacesOnEitherSide (dest);
}
32796 
/*
 * Warns (ERR_GENERIC_AuthorListHasEtAl) about "et al" entries in the author
 * list of a publication.  Only article, manuscript, book, proceedings,
 * generic and submission pubs are examined, and only author lists whose
 * choice is 1 with person ids whose choice is 2.
 *
 * An author counts as "et al" when, after periods are stripped and the
 * fields trimmed, the last name is "et al" (any case), or the last name is
 * "et", the initials are "al" and the first name is empty.  The message
 * differs depending on whether the entry is the final author.
 */
static void LookForEtAl (ValidStructPtr vsp, ValNodePtr tmp)
{
  AuthorPtr    author;
  AuthListPtr  authors = NULL;
  ValNodePtr   node;
  NameStdPtr   stdname;
  PersonIdPtr  person;
  Char         first[64];
  Char         initials[16];
  Char         last[64];
  Boolean      isEtAl;

  if (vsp == NULL || tmp == NULL) return;

  /* pick up the author list according to the kind of publication */
  switch (tmp->choice) {
  case PUB_Article:
    authors = ((CitArtPtr) (tmp->data.ptrvalue))->authors;
    break;
  case PUB_Man:
  case PUB_Book:
  case PUB_Proc:
    authors = ((CitBookPtr) (tmp->data.ptrvalue))->authors;
    break;
  case PUB_Gen:
    authors = ((CitGenPtr) (tmp->data.ptrvalue))->authors;
    break;
  case PUB_Sub:
    authors = ((CitSubPtr) (tmp->data.ptrvalue))->authors;
    break;
  default:
    break;
  }
  if (authors == NULL || authors->choice != 1) return;

  for (node = authors->names; node != NULL; node = node->next) {
    author = (AuthorPtr) node->data.ptrvalue;
    if (author == NULL) continue;

    person = author->name;
    if (person == NULL || person->choice != 2) continue;

    stdname = (NameStdPtr) person->data;
    if (stdname == NULL || stdname->names[0] == NULL) continue;

    /* names[0], names[1] and names[4] are used as last, first and initials */
    CopyLetters (last, stdname->names[0], sizeof (last));
    CopyLetters (first, stdname->names[1], sizeof (first));
    CopyLetters (initials, stdname->names[4], sizeof (initials));

    isEtAl = FALSE;
    if (StringICmp (last, "et al") == 0) {
      isEtAl = TRUE;
    } else if (StringCmp (initials, "al") == 0 && StringCmp (last, "et") == 0 && first[0] == '\0') {
      isEtAl = TRUE;
    }
    if (! isEtAl) continue;

    if (node->next == NULL) {
      ValidErr (vsp, SEV_WARNING, ERR_GENERIC_AuthorListHasEtAl, "Author list ends in et al.");
    } else {
      ValidErr (vsp, SEV_WARNING, ERR_GENERIC_AuthorListHasEtAl, "Author list contains et al.");
    }
  }
}
32860 
32861 //LCOV_EXCL_START
32862 // spellcheck function never supplied
/*
 * Passes the checkable text of one publication to SpellCheckString:
 * for article, manuscript, book and proceedings pubs every title of kind
 * Cit_title_name; for generic pubs the cit and title strings.
 * Other publication kinds are ignored.
 */
static void SpellCheckPub (ValidStructPtr vsp, ValNodePtr tmp)
{
  CitGenPtr   gen;
  ValNodePtr  title = NULL;

  if (vsp == NULL || tmp == NULL) return;

  switch (tmp->choice) {
  case PUB_Article:
    title = ((CitArtPtr) (tmp->data.ptrvalue))->title;
    break;
  case PUB_Man:
  case PUB_Book:
  case PUB_Proc:
    title = ((CitBookPtr) (tmp->data.ptrvalue))->title;
    break;
  case PUB_Gen:
    gen = (CitGenPtr) (tmp->data.ptrvalue);
    if (gen->cit != NULL) {
      SpellCheckString (vsp, gen->cit);
    }
    if (gen->title != NULL) {
      SpellCheckString (vsp, gen->title);
    }
    break;
  default:
    break;
  }

  /* title stays NULL unless one of the title-list cases above set it */
  while (title != NULL) {
    if (title->choice == Cit_title_name) {
      SpellCheckString (vsp, (CharPtr) (title->data.ptrvalue));
    }
    title = title->next;
  }
}
32904 
32905 // spellcheck function never supplied
/*
 * Gather callback for a sequence descriptor (gcp->thisitem), reporting
 * through the ValidStruct in gcp->userdata.
 *
 * Pub descriptors are always scanned with LookForEtAl.  The spell checks
 * that follow run only when vsp->spellfunc is set: title, region and
 * comment descriptors have their string checked; pub descriptors have each
 * pub and the pubdesc comment checked.
 */
static void SpellCheckSeqDescr (GatherContextPtr gcp)
{
  PubdescPtr      pubdesc;
  ValNodePtr      descr, pub;
  ValidStructPtr  vsp;

  vsp = (ValidStructPtr) (gcp->userdata);
  if (vsp == NULL) return;

  descr = (ValNodePtr) (gcp->thisitem);
  if (descr == NULL) return;

  /* set the reporting context */
  vsp->descr = descr;
  vsp->sfp = NULL;

  /* the et al check does not depend on a spell function being present */
  if (descr->choice == Seq_descr_pub) {
    pubdesc = (PubdescPtr) (descr->data.ptrvalue);
    for (pub = pubdesc->pub; pub != NULL; pub = pub->next) {
      LookForEtAl (vsp, pub);
    }
  }

  if (vsp->spellfunc == NULL) return;

  if (descr->choice == Seq_descr_title ||
      descr->choice == Seq_descr_region ||
      descr->choice == Seq_descr_comment) {
    SpellCheckString (vsp, (CharPtr) (descr->data.ptrvalue));
  } else if (descr->choice == Seq_descr_pub) {
    pubdesc = (PubdescPtr) (descr->data.ptrvalue);
    for (pub = pubdesc->pub; pub != NULL; pub = pub->next) {
      SpellCheckPub (vsp, pub);
    }
    SpellCheckString (vsp, pubdesc->comment);
  }
}
32951 
32952 // spellcheck function never supplied
SpellCheckSeqFeat(GatherContextPtr gcp)32953 NLM_EXTERN void SpellCheckSeqFeat (GatherContextPtr gcp)
32954 {
32955   PubdescPtr      pdp;
32956   SeqFeatPtr      sfp;
32957   ProtRefPtr      prp;
32958   ValidStructPtr  vsp;
32959   ValNodePtr      vnp;
32960 
32961   vsp = (ValidStructPtr) (gcp->userdata);
32962   if (vsp == NULL)
32963     return;
32964 
32965   sfp = (SeqFeatPtr) (gcp->thisitem);
32966   if (sfp == NULL)
32967     return;
32968 
32969   vsp->descr = NULL;
32970   vsp->sfp = sfp;
32971 
32972   if (sfp->data.choice == SEQFEAT_PUB) {
32973     pdp = (PubdescPtr) (sfp->data.value.ptrvalue);
32974     for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
32975       LookForEtAl (vsp, vnp);
32976     }
32977   }
32978 
32979   if (vsp->spellfunc == NULL)
32980     return;
32981 
32982   SpellCheckString (vsp, sfp->comment);
32983 
32984   switch (sfp->data.choice) {
32985   case 1:                      /* Gene-ref */
32986     break;
32987   case 2:                      /* Org-ref */
32988     break;
32989   case 3:                      /* Cdregion */
32990     break;
32991   case 4:                      /* Prot-ref */
32992     prp = (ProtRefPtr) (sfp->data.value.ptrvalue);
32993     for (vnp = prp->name; vnp != NULL; vnp = vnp->next)
32994       SpellCheckString (vsp, (CharPtr) (vnp->data.ptrvalue));
32995     SpellCheckString (vsp, prp->desc);
32996     break;
32997   case 5:                      /* RNA-ref */
32998     break;
32999   case 6:                      /* Pub */
33000     pdp = (PubdescPtr) (sfp->data.value.ptrvalue);
33001     for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
33002       SpellCheckPub (vsp, vnp);
33003     }
33004     SpellCheckString (vsp, pdp->comment);
33005     break;
33006   case 7:                      /* Seq */
33007     break;
33008   case 8:                      /* Imp-feat */
33009     break;
33010   case 9:                      /* Region */
33011     SpellCheckString (vsp, (CharPtr) (sfp->data.value.ptrvalue));
33012     break;
33013   case 10:                     /* Comment */
33014     break;
33015   case 11:                     /* Bond */
33016     break;
33017   case 12:                     /* Site */
33018     break;
33019   case 13:                     /* Rsite-ref */
33020     break;
33021   case 14:                     /* User-object */
33022     break;
33023   case 15:                     /* TxInit */
33024     break;
33025   case 16:                     /* Numbering */
33026     break;
33027   case 17:                     /* Secondary Structure */
33028     break;
33029   case 18:                     /* NonStdRes */
33030     break;
33031   case 19:                     /* Heterogen */
33032     break;
33033   case 20:                     /* BioSource */
33034     break;
33035   default:
33036     break;
33037   }
33038 
33039   return;
33040 }
33041 
33042 // spellcheck function never supplied
SpellCheckString(ValidStructPtr vsp,CharPtr str)33043 NLM_EXTERN void SpellCheckString (ValidStructPtr vsp, CharPtr str)
33044 {
33045   if ((vsp == NULL) || (str == NULL))
33046     return;
33047 
33048   if (vsp->spellfunc == NULL)
33049     return;
33050 
33051   (*(vsp->spellfunc)) ((char *) str, (vsp->spellcallback));
33052 
33053   return;
33054 }
33055 
33056 // spellcheck function never supplied
SpellCallBack(char * str)33057 NLM_EXTERN void SpellCallBack (char *str)
33058 {
33059   ErrSev          sev;
33060 
33061   sev = SEV_ERROR;
33062   if (globalvsp != NULL && globalvsp->justwarnonspell) {
33063     sev = SEV_WARNING;
33064   }
33065   ValidErr (globalvsp, sev, ERR_GENERIC_Spell, "[ %s ]", (CharPtr) str);
33066   return;
33067 }
33068 
33069 
33070 // This section of code is used for converting features with
33071 // certain types of validation errors to misc_features
33072 typedef struct intpair {
33073   Int4 errcode;
33074   Int4 subcode;
33075 } Int4PairData, PNTR Int4PairPtr;
33076 
33077 
33078 static Int4PairData s_ErrCodeList[] = {
33079   {ERR_SEQ_INST_StopInProtein},
33080   {ERR_SEQ_FEAT_InternalStop},
33081   {ERR_SEQ_FEAT_StartCodon},
33082   {ERR_SEQ_INST_BadProteinStart},
33083   {ERR_SEQ_FEAT_NoStop},
33084   {0,0}
33085 };
33086 
33087 
33088 
33089 typedef struct conversionlists {
33090   ValNodePtr remove_gene;
33091   ValNodePtr keep_gene;
33092 } ConversionListsData, PNTR ConversionListsPtr;
33093 
33094 
s_ErrorQualifiesForConversion(Int4 errcode,Int4 subcode)33095 static Boolean s_ErrorQualifiesForConversion(Int4 errcode, Int4 subcode)
33096 {
33097   Int4 i;
33098 
33099   for (i = 0; s_ErrCodeList[i].errcode != 0; i++) {
33100     if (errcode == s_ErrCodeList[i].errcode && subcode == s_ErrCodeList[i].subcode) {
33101       return TRUE;
33102     }
33103   }
33104   return FALSE;
33105 }
33106 
33107 
ValidCountProblemsCallback(ErrSev severity,int errcode,int subcode,Uint2 entityID,Uint2 itemtype,Uint4 itemID,CharPtr accession,CharPtr seqid,CharPtr featureID,CharPtr message,CharPtr objtype,CharPtr label,CharPtr context,CharPtr location,CharPtr product,Pointer userdata)33108 static void LIBCALLBACK ValidCountProblemsCallback(
33109   ErrSev severity,
33110   int errcode,
33111   int subcode,
33112   Uint2 entityID,
33113   Uint2 itemtype,
33114   Uint4 itemID,
33115   CharPtr accession,
33116   CharPtr seqid,
33117   CharPtr featureID,
33118   CharPtr message,
33119   CharPtr objtype,
33120   CharPtr label,
33121   CharPtr context,
33122   CharPtr location,
33123   CharPtr product,
33124   Pointer userdata
33125 )
33126 
33127 {
33128   SeqFeatPtr sfp;
33129   SeqMgrFeatContext fcontext;
33130   BioseqPtr bsp;
33131   ConversionListsPtr lists;
33132 
33133   if ((lists = (ConversionListsPtr) userdata) == NULL) {
33134     return;
33135   }
33136 
33137   if (itemtype != OBJ_SEQFEAT) {
33138     return;
33139   }
33140   /* limit the errors we pay attention to by severity, errcode and subcode */
33141   if (severity < SEV_NONE || severity > SEV_MAX) {
33142     severity = SEV_MAX;
33143   }
33144   if (!s_ErrorQualifiesForConversion(errcode, subcode)) {
33145     return;
33146   }
33147 
33148   sfp = SeqMgrGetDesiredFeature (entityID, NULL, itemID, 0, NULL, &fcontext);
33149   if (sfp != NULL && sfp->data.choice == SEQFEAT_PROT) {
33150     bsp = BioseqFindFromSeqLoc (sfp->location);
33151     sfp = SeqMgrGetCDSgivenProduct (bsp, NULL);
33152   }
33153   if (sfp != NULL && (sfp->data.choice == SEQFEAT_CDREGION || sfp->data.choice == SEQFEAT_RNA)) {
33154     ValNodeAddPointer (&(lists->keep_gene), OBJ_SEQFEAT, sfp);
33155   }
33156 
33157 }
33158 
33159 
/*
 * CountAllCDSAndRna
 *
 * Feature callback: increments the Int4 counter that data points to when
 * sfp is a coding region or an RNA feature.  Does nothing when sfp or data
 * is NULL, or when the feature is of any other type.
 */
static void CountAllCDSAndRna (SeqFeatPtr sfp, Pointer data)
{
  Int4Ptr counter = (Int4Ptr) data;

  if (sfp == NULL || counter == NULL) {
    return;
  }

  switch (sfp->data.choice) {
  case SEQFEAT_CDREGION:
  case SEQFEAT_RNA:
    (*counter)++;
    break;
  default:
    break;
  }
}
33170 
33171 
/*
 * ItemListFromAllSubcategories
 *
 * Walks a list whose nodes point to ClickableItem structures and builds one
 * flat list from them.  For every non-NULL item, the result of
 * ClickableItemObjectListCopy on its item_list is appended first, followed
 * by the result of this same function applied to its subcategories.
 *
 * Returns the combined list, or NULL when nothing was collected.
 */
static ValNodePtr ItemListFromAllSubcategories (ValNodePtr subcategories)
{
  ValNodePtr       collected = NULL;
  ValNodePtr       node = subcategories;
  ClickableItemPtr category;

  while (node != NULL) {
    category = (ClickableItemPtr) node->data.ptrvalue;
    if (category != NULL) {
      /* this category's own items come before those of its descendants */
      ValNodeLink (&collected, ClickableItemObjectListCopy (category->item_list));
      ValNodeLink (&collected, ItemListFromAllSubcategories (category->subcategories));
    }
    node = node->next;
  }
  return collected;
}
33187 
33188 
/*
 * ListFeaturesWithConfigProblems
 *
 * Runs CollectDiscrepancies with the given configuration on sep_list,
 * flattens the report with ItemListFromAllSubcategories, and keeps only the
 * nodes that are OBJ_SEQFEAT entries pointing to a coding region or an RNA
 * feature.
 *
 * Every other node has its choice set to 0 and is then taken out with
 * ValNodeExtractList (&list, 0) and released with ValNodeFree.
 *
 * Returns the filtered list (NULL when nothing remains).
 */
static ValNodePtr ListFeaturesWithConfigProblems(ValNodePtr sep_list, DiscrepancyConfigPtr config)
{
  ValNodePtr report, features, node, rejected;
  SeqFeatPtr feat;
  Boolean    keep;

  report = CollectDiscrepancies (config, sep_list, NULL);
  features = ItemListFromAllSubcategories (report);
  report = FreeClickableList (report);

  for (node = features; node != NULL; node = node->next) {
    keep = FALSE;
    if (node->choice == OBJ_SEQFEAT) {
      feat = (SeqFeatPtr) node->data.ptrvalue;
      if (feat != NULL
          && (feat->data.choice == SEQFEAT_CDREGION || feat->data.choice == SEQFEAT_RNA)) {
        keep = TRUE;
      }
    }
    if (!keep) {
      node->choice = 0;   /* mark for extraction below */
    }
  }

  rejected = ValNodeExtractList (&features, 0);
  rejected = ValNodeFree (rejected);
  return features;
}
33210 
33211 
/*
 * FilterOutFeatures
 *
 * Drops from *list every OBJ_SEQFEAT node whose feature has data.choice
 * equal to datachoice.  Matching nodes get their choice set to 0; the list
 * is then passed to ValNodeExtractList (list, 0) and whatever that returns
 * is released with ValNodeFree.
 *
 * Does nothing when list or *list is NULL.
 */
static void FilterOutFeatures (ValNodePtr PNTR list, Uint1 datachoice)
{
  SeqFeatPtr feat;
  ValNodePtr node, dropped;

  if (list == NULL || *list == NULL) {
    return;
  }

  node = *list;
  while (node != NULL) {
    if (node->choice == OBJ_SEQFEAT) {
      feat = (SeqFeatPtr) node->data.ptrvalue;
      if (feat != NULL && feat->data.choice == datachoice) {
        node->choice = 0;   /* mark for extraction below */
      }
    }
    node = node->next;
  }

  dropped = ValNodeExtractList (list, 0);
  dropped = ValNodeFree (dropped);
}
33228 
33229 
/*
 * GetCodingRegionsAndRNAsWithDiscrepancies
 *
 * Runs three discrepancy passes on sep, one test enabled at a time, and
 * appends the coding regions / RNAs each pass reports to lists:
 *
 *   1. DISC_BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS  -> lists->keep_gene
 *   2. DISC_SHORT_RRNA                                -> lists->remove_gene
 *   3. DISC_RNA_CDS_OVERLAP, with RNA features
 *      filtered out of the result                     -> lists->remove_gene
 *
 * lists is dereferenced without a NULL check.
 */
static void GetCodingRegionsAndRNAsWithDiscrepancies (SeqEntryPtr sep, ConversionListsPtr lists)
{
  DiscrepancyConfigData config;
  ValNodePtr            entries = NULL;
  ValNodePtr            hits;

  /* one-element list holding the entry to be examined */
  ValNodeAddPointer (&entries, 0, sep);

  MemSet (&config, 0, sizeof (DiscrepancyConfigData));

  /* pass 1 */
  config.conf_list[DISC_BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS] = TRUE;
  hits = ListFeaturesWithConfigProblems (entries, &config);
  ValNodeLink (&(lists->keep_gene), hits);

  /* pass 2 */
  config.conf_list[DISC_BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS] = FALSE;
  config.conf_list[DISC_SHORT_RRNA] = TRUE;
  hits = ListFeaturesWithConfigProblems (entries, &config);
  ValNodeLink (&(lists->remove_gene), hits);

  /* pass 3 */
  config.conf_list[DISC_BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS] = FALSE;
  config.conf_list[DISC_SHORT_RRNA] = FALSE;
  config.conf_list[DISC_RNA_CDS_OVERLAP] = TRUE;
  hits = ListFeaturesWithConfigProblems (entries, &config);
  FilterOutFeatures (&hits, SEQFEAT_RNA);
  ValNodeLink (&(lists->remove_gene), hits);

  entries = ValNodeFree (entries);
}
33256 
33257 
ConvertFailedCodingRegionsAndRNAsToMiscFeatures(SeqEntryPtr sep,LogInfoPtr lip)33258 NLM_EXTERN void ConvertFailedCodingRegionsAndRNAsToMiscFeatures(SeqEntryPtr sep, LogInfoPtr lip)
33259 {
33260   ValidStructPtr vsp;
33261   ConversionListsData lists;
33262   ValNodePtr all_list;
33263   Int4 num_total = 0, num_bad = 0;
33264 
33265   MemSet (&lists, 0, sizeof (ConversionListsData));
33266   vsp = ValidStructNew ();
33267   vsp->errfunc = ValidCountProblemsCallback;
33268   vsp->userdata = &lists;
33269 
33270   ValidateSeqEntry (sep, vsp);
33271   GetCodingRegionsAndRNAsWithDiscrepancies(sep, &lists);
33272 
33273   if (lists.keep_gene == NULL && lists.remove_gene == NULL) {
33274     /* nothing to do here */
33275     return;
33276   }
33277 
33278   /* consolidate lists */
33279   lists.keep_gene = ValNodeSort (lists.keep_gene, SortVnpByChoiceAndPtrvalue);
33280   ValNodeUnique(&(lists.keep_gene), SortVnpByChoiceAndPtrvalue, ValNodeFree);
33281   lists.remove_gene = ValNodeSort (lists.remove_gene, SortVnpByChoiceAndPtrvalue);
33282   ValNodeUnique(&(lists.remove_gene), SortVnpByChoiceAndPtrvalue, ValNodeFree);
33283 
33284   /* check to see if there are too many bad features in total */
33285   all_list = ValNodeCopyPtr(lists.keep_gene);
33286   ValNodeLink (&all_list, ValNodeCopyPtr(lists.remove_gene));
33287   all_list = ValNodeSort (all_list, SortVnpByChoiceAndPtrvalue);
33288   ValNodeUnique(&(all_list), SortVnpByChoiceAndPtrvalue, ValNodeFree);
33289   num_bad = ValNodeLen (all_list);
33290   all_list = ValNodeFree (all_list);
33291 
33292   /* count total number of features, compare with number bad, do nothing if more
33293    * than 50% are bad
33294    */
33295   VisitFeaturesInSep (sep, &num_total, CountAllCDSAndRna);
33296   if (num_total < 2 * num_bad) {
33297     Message (MSG_ERROR, "More than 50%% of coding regions and RNA features are bad");
33298     lists.keep_gene = ValNodeFree (lists.keep_gene);
33299     lists.remove_gene = ValNodeFree (lists.remove_gene);
33300     return;
33301   }
33302 
33303   /* convert bad features to misc */
33304   ConvertListToMiscFeat (lists.keep_gene, FALSE, lip);
33305   ConvertListToMiscFeat (lists.remove_gene, TRUE, lip);
33306 
33307   lists.keep_gene = ValNodeFree(lists.keep_gene);
33308   lists.remove_gene = ValNodeFree (lists.remove_gene);
33309 }
33310 //LCOV_EXCL_STOP
33311